1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2019 Joyent, Inc. 25 */ 26 27 /* 28 * Copyright (c) 2016 by Delphix. All rights reserved. 29 */ 30 31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 32 /* All Rights Reserved */ 33 34 /* 35 * University Copyright- Copyright (c) 1982, 1986, 1988 36 * The Regents of the University of California 37 * All Rights Reserved 38 * 39 * University Acknowledgment- Portions of this document are derived from 40 * software developed by the University of California, Berkeley, and its 41 * contributors. 
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

/* Map a (device, block number) pair to a hash-bucket index. */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
/* Sentinel terminating the singly-linked b_list chains (NULL means "not on a list"). */
#define	EMPTY_LIST	((struct buf *)-1)

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

/*
 * kstat data
 */
kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
					sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 *	Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);


/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 * (The *_PERCENT macros are expressed as divisors: 100/2 == "2%".)
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */


/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int
    bufhwm_pct;			/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/bdrwrite
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */

/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * The returned buffer is locked (b_sem held by the caller's thread).
 * 'arg' is a ufsvfs pointer for UFS callers, or NULL for raw block devices.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);	/* cache hit: contents already valid */
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	/* Route the I/O through the logging/snapshot layer when present. */
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 *
 * The read-ahead buffer (rablkno) is released with B_ASYNC set, so it
 * completes into the cache rather than being returned to the caller.
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	/*
	 * Only bother with read-ahead when some free buffers appear to
	 * be available (bfreelist.b_bcount is just an estimate).
	 */
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	/* Raced: the primary block entered the cache after bio_incore(). */
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 *
 * Caller must hold b_sem; 'arg' is a ufsvfs pointer or NULL (!ufs).
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	/* Snapshot flags before clearing so B_ASYNC is judged as passed in. */
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	/* Same strategy dispatch as bread_common(). */
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}

/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.
 * Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;


	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	/*
	 * B_AGE buffers are inserted at the head of the av list so they
	 * are recycled first; others go at the tail (LRU order).
	 */
	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * Returns the buffer locked (b_sem held).  With errflg set and the
 * system panicking, may return a throw-away error buffer instead of
 * blocking (see errout below).
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone.
		 * Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			/* Re-validate identity now that we own b_sem. */
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_file = NULL;
		nbp->b_offset = -1;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */

errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	/* B_NOCACHE makes brelse() destroy this buffer instead of caching it. */
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}

/*
 * Return a buffer w/o sleeping
 * (NULL if the buffer is absent, busy, or not yet B_DONE).
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	(void) biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}


/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 * (the identity may have changed while we slept on b_sem).
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not be already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 *
 * Returns 0 on success, EIO if a delayed-write buffer could not be
 * invalidated (only possible when 'force' is clear).
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by
 * freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	/* Convert the user's percentage into a divisor; reject > 20%. */
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	/* A list header points at itself when empty. */
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Account this thread as waiting for I/O on the current CPU. */
	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		/* Synchronous I/O: tear down any kernel mapping. */
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * A private completion routine, if registered, takes full
	 * responsibility for the buffer from here on.
	 */
	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		/* Wake the thread blocked in biowait(). */
		sema_v(&bp->b_io);
	}
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;	/* B_ERROR set but no specific code */
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 * Returns the buffer with b_sem held (locked); the caller fills in
 * dev/blkno before issuing the I/O.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		/* Charge the page-in to the current zone as well. */
		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			/* Synchronous read: count a major fault. */
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
		}
		/*
		 * Update statistics for pages being paged in:
		 * classify as anonymous, executable, or plain file data.
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	/* Hold the vnode for the life of the buffer (see pageio_done). */
	VN_HOLD(vp);
	bp->b_vp = vp;

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}

/*
 * Tear down a buffer created by pageio_setup(): unmap, drop the
 * vnode hold, and free the header.  The buffer must be locked.
 */
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	/* Drop the reference taken by VN_HOLD in pageio_setup(). */
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf *bp;
	struct buf *dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	/* Fold each byte of x, then each byte of y, into the hash. */
	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}


/*
 * Return a new buffer struct.
 * Create a new buffer if we haven't gone over our high water
 * mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;

	/*
	 * bfreelist.b_bufsize (protected by bfree_lock) represents the
	 * amount of memory we are allowed to allocate in the cache
	 * before we hit our high water mark.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {

				ASSERT(bp != NULL);

				/* Skip buffers someone else holds locked. */
				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					/* Exact size match: steal its memory */
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		/* No reusable buffer found; block until memory appears. */
		biostats.bio_bufwait.value.ui32++;
		bp = save;	/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		/* Fast path: pop a header off the free list. */
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			/* Splice the new pool onto the free list, take one. */
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

/*
 * Return a fully cleaned-out header to the front of the free list.
 */
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	/* Over the high water mark: reclaim memory from the cache. */
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	/*
	 * The caller locked each buffer (sema_tryp) before linking it
	 * onto the list via b_list; the async BWRITE path is expected
	 * to release it.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int found = 0;
	kmutex_t *hmp;
	int start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		/* Pass 1: reclaim clean buffers on this bucket's freelist. */
		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {

			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				/* Credit the reclaimed bytes back. */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */

			if ((bp->b_flags & B_AGE) == 0 && found) {
				/* Flush what we gathered, then bail out. */
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
1863 */ 1864 static int 1865 bio_incore(dev_t dev, daddr_t blkno) 1866 { 1867 struct buf *bp; 1868 struct buf *dp; 1869 uint_t index; 1870 kmutex_t *hmp; 1871 1872 index = bio_bhash(dev, blkno); 1873 dp = (struct buf *)&hbuf[index]; 1874 hmp = &hbuf[index].b_lock; 1875 1876 mutex_enter(hmp); 1877 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 1878 if (bp->b_blkno == blkno && bp->b_edev == dev && 1879 (bp->b_flags & B_STALE) == 0) { 1880 mutex_exit(hmp); 1881 return (1); 1882 } 1883 } 1884 mutex_exit(hmp); 1885 return (0); 1886 } 1887 1888 static void 1889 bio_pageio_done(struct buf *bp) 1890 { 1891 if (bp->b_flags & B_PAGEIO) { 1892 1893 if (bp->b_flags & B_REMAPPED) 1894 bp_mapout(bp); 1895 1896 if (bp->b_flags & B_READ) 1897 pvn_read_done(bp->b_pages, bp->b_flags); 1898 else 1899 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags); 1900 pageio_done(bp); 1901 } else { 1902 ASSERT(bp->b_flags & B_REMAPPED); 1903 bp_mapout(bp); 1904 brelse(bp); 1905 } 1906 } 1907 1908 /* 1909 * bioerror(9F) - indicate error in buffer header 1910 * If 'error' is zero, remove the error indication. 
1911 */ 1912 void 1913 bioerror(struct buf *bp, int error) 1914 { 1915 ASSERT(bp != NULL); 1916 ASSERT(error >= 0); 1917 ASSERT(SEMA_HELD(&bp->b_sem)); 1918 1919 if (error != 0) { 1920 bp->b_flags |= B_ERROR; 1921 } else { 1922 bp->b_flags &= ~B_ERROR; 1923 } 1924 bp->b_error = error; 1925 } 1926 1927 /* 1928 * bioreset(9F) - reuse a private buffer header after I/O is complete 1929 */ 1930 void 1931 bioreset(struct buf *bp) 1932 { 1933 ASSERT(bp != NULL); 1934 1935 biofini(bp); 1936 bioinit(bp); 1937 } 1938 1939 /* 1940 * biosize(9F) - return size of a buffer header 1941 */ 1942 size_t 1943 biosize(void) 1944 { 1945 return (sizeof (struct buf)); 1946 } 1947 1948 /* 1949 * biomodified(9F) - check if buffer is modified 1950 */ 1951 int 1952 biomodified(struct buf *bp) 1953 { 1954 int npf; 1955 int ppattr; 1956 struct page *pp; 1957 1958 ASSERT(bp != NULL); 1959 1960 if ((bp->b_flags & B_PAGEIO) == 0) { 1961 return (-1); 1962 } 1963 pp = bp->b_pages; 1964 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET)); 1965 1966 while (npf > 0) { 1967 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO | 1968 HAT_SYNC_STOPON_MOD); 1969 if (ppattr & P_MOD) 1970 return (1); 1971 pp = pp->p_next; 1972 npf--; 1973 } 1974 1975 return (0); 1976 } 1977 1978 /* 1979 * bioinit(9F) - initialize a buffer structure 1980 */ 1981 void 1982 bioinit(struct buf *bp) 1983 { 1984 bzero(bp, sizeof (struct buf)); 1985 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL); 1986 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL); 1987 bp->b_offset = -1; 1988 } 1989 1990 /* 1991 * biofini(9F) - uninitialize a buffer structure 1992 */ 1993 void 1994 biofini(struct buf *bp) 1995 { 1996 sema_destroy(&bp->b_io); 1997 sema_destroy(&bp->b_sem); 1998 } 1999 2000 /* 2001 * bioclone(9F) - clone a buffer 2002 */ 2003 struct buf * 2004 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno, 2005 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep) 2006 { 2007 struct buf *bufp; 2008 
2009 ASSERT(bp); 2010 if (bp_mem == NULL) { 2011 bufp = kmem_alloc(sizeof (struct buf), sleep); 2012 if (bufp == NULL) { 2013 return (NULL); 2014 } 2015 bioinit(bufp); 2016 } else { 2017 bufp = bp_mem; 2018 bioreset(bufp); 2019 } 2020 2021 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\ 2022 B_ABRWRITE) 2023 2024 /* 2025 * The cloned buffer does not inherit the B_REMAPPED flag. 2026 */ 2027 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY; 2028 bufp->b_bcount = len; 2029 bufp->b_blkno = blkno; 2030 bufp->b_iodone = iodone; 2031 bufp->b_proc = bp->b_proc; 2032 bufp->b_edev = dev; 2033 bufp->b_file = bp->b_file; 2034 bufp->b_offset = bp->b_offset; 2035 2036 if (bp->b_flags & B_SHADOW) { 2037 ASSERT(bp->b_shadow); 2038 ASSERT(bp->b_flags & B_PHYS); 2039 2040 bufp->b_shadow = bp->b_shadow + 2041 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off); 2042 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2043 if (bp->b_flags & B_REMAPPED) 2044 bufp->b_proc = NULL; 2045 } else { 2046 if (bp->b_flags & B_PAGEIO) { 2047 struct page *pp; 2048 off_t o; 2049 int i; 2050 2051 pp = bp->b_pages; 2052 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off; 2053 for (i = btop(o); i > 0; i--) { 2054 pp = pp->p_next; 2055 } 2056 bufp->b_pages = pp; 2057 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET); 2058 } else { 2059 bufp->b_un.b_addr = 2060 (caddr_t)((uintptr_t)bp->b_un.b_addr + off); 2061 if (bp->b_flags & B_REMAPPED) 2062 bufp->b_proc = NULL; 2063 } 2064 } 2065 return (bufp); 2066 } 2067