/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/ddi.h>

#include <sys/nsc_thread.h>
#include <sys/nsctl/nsctl.h>

#include <sys/sdt.h>		/* dtrace is S10 or later */

#include <vm/seg_kmem.h>
#include "sd_bcache.h"
#include "sd_trace.h"
#include "sd_io.h"
#include "sd_iob.h"
#include "sd_misc.h"
#if defined(_SD_DEBUG)		/* simulate disk errors */
#include "sd_tdaemon.h"
#endif

#ifndef DS_DDICT
extern uintptr_t kobj_getsymvalue(char *, int);	/* DDI violation */
#endif

#define	DO_PAGE_LIST	sdbc_do_page	/* enable pagelist code */

int sdbc_do_page = 0;

#define	SGIO_MAX 254

static kmutex_t sdbc_bio_mutex;
static int sdbc_bio_count;

static unsigned long page_size, page_offset_mask;

#ifdef _SD_BIO_STATS
static int __start_io_count = 0;
#endif /* _SD_BIO_STATS */

/*
 * Forward declare all statics that are used before defined to enforce
 * parameter checking. Also forward-declare all functions that have 64-bit
 * argument types to enforce correct parameter checking.
 *
 * Some (if not all) of these could be removed if the code were reordered
 */

static int _sd_sync_ea(struct buf *, iob_hook_t *);
static int _sd_async_ea(struct buf *, iob_hook_t *);
static void _sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size);
static void _sd_pack_pages_nopageio(struct buf *bp, struct buf *list,
    sd_addr_t *addr, nsc_off_t offset, nsc_size_t size);
static void _sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag);

#ifdef DEBUG
static int _sdbc_ioj_lookup(dev_t);
static void _sdbc_ioj_clear_err(int);
#endif

static int SD_WRITES_TOT = 0;
static int SD_WRITES_LEN[100];

_sd_buf_list_t _sd_buflist;

/*
 * _sd_add_vm_to_bp_plist - add the page corresponding to the
 * virtual address "v" (kernel virtaddr) to the pagelist linked
 * to buffer "bp".
 *
 * The virtual address "v" is "known" to be allocated by segkmem
 * and we can look up the page by using the segkmem vnode kvp.
 * This violates the DDI/DKI but is workable for now anyway.
 *
 *
 */
static void
_sd_add_vm_to_bp_plist(struct buf *bp, unsigned char *v)
{
	page_t *pp;
	page_t *one_pg = NULL;

	pp = page_find(&kvp, (u_offset_t)((uintptr_t)v & ~page_offset_mask));
	if (!pp) {
		cmn_err(CE_PANIC,
		    "_sd_add_vm_to_bp_plist: couldn't find page for 0x%p",
		    (void *)v);
	}

	page_add(&one_pg, pp);
	page_list_concat(&(bp->b_pages), &one_pg);

}

#ifdef _SD_BIO_STATS
static int
_sd_count_pages(page_t *pp)
{
	int cnt = 0;
	page_t *pp1;
	if (pp == NULL)
		return (cnt);

	for (cnt = 1, pp1 = pp->p_next; pp != pp1; cnt++, pp1 = pp1->p_next)
		;

	return (cnt);
}
#endif /* _SD_BIO_STATS */


/*
 * _sdbc_iobuf_load - load time initialization of io bufs structures.
 *
 *
 * RETURNS:
 *	0  - success.
 *	-1 - failure.
 *
 * USAGE:
 *	This routine initializes load time buf structures.
 *	Should be called when the cache is loaded.
 */

int
_sdbc_iobuf_load(void)
{
	mutex_init(&sdbc_bio_mutex, NULL, MUTEX_DRIVER, NULL);

	/*
	 * HACK add a ref to kvp, to prevent VN_RELE on it from panicking
	 * the system
	 */
	VN_HOLD(&kvp);

	return (0);
}

/*
 * _sdbc_iobuf_unload - unload time cleanup of io buf structures.
 *
 *
 * USAGE:
 *	This routine removes load time buf structures.
 *	Should be called when the cache is unloaded.
 */
void
_sdbc_iobuf_unload(void)
{
	/* Undo our VN_HOLD hack, by putting ref count back to normal state */
	mutex_enter(&kvp.v_lock);
	kvp.v_count = 0;
	mutex_exit(&kvp.v_lock);

	mutex_destroy(&sdbc_bio_mutex);
	bzero(&_sd_buflist, sizeof (_sd_buf_list_t));
}

/*
 * _sdbc_iobuf_configure - configure a list of io bufs for later use.
 *
 * ARGUMENTS:
 *	num_bufs - number of buffers. (from the configuration file)
 *
 * RETURNS:
 *	0  - success.
 *	<0 - failure.
 *
 * USAGE:
 *	This routine configures the buf structures for io.
 *	Should be called when the cache is configured.
 */

int
_sdbc_iobuf_configure(int num)
{
	int i;
	_sd_buf_list_t *buflist;
	iob_hook_t *hook;
	char symbol_name[32];

	if (!num || (num > _SD_DEFAULT_IOBUFS))
		num = _SD_DEFAULT_IOBUFS;

	if ((_sd_buflist.hooks = (iob_hook_t *)nsc_kmem_zalloc(
	    num * sizeof (iob_hook_t), KM_SLEEP, sdbc_iobuf_mem)) == NULL) {
		return (-1);
	}

	buflist = &_sd_buflist;
	buflist->bl_init_count = num;
	buflist->bl_hooks_avail = num;
	buflist->bl_hook_lowmark = num;
	hook = buflist->hooks;
	buflist->hook_head = hook;
	for (i = 0; i < num; i++, hook++) {
		cv_init(&hook->wait, NULL, CV_DRIVER, NULL);
		(void) sprintf(symbol_name, "sd_iob_dcb%d", i);
		hook->iob_drv_iodone = (dcb_t)kobj_getsymvalue(symbol_name, 0);
		if (!hook->iob_drv_iodone) {
			return (-2);
		}
		hook->next_hook = hook+1;
	}
	(hook-1)->next_hook = NULL;

	for (i = 0; i < MAX_HOOK_LOCKS; i++)
		mutex_init(&_sd_buflist.hook_locks[i], NULL, MUTEX_DRIVER,
		    NULL);

	cv_init(&_sd_buflist.hook_wait, NULL, CV_DRIVER, NULL);
	_sd_buflist.hook_waiters = 0;

	sdbc_bio_count = 0;
	SD_WRITES_TOT = 0;
	bzero(SD_WRITES_LEN, sizeof (SD_WRITES_LEN));

	/* pagelist i/o pages must be done in cache_init */

	page_size = ptob(1);
	page_offset_mask = page_size - 1;

	return (0);
}

/*
 * _sdbc_iobuf_deconfigure - release all memory allocated for buf list
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	Nothing.
 */
void
_sdbc_iobuf_deconfigure(void)
{
	ushort_t i;

	if (_sd_buflist.hooks) {
		for (i = 0; i < _sd_buflist.bl_init_count; i++) {
			cv_destroy(&_sd_buflist.hooks[i].wait);
		}
		cv_destroy(&_sd_buflist.hook_wait);
		nsc_kmem_free(_sd_buflist.hooks,
		    _sd_buflist.bl_init_count * sizeof (iob_hook_t));
		for (i = 0; i < MAX_HOOK_LOCKS; i++) {
			mutex_destroy(&_sd_buflist.hook_locks[i]);
		}
	}

	_sd_buflist.hooks = NULL;

#ifdef DEBUG
	{
		void _sdbc_ioj_clear_err(int);
		_sdbc_ioj_clear_err(-1);	/* clear any injected i/o errors */
		_sdbc_ioj_set_dev(-1, 0);	/* clear dev entries */
	}
#endif

}

/*
 * _sd_pending_iobuf()
 *
 * Return the number of I/O bufs outstanding
 */
int
_sd_pending_iobuf(void)
{
	return (sdbc_bio_count);
}

/*
 * _sd_get_iobuf - allocate a buf.
 *
 * ARGUMENTS:
 *	None.
 *
 * RETURNS:
 *	NULL - failure.
 *	buf ptr otherwise.
 *
 * ASSUMPTIONS - process could block if we run out.
 *
 */
/*ARGSUSED*/
static struct buf *
_sd_get_iobuf(int num_bdl)
{
	struct buf *bp;

	/* Get a buffer, ready for page list i/o */

	if (DO_PAGE_LIST)
		bp = pageio_setup(NULL, 0, &kvp, 0);
	else
		bp = getrbuf(KM_SLEEP);

	if (bp == NULL)
		return (NULL);
	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count++;
	mutex_exit(&sdbc_bio_mutex);
	return (bp);
}

/*
 * _sd_put_iobuf - put a buf back in the freelist.
 *
 * ARGUMENTS:
 *	bp - buf pointer.
 *
 * RETURNS:
 *	Nothing.
 *
 */
static void
_sd_put_iobuf(struct buf *bp)
{
	mutex_enter(&sdbc_bio_mutex);
	sdbc_bio_count--;
	mutex_exit(&sdbc_bio_mutex);
	if (DO_PAGE_LIST)
		pageio_done(bp);
	else
		freerbuf(bp);
}


/* use for ORing only */
#define	B_KERNBUF 0

static void
_sd_setup_iob(struct buf *bp, dev_t dev, nsc_off_t pos, int flag)
{
	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	flag &= (B_READ | B_WRITE);

	/*
	 * if pagelist i/o, _sd_get_iobuf()/pageio_setup() has already
	 * set b_flags to
	 *	B_KERNBUF | B_PAGEIO | B_NOCACHE | B_BUSY	(sol 6,7,8)
	 * or
	 *	B_PAGEIO | B_NOCACHE | B_BUSY			(sol 9)
	 */

	bp->b_flags |= B_KERNBUF | B_BUSY | flag;

	bp->b_error = 0;

	bp->b_forw = NULL;
	bp->b_back = NULL;

	bp->b_lblkno = (diskaddr_t)pos;
	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = dev;
}


/*
 * _sd_get_hook - get an iob hook from the free list.
 *
 * ARGUMENTS:
 *	none
 *
 * RETURNS:
 *	the newly allocated iob_hook.
 *
 */
static iob_hook_t *
_sd_get_hook(void)
{

	iob_hook_t *ret;

	mutex_enter(&sdbc_bio_mutex);

retry:
	ret = _sd_buflist.hook_head;
	if (ret)
		_sd_buflist.hook_head = ret->next_hook;
	else {
		++_sd_buflist.hook_waiters;
		if (_sd_buflist.max_hook_waiters < _sd_buflist.hook_waiters)
			_sd_buflist.max_hook_waiters = _sd_buflist.hook_waiters;
		cv_wait(&_sd_buflist.hook_wait, &sdbc_bio_mutex);
		--_sd_buflist.hook_waiters;
		goto retry;
	}

	if (_sd_buflist.bl_hook_lowmark > --_sd_buflist.bl_hooks_avail)
		_sd_buflist.bl_hook_lowmark = _sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);
	ret->skipped = 0;

	ret->count = 0;

#ifdef _SD_BIO_STATS
	ret->PAGE_IO = 0;
	ret->NORM_IO = 0;
	ret->NORM_IO_SIZE = 0;
	ret->SKIP_IO = 0;
	ret->PAGE_COMBINED = 0;
#endif /* _SD_BIO_STATS */

	return (ret);
}

/*
 * _sd_put_hook - put an iob hook back on the free list.
 *
 * ARGUMENTS:
 *	hook - an iob_hook to be returned to the freelist.
 *
 *
 */
static void
_sd_put_hook(iob_hook_t *hook)
{

	mutex_enter(&sdbc_bio_mutex);

	if (_sd_buflist.hook_waiters) {
		cv_signal(&_sd_buflist.hook_wait);
	}
	hook->next_hook = _sd_buflist.hook_head;
	_sd_buflist.hook_head = hook;

	++_sd_buflist.bl_hooks_avail;

	mutex_exit(&sdbc_bio_mutex);
}

/*
 * _sd_extend_iob - the i/o block we are handling needs a new struct buf to
 *	describe the next hunk of i/o. Get a new struct buf and initialize it
 *	based on the state in the struct buf we are passed as an arg.
 * ARGUMENTS:
 *	head_bp - a buffer header in the current i/o block we are handling.
 *		(generally the initial header but in fact could be any
 *		of the ones [if any] that were chained to the initial
 *		one).
 */
static struct buf *
_sd_extend_iob(struct buf *head_bp)
{
	struct buf *bp;
	iob_hook_t *hook = (iob_hook_t *)head_bp->b_private;


	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	bp->b_pages = NULL;
	bp->b_un.b_addr = 0;

	bp->b_flags |= (head_bp->b_flags & (B_READ | B_WRITE));

	if (!DO_PAGE_LIST)
		bp->b_flags |= B_KERNBUF | B_BUSY;

	bp->b_error = 0;

	/*
	 * b_forw/b_back will form a doubly linked list of all the buffers
	 * associated with this block of i/o.
	 * hook->tail points to the last buffer in the chain.
	 */
	bp->b_forw = NULL;
	bp->b_back = hook->tail;
	hook->tail->b_forw = bp;
	hook->tail = bp;
	hook->count++;

	ASSERT(BLK_FBA_OFF(hook->size) == 0);

	bp->b_lblkno = (diskaddr_t)hook->start_fba +
	    (diskaddr_t)FBA_NUM(hook->size);

	bp->b_bufsize = 0;
	bp->b_resid = 0;
	bp->b_proc = NULL;
	bp->b_edev = head_bp->b_edev;

	bp->b_iodone = NULL;	/* for now */
	bp->b_private = hook;

	return (bp);
}

/*
 * sd_alloc_iob - start processing a block of i/o. This allocates an initial
 *	buffer header for describing the i/o and an iob_hook for collecting
 *	information about all the i/o requests added to this buffer.
 *
 * ARGUMENTS:
 *	dev - the device all the i/o is destined for.
 *	fba_pos - the initial disk block to read.
 *	blks - ignored
 *	flag - signals whether this is a read or write request.
 *
 * RETURNS:
 *	pointer to a free struct buf which will be used to describe i/o request.
 */
/* ARGSUSED */
struct buf *
sd_alloc_iob(dev_t dev, nsc_off_t fba_pos, int blks, int flag)
{
	struct buf *bp;
	iob_hook_t *hook;

	if (!(bp = _sd_get_iobuf(0)))
		return (0);

	_sd_setup_iob(bp, dev, fba_pos, flag);

	bp->b_iodone = NULL;	/* for now */
	hook = _sd_get_hook();
	if (!hook) {
		/* can't see how this could happen */
		_sd_put_iobuf(bp);
		return (0);
	}

	/*
	 * pick an arbitrary lock
	 */
	hook->lockp = &_sd_buflist.hook_locks[((long)hook >> 9) &
	    (MAX_HOOK_LOCKS - 1)];
	hook->start_fba = fba_pos;
	hook->last_fba = fba_pos;
	hook->size = 0;
	hook->tail = bp;
	hook->chain = bp;
	hook->count = 1;
	hook->error = 0;
	bp->b_private = hook;

	return (bp);
}

/*
 * _sd_pack_pages - produce i/o requests that will perform the type of i/o
 *	described by bp (READ/WRITE). It attempts to tack the i/o onto the
 *	buf pointed to by list to minimize the number of bufs required.
 *
 * ARGUMENTS:
 *	bp - is the i/o description i.e. head
 *	list - is where to start adding this i/o request (null if we should extend)
 *	addr - address describing where the data is.
 *	offset - offset from addr where data begins
 *	size - size of the i/o request.
 */
static void
_sd_pack_pages(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr, end_addr;
	int page_end_aligned;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;
	end_addr = start_addr + size;

	page_end_aligned = !(end_addr & page_offset_mask);

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages: couldn't extend iob");
	}

	/*
	 * We only want to do pagelist i/o if we end on a page boundary.
	 * If we don't end on a page boundary we won't combine with the
	 * next request and so we may as well do it as normal as it
	 * will only use one buffer.
	 */

	if (DO_PAGE_LIST && page_end_aligned) {
		if (start_addr & page_offset_mask) {
			/*
			 * handle the partial page
			 */
			if (list->b_bufsize) {
				if (!(list = _sd_extend_iob(bp))) {
					/*
					 * we're hosed since we have no error
					 * return though we could ignore stuff
					 * from here on out and return ENOMEM
					 * when we get to sd_start_io.
					 * This will do for now.
					 */
					cmn_err(CE_PANIC,
					    "_sd_pack_pages: couldn't extend iob");
				}
			}
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */
			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *) start_addr);
			list->b_bufsize = page_size -
			    (start_addr & page_offset_mask);
			list->b_un.b_addr = (caddr_t)
			    (start_addr & page_offset_mask);
			size -= list->b_bufsize;
			start_addr += list->b_bufsize;
		}
		/*
		 * Now fill with all the full pages remaining.
		 */
		for (; size > 0; size -= page_size) {
#ifdef _SD_BIO_STATS
			hook->PAGE_IO++;
#endif /* _SD_BIO_STATS */

			_sd_add_vm_to_bp_plist(list,
			    (unsigned char *) start_addr);
			start_addr += page_size;
			list->b_bufsize += page_size;
#ifdef _SD_BIO_STATS
			if (list == orig_list)
				hook->PAGE_COMBINED++;
#endif /* _SD_BIO_STATS */
		}
		if (size)
			cmn_err(CE_PANIC, "_sd_pack_pages: bad size: %"
			    NSC_SZFMT, size);
	} else {
		/*
		 * Wasn't worth it as pagelist i/o, do as normal
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC,
			    "_sd_pack_pages: couldn't extend iob");
		}

		/* kernel virtual */
		list->b_flags &= ~(B_PHYS | B_PAGEIO);
		list->b_un.b_addr = (caddr_t)start_addr;
#ifdef _SD_BIO_STATS
		hook->NORM_IO++;
		hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
		list->b_bufsize = (size_t)size;
	}

}

/*
 * perform same function as _sd_pack_pages() when not doing pageio
 */
static void
_sd_pack_pages_nopageio(struct buf *bp, struct buf *list, sd_addr_t *addr,
    nsc_off_t offset, nsc_size_t size)
{
	uintptr_t start_addr;
#ifdef _SD_BIO_STATS
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *orig_list = list;
#endif /* _SD_BIO_STATS */

	start_addr = (uintptr_t)addr->sa_virt + offset;

	if (!list && !(list = _sd_extend_iob(bp))) {
		/*
		 * we're hosed since we have no error return...
		 * though we could ignore stuff from here on out
		 * and return ENOMEM when we get to sd_start_io.
		 * This will do for now.
		 */
		cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
		    "extend iob");
	}

	if (list->b_bufsize &&
	    (start_addr == (uintptr_t)(list->b_un.b_addr + list->b_bufsize))) {
		/* contiguous */
		list->b_bufsize += (size_t)size;
	} else {
		/*
		 * not contiguous mem (extend) or first buffer (bufsize == 0).
		 */
		if (list->b_bufsize && !(list = _sd_extend_iob(bp))) {
			/*
			 * we're hosed since we have no error return...
			 * though we could ignore stuff from here on out
			 * and return ENOMEM when we get to sd_start_io.
			 * This will do for now.
			 */
			cmn_err(CE_PANIC, "_sd_pack_pages_nopageio: couldn't "
			    "extend iob");
		}
		list->b_un.b_addr = (caddr_t)start_addr;
		list->b_bufsize = (size_t)size;
	}

#ifdef _SD_BIO_STATS
	hook->NORM_IO++;
	hook->NORM_IO_SIZE += size;
#endif /* _SD_BIO_STATS */
}

/*
 * sd_add_fba - add an i/o request to the block of i/o described by bp.
 *	We try to combine this request with the previous request. In
 *	addition we try to do the i/o as PAGELIST_IO if it satisfies
 *	the restrictions for it. If the i/o request can't be combined
 *	we extend the i/o description with a new buffer header and add
 *	it to the chain headed by bp.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *	addr - description of the address the data will be read from or
 *		written to. A NULL indicates that this i/o request doesn't
 *		need to actually happen. Used to mark reads when the fba is
 *		already in cache and dirty.
 *
 *	fba_pos - offset from address in addr where the i/o is to start.
 *
 *	fba_len - number of consecutive fbas to transfer.
 *
 * NOTE: It is assumed that the memory is physically contiguous but may span
 *	multiple pages (should a cache block be larger than a page).
 *
 */
void
sd_add_fba(struct buf *bp, sd_addr_t *addr, nsc_off_t fba_pos,
    nsc_size_t fba_len)
{
	nsc_off_t offset;
	nsc_size_t size;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	size = FBA_SIZE(fba_len);
	offset = FBA_SIZE(fba_pos);

	if (addr) {
		/*
		 * See if this can be combined with previous request(s)
		 */
		if (!bp->b_bufsize) {
			if (DO_PAGE_LIST)
				_sd_pack_pages(bp, bp, addr, offset, size);
			else
				_sd_pack_pages_nopageio(bp, bp, addr, offset,
				    size);
		} else {
			if (DO_PAGE_LIST) {
				if (hook->tail->b_flags & B_PAGEIO) {
					/*
					 * Last buffer was a pagelist. Unless a
					 * skip was detected the last request
					 * ended on a page boundary. If this
					 * one starts on one we combine the
					 * best we can.
					 */
					if (hook->skipped)
						_sd_pack_pages(bp, NULL, addr,
						    offset, size);
					else
						_sd_pack_pages(bp, hook->tail,
						    addr, offset, size);
				} else {
					/*
					 * Last buffer was vanilla i/o or worse
					 * (sd_add_mem)
					 */
					_sd_pack_pages(bp, NULL, addr, offset,
					    size);
				}
			} else {
				if (hook->skipped)
					_sd_pack_pages_nopageio(bp, NULL,
					    addr, offset, size);
				else
					_sd_pack_pages_nopageio(bp,
					    hook->tail, addr, offset, size);
			}
		}
		hook->skipped = 0;
	} else {
		/* Must be a read of dirty block we want to discard */

		ASSERT(bp->b_flags & B_READ);
#ifdef _SD_BIO_STATS
		hook->SKIP_IO++;
#endif /* _SD_BIO_STATS */
		hook->skipped = 1;
		if (!bp->b_bufsize)
			bp->b_lblkno += fba_len;
	}
	hook->size += size;

}

/*
 * sd_add_mem - add an i/o request to the block of i/o described by bp.
 *	The memory target for this i/o may span multiple pages and may
 *	not be physically contiguous. The length might also not be a
 *	multiple of an FBA.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *
 *	buf - target of this i/o request.
 *
 *	len - number of bytes to transfer.
 *
 */
void
sd_add_mem(struct buf *bp, char *buf, nsc_size_t len)
{
	nsc_size_t n;
	uintptr_t start;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;

	start = (uintptr_t)buf & page_offset_mask;

	for (; len > 0; buf += n, len -= n, start = 0) {
		n = min((nsc_size_t)len, (nsc_size_t)(page_size - start));
		/*
		 * i/o size must be multiple of an FBA since we can't
		 * count on lower level drivers to understand b_offset
		 */
		if (BLK_FBA_OFF(n) != 0) {
			cmn_err(CE_WARN,
			    "!sdbc(sd_add_mem) i/o request not FBA sized (%"
			    NSC_SZFMT ")", n);
		}

		if (!bp->b_bufsize) {
			/* first request */
			bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			bp->b_un.b_addr = buf;
			bp->b_bufsize = (size_t)n;
		} else {
			struct buf *new_bp;
			if (!(new_bp = _sd_extend_iob(bp))) {
				/* we're hosed */
				cmn_err(CE_PANIC,
				    "sd_add_mem: couldn't extend iob");
			}
			new_bp->b_flags &= ~(B_PHYS | B_PAGEIO);
			new_bp->b_un.b_addr = buf;
			new_bp->b_bufsize = (size_t)n;
		}
		hook->size += n;
	}
}


/*
 * sd_start_io - start all the i/o needed to satisfy the i/o request described
 *	by bp. If supplied a non-NULL fn then this is an async request
 *	and we will return NSC_PENDING and call fn when all the i/o complete.
 *	Otherwise this is a synchronous request and we sleep until all the
 *	i/o is complete. If any buffer in the chain gets an error we return
 *	the first error we see (once all the i/o is complete).
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o we are collecting.
 *
 *	strategy - strategy function to call if known by the user, or NULL.
 *
 *	fn - user's callback function. NULL implies synchronous request.
 *
 *	arg - an argument passed to user's callback function.
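 *
 * Typical call sequence (an illustrative sketch only, not taken from this
 * file; the variables dev, addr and fba_len stand for caller state):
 *
 *	bp = sd_alloc_iob(dev, fba_pos, fba_len, B_READ);
 *	sd_add_fba(bp, &addr, 0, fba_len);
 *	rc = sd_start_io(bp, NULL, NULL, NULL);
 *
 * With fn == NULL the call sleeps and rc is NSC_DONE or the first error;
 * with a non-NULL fn it returns NSC_PENDING and fn is called on completion.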
 *
 */
int
sd_start_io(struct buf *bp, strategy_fn_t strategy, sdbc_ea_fn_t fn,
    blind_t arg)
{
	int err;
	iob_hook_t *hook = (iob_hook_t *)bp->b_private;
	struct buf *bp_next;
	int (*ea_fn)(struct buf *, iob_hook_t *);
#ifdef _SD_BIO_STATS
	static int total_pages, total_pages_combined, total_norm;
	static int total_norm_combined, total_skipped;
	static nsc_size_t total_norm_size;

	static int total_bufs;
	static int total_xpages_w, total_ypages_w;
	static int total_xpages_r, total_ypages_r;
	static int max_run_r, max_run_w;

#endif /* _SD_BIO_STATS */

	hook->func = fn;
	hook->param = arg;
	if (fn != NULL)
		ea_fn = _sd_async_ea;
	else
		ea_fn = _sd_sync_ea;

	hook->iob_hook_iodone = ea_fn;

#ifdef _SD_BIO_STATS
	__start_io_count++;
	total_pages += hook->PAGE_IO;
	total_pages_combined += hook->PAGE_COMBINED;
	total_norm += hook->NORM_IO;
	total_norm_size += hook->NORM_IO_SIZE;
	total_skipped += hook->SKIP_IO;
#endif /* _SD_BIO_STATS */

	for (; bp; bp = bp_next) {

		DTRACE_PROBE4(sd_start_io_bufs, struct buf *, bp,
		    long, bp->b_bufsize, int, bp->b_flags,
		    iob_hook_t *, hook);

		bp_next = bp->b_forw;
		if (!(bp->b_flags & B_READ)) {
			SD_WRITES_TOT++;
			SD_WRITES_LEN[(bp->b_bufsize/32768) %
			    (sizeof (SD_WRITES_LEN)/sizeof (int))]++;
		}
		bp->b_iodone = hook->iob_drv_iodone;
		bp->b_bcount = bp->b_bufsize;
		bp->b_forw = NULL;
		bp->b_back = NULL;
		bp->b_private = NULL;

#ifdef _SD_BIO_STATS
		total_bufs++;
		if (bp->b_flags & B_PAGEIO) {
			int i;
			i = _sd_count_pages(bp->b_pages);
			if (bp->b_flags & B_READ) {
				if (i > max_run_r)
					max_run_r = i;
				total_xpages_r += i;
				total_ypages_r++;
			} else {
				if (i > max_run_w)
					max_run_w = i;
				total_xpages_w += i;
				total_ypages_w++;
			}
		}
#endif /* _SD_BIO_STATS */


		/*
		 * It's possible for us to be told to read a dirty block
		 * where all the i/o can go away (e.g. read one fba, it's
		 * in cache and dirty) so we really have nothing to do but
		 * say we're done.
		 */
		if (bp->b_bcount) {
			if (!strategy) {
				strategy =
				    nsc_get_strategy(getmajor(bp->b_edev));
			}

			if (!strategy) {
				bp->b_flags |= B_ERROR;
				bp->b_error = ENXIO;
				(*bp->b_iodone)(bp);
			} else
#ifdef DEBUG
			/* inject i/o error for testing */
			if (bp->b_error = _sdbc_ioj_lookup(bp->b_edev)) {
				bp->b_flags |= B_ERROR;
				(*bp->b_iodone)(bp);
			} else
#endif
			{
				(*strategy)(bp);
			}
		} else {
			(*bp->b_iodone)(bp);
		}

	}

#ifdef _SD_BIO_STATS
	if (__start_io_count == 2000) {
		__start_io_count = 0;
		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io) t_bufs %d pages %d "
		    "combined %d norm %d norm_size %" NSC_SZFMT " skipped %d",
		    total_bufs,
		    total_pages, total_pages_combined, total_norm,
		    total_norm_size, total_skipped);

		total_bufs = 0;
		total_pages = 0;
		total_pages_combined = 0;
		total_norm = 0;
		total_norm_combined = 0;
		total_skipped = 0;
		total_norm_size = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(r) max_run %d, total_xp %d total yp %d",
		    max_run_r, total_xpages_r, total_ypages_r);

		total_xpages_r = 0;
		total_ypages_r = 0;
		max_run_r = 0;

		cmn_err(CE_WARN,
		    "!sdbc(sd_start_io)(w) max_run %d, total_xp %d total yp %d",
		    max_run_w, total_xpages_w, total_ypages_w);

		total_xpages_w = 0;
		total_ypages_w = 0;
		max_run_w = 0;
	}
#endif /* _SD_BIO_STATS */

	if (ea_fn == _sd_async_ea) {
		DTRACE_PROBE(sd_start_io_end);

		return (NSC_PENDING);
	}

	mutex_enter(hook->lockp);

	while (hook->count) {
		cv_wait(&hook->wait, hook->lockp);
	}
	mutex_exit(hook->lockp);

	err = hook->error ? hook->error : NSC_DONE;
	bp = hook->tail;
	_sd_put_hook(hook);
	_sd_put_iobuf(bp);

	return (err);
}

/*
 * _sd_sync_ea - called when a single i/o operation is complete. If this
 *	is the last outstanding i/o we wake up the sleeper.
 *	If this i/o had an error then we store the error result in the
 *	iob_hook if this was the first error.
 *
 * ARGUMENTS:
 *	bp - the struct buf describing the block i/o that just completed.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 */

static int
_sd_sync_ea(struct buf *bp, iob_hook_t *hook)
{

	int error;
	int done;

	/*
	 * We get called for each buf that completes. When they are all done,
	 * we wake up the waiter.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);

	if (!hook->error)
		hook->error = error;

	done = !(--hook->count);
	if (done) {
		/* remember the last buffer so we can free it later */
		hook->tail = bp;
		cv_signal(&hook->wait);
	}
	mutex_exit(hook->lockp);

	/*
	 * let sd_start_io free the final buffer so the hook can be returned
	 * first.
	 */
	if (!done)
		_sd_put_iobuf(bp);

	return (0);
}

/*
 * _sd_async_ea - End action for async read/write.
 *
 * ARGUMENTS:
 *	bp - io buf pointer.
 *
 * RETURNS:
 *	0.
 *
 * Comments:
 *	This routine is called at interrupt level when the io is done.
 *	This is only called when the operation is asynchronous.
 */
static int
_sd_async_ea(struct buf *bp, iob_hook_t *hook)
{
	int done, error;

	/*
	 * We get called for each buf that completes. When they are all done,
	 * we call the requestor's callback function.
	 */
	error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;

	mutex_enter(hook->lockp);
	done = !(--hook->count);

	if (!hook->error)
		hook->error = error;

	mutex_exit(hook->lockp);

	bp->b_forw = NULL;
	bp->b_back = NULL;

	if (done) {
		nsc_off_t fba_pos;
		nsc_size_t fba_len;
		int error;
		sdbc_ea_fn_t fn;
		blind_t arg;

		arg = hook->param;
		fn = hook->func;
		error = hook->error;
#if defined(_SD_DEBUG)	/* simulate disk errors */
		if (_test_async_fail == bp->b_edev)
			error = EIO;
#endif

		/* MAKE SURE b_lblkno, b_count never changes!! */
		fba_pos = hook->start_fba;
		fba_len = FBA_LEN(hook->size);

		_sd_put_hook(hook);
		_sd_put_iobuf(bp);
		(*fn)(arg, fba_pos, fba_len, error);
	} else
		_sd_put_iobuf(bp);

	return (0);
}

#ifdef DEBUG
typedef struct ioerr_inject_s {
	dev_t ioj_dev;
	int ioj_err;
	int ioj_cnt;
} ioerr_inject_t;

static ioerr_inject_t *ioerr_inject_table = NULL;

void
_sdbc_ioj_load()
{
	ioerr_inject_table =
	    kmem_zalloc(sdbc_max_devs * sizeof (ioerr_inject_t), KM_SLEEP);
}

void
_sdbc_ioj_unload()
{
	if (ioerr_inject_table != NULL) {
		kmem_free(ioerr_inject_table,
		    sdbc_max_devs * sizeof (ioerr_inject_t));
		ioerr_inject_table = NULL;
	}
}

static int
_sdbc_ioj_lookup(dev_t dev)
{
	int cd;

	for (cd = 0; cd < sdbc_max_devs; ++cd)
		if (ioerr_inject_table[cd].ioj_dev == dev) {
			if (ioerr_inject_table[cd].ioj_cnt > 0) {
				--ioerr_inject_table[cd].ioj_cnt;
				return (0);
			} else {
				return (ioerr_inject_table[cd].ioj_err);
			}
		}
	return (0);
}

void
_sdbc_ioj_set_dev(int cd, dev_t crdev)
{
	int i;

	if (cd == -1) {	/* all -- used for clearing table on shutdown */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_dev = crdev;
		}
	} else
		ioerr_inject_table[cd].ioj_dev = crdev;	/* assume valid cd */
}

static void
_sdbc_ioj_set_err(int cd, int err, int count)
{
	int i;

	if (cd == -1) {	/* all */
		for (i = 0; i < sdbc_max_devs; ++i) {
			ioerr_inject_table[i].ioj_err = err;
			ioerr_inject_table[i].ioj_cnt = count;
		}
	} else {
		ioerr_inject_table[cd].ioj_err = err;
		ioerr_inject_table[cd].ioj_cnt = count;
	}
}

static void
_sdbc_ioj_clear_err(int cd)
{
	_sdbc_ioj_set_err(cd, 0, 0);
}

int
_sdbc_inject_ioerr(int cd, int ioj_err, int count)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_set_err(cd, ioj_err, count);

	return (0);
}

int
_sdbc_clear_ioerr(int cd)
{
	if ((cd < -1) || (cd >= sdbc_max_devs))
		return (EINVAL);

	_sdbc_ioj_clear_err(cd);

	return (0);
}
#endif
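
/*
 * Illustrative sketch (not part of the original source) of how a DEBUG
 * kernel could exercise the error-injection table above; "cd" and "crdev"
 * are hypothetical caller values. Only _sdbc_ioj_set_dev(),
 * _sdbc_inject_ioerr() and _sdbc_clear_ioerr() are entry points defined
 * in this file:
 *
 *	_sdbc_ioj_set_dev(cd, crdev);		map cache descriptor to dev
 *	(void) _sdbc_inject_ioerr(cd, EIO, 2);	let 2 i/os pass, then EIO
 *	...issue i/o through sd_start_io()...
 *	(void) _sdbc_clear_ioerr(cd);		remove the injected error
 */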