/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/scsi/scsi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/var.h>

#include "sd_xbuf.h"

/*
 * xbuf.c: buf(9S) extension facility.
 *
 * The buf(9S) extension facility is intended to allow block drivers to
 * allocate additional memory that is associated with a particular buf(9S)
 * struct. It is further intended to help in addressing the usual set of
 * problems associated with such allocations, in particular those involving
 * recovery from allocation failures, especially in code paths that the
 * system relies on to free memory.
 *
 * CAVEAT: Currently this code is completely private to the sd driver and in
 * NO WAY constitutes a public or supported interface of any kind. It is
 * envisioned that this may one day migrate into the Solaris DDI, but until
 * that time this ought to be considered completely unstable and is subject
 * to change without notice. This code may NOT in any way be utilized by
 * ANY code outside the sd driver.
 */
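
/*
 * Typical calling sequence, as an illustrative sketch only (not actual
 * sd code; "my_xbuf", "my_xa_strategy", "un" and "un_xbuf_attr" are
 * hypothetical driver names):
 *
 *	During attach(9E), create an attribute handle describing the
 *	per-buf extension size and the driver's iostart callback:
 *
 *		un->un_xbuf_attr = ddi_xbuf_attr_create(
 *		    sizeof (struct my_xbuf), my_xa_strategy, un,
 *		    256, 16, ddi_driver_major(dip), 0);
 *
 *	In the driver's strategy(9E) routine, hand the buf to the facility
 *	instead of starting the transfer directly:
 *
 *		return (ddi_xbuf_qstrategy(bp, un->un_xbuf_attr));
 *
 *	The facility calls my_xa_strategy(bp, xp, arg) once an xbuf has been
 *	allocated for the buf, and the driver starts the transfer from there.
 *	On completion the driver calls ddi_xbuf_done() immediately before
 *	biodone(9F), and at detach(9E) time it calls ddi_xbuf_attr_destroy().
 */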

static int xbuf_iostart(ddi_xbuf_attr_t xap);
static void xbuf_dispatch(ddi_xbuf_attr_t xap);
static void xbuf_restart_callback(void *arg);
static void xbuf_enqueue(struct buf *bp, ddi_xbuf_attr_t xap);
static int xbuf_brk_done(struct buf *bp);


/*
 * Note: Should this be exposed to the caller, i.e., do we want to give the
 * caller the flexibility of specifying the parameters for the thread pool?
 * Note: these values are just estimates at this time, based upon what
 * seems reasonable for the sd driver. It may be preferable to make these
 * parameters self-scaling in a real (future) implementation.
 */
#define	XBUF_TQ_MINALLOC	64
#define	XBUF_TQ_MAXALLOC	512
#define	XBUF_DISPATCH_DELAY	(drv_usectohz(50000))	/* 50 msec */

static taskq_t *xbuf_tq = NULL;
static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC;
static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC;

static kmutex_t xbuf_mutex = { 0 };
static uint32_t xbuf_refcount = 0;

/*
 * Private wrapper for buf cloned via ddi_xbuf_qstrategy()
 */
struct xbuf_brk {
	kmutex_t mutex;
	struct buf *bp0;
	uint8_t nbufs;		/* number of bufs allocated */
	uint8_t active;		/* number of active xfers */

	size_t brksize;		/* break size used for this buf */
	int brkblk;

	/* xfer position */
	off_t off;
	off_t noff;
	daddr_t blkno;
};

_NOTE(DATA_READABLE_WITHOUT_LOCK(xbuf_brk::off))

/*
 * Hack needed in the prototype so buf breakup will work.
 * Here we can rely on the sd code not changing the value in
 * b_forw.
 */
#define	b_clone_private	b_forw


/* ARGSUSED */
DDII ddi_xbuf_attr_t
ddi_xbuf_attr_create(size_t xsize,
	void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg),
	void *attr_arg, uint32_t active_limit, uint32_t reserve_limit,
	major_t major, int flags)
{
	ddi_xbuf_attr_t xap;

	xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP);

	mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL);

	/* Future: Allow the caller to specify alignment requirements? */
	xap->xa_allocsize = max(xsize, sizeof (void *));
	xap->xa_active_limit = active_limit;
	xap->xa_active_lowater = xap->xa_active_limit / 2;
	xap->xa_reserve_limit = reserve_limit;
	xap->xa_strategy = xa_strategy;
	xap->xa_attr_arg = attr_arg;

	mutex_enter(&xbuf_mutex);
	if (xbuf_refcount == 0) {
		ASSERT(xbuf_tq == NULL);
		/*
		 * Note: Would be nice if: (1) #threads in the taskq pool (set
		 * to the value of 'ncpus' at the time the taskq is created)
		 * could adjust automatically with DR; (2) the taskq
		 * minalloc/maxalloc counts could be grown/shrunk on the fly.
		 */
		xbuf_tq = taskq_create("xbuf_taskq", ncpus,
		    (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc,
		    xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE);
	}
	xbuf_refcount++;
	mutex_exit(&xbuf_mutex);

	/* In this prototype we just always use the global system pool. */
	xap->xa_tq = xbuf_tq;

	return (xap);
}


DDII void
ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t xp;

	mutex_destroy(&xap->xa_mutex);
	mutex_destroy(&xap->xa_reserve_mutex);

	/* Free any xbufs on the reserve list */
	while (xap->xa_reserve_count != 0) {
		xp = xap->xa_reserve_headp;
		xap->xa_reserve_headp = *((void **)xp);
		xap->xa_reserve_count--;
		kmem_free(xp, xap->xa_allocsize);
	}
	ASSERT(xap->xa_reserve_headp == NULL);

	mutex_enter(&xbuf_mutex);
	ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL));
	xbuf_refcount--;
	if (xbuf_refcount == 0) {
		taskq_destroy(xbuf_tq);
		xbuf_tq = NULL;
	}
	mutex_exit(&xbuf_mutex);

	kmem_free(xap, sizeof (struct __ddi_xbuf_attr));
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
	/* Currently a no-op in this prototype */
}

DDII int
ddi_xbuf_attr_setup_brk(ddi_xbuf_attr_t xap, size_t size)
{
	if (size < DEV_BSIZE)
		return (0);

	mutex_enter(&xap->xa_mutex);
	xap->xa_brksize = size & ~(DEV_BSIZE - 1);
	mutex_exit(&xap->xa_mutex);
	return (1);
}
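
/*
 * Illustrative sketch (not actual sd code): a driver that wants large
 * transfers split into smaller pieces enables breakup once, typically at
 * attach(9E) time, using whatever maximum transfer size its hardware
 * supports (the 256 KB figure and the "un" names below are assumptions):
 *
 *	(void) ddi_xbuf_attr_setup_brk(un->un_xbuf_attr, 256 * 1024);
 *
 * The size is rounded down to a DEV_BSIZE multiple, and the call returns 0
 * (leaving breakup disabled) if the size is smaller than DEV_BSIZE. Once
 * enabled, any buf queued via ddi_xbuf_qstrategy() whose b_bcount exceeds
 * the break size is issued as a series of bioclone(9F) segments; see
 * ddi_xbuf_qstrategy() and xbuf_iostart() below.
 */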

/*
 * Enqueue the given buf and attempt to initiate IO.
 * Called from the driver strategy(9E) routine.
 */

DDII int
ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	mutex_enter(&xap->xa_mutex);

	ASSERT((bp->b_bcount & (DEV_BSIZE - 1)) == 0);

	/*
	 * Break up the buf if necessary; bp->b_private is temporarily
	 * used to save the xbuf_brk.
	 */
	if (xap->xa_brksize && bp->b_bcount > xap->xa_brksize) {
		struct xbuf_brk *brkp;

		brkp = kmem_zalloc(sizeof (struct xbuf_brk), KM_SLEEP);
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*brkp))
		mutex_init(&brkp->mutex, NULL, MUTEX_DRIVER, NULL);
		brkp->bp0 = bp;
		brkp->brksize = xap->xa_brksize;
		brkp->brkblk = btodt(xap->xa_brksize);
		brkp->noff = xap->xa_brksize;
		brkp->blkno = bp->b_blkno;
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*brkp))
		bp->b_private = brkp;
	} else {
		bp->b_private = NULL;
	}

	/* Enqueue buf */
	if (xap->xa_headp == NULL) {
		xap->xa_headp = xap->xa_tailp = bp;
	} else {
		xap->xa_tailp->av_forw = bp;
		xap->xa_tailp = bp;
	}
	bp->av_forw = NULL;

	xap->xa_pending++;
	mutex_exit(&xap->xa_mutex);
	return (xbuf_iostart(xap));
}


/*
 * Drivers call this immediately before calling biodone(9F), to notify the
 * framework that the indicated xbuf is no longer being used by the driver.
 * May be called under interrupt context.
 */

DDII int
ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap)
{
	ddi_xbuf_t xp;
	int done;

	ASSERT(bp != NULL);
	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	xp = ddi_xbuf_get(bp, xap);

	mutex_enter(&xap->xa_mutex);

#ifdef SDDEBUG
	if (xap->xa_active_limit != 0) {
		ASSERT(xap->xa_active_count > 0);
	}
#endif
	xap->xa_active_count--;

	if (xap->xa_reserve_limit != 0) {
		mutex_enter(&xap->xa_reserve_mutex);
		if (xap->xa_reserve_count < xap->xa_reserve_limit) {
			/* Put this xbuf onto the reserve list & exit */
			*((void **)xp) = xap->xa_reserve_headp;
			xap->xa_reserve_headp = xp;
			xap->xa_reserve_count++;
			mutex_exit(&xap->xa_reserve_mutex);
			goto done;
		}
		mutex_exit(&xap->xa_reserve_mutex);
	}

	kmem_free(xp, xap->xa_allocsize);	/* return it to the system */

done:
	if (bp->b_iodone == xbuf_brk_done) {
		struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;

		brkp->active--;
		if (brkp->active || xap->xa_headp == brkp->bp0) {
			done = 0;
		} else {
			brkp->off = -1;	/* mark bp0 as completed */
			done = 1;
		}
	} else {
		done = 1;
	}

	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}

	mutex_exit(&xap->xa_mutex);
	return (done);
}

static int
xbuf_brk_done(struct buf *bp)
{
	struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private;
	struct buf *bp0 = brkp->bp0;
	int done;

	mutex_enter(&brkp->mutex);
	if (bp->b_flags & B_ERROR && !(bp0->b_flags & B_ERROR)) {
		bp0->b_flags |= B_ERROR;
		bp0->b_error = bp->b_error;
	}
	if (bp->b_resid)
		bp0->b_resid = bp0->b_bcount;

	freerbuf(bp);
	brkp->nbufs--;

	done = (brkp->off == -1 && brkp->nbufs == 0);
	mutex_exit(&brkp->mutex);

	/* All buf segments done */
	if (done) {
		mutex_destroy(&brkp->mutex);
		kmem_free(brkp, sizeof (struct xbuf_brk));
		biodone(bp0);
	}
	return (0);
}

DDII void
ddi_xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	mutex_enter(&xap->xa_mutex);
	if ((xap->xa_active_limit == 0) ||
	    (xap->xa_active_count <= xap->xa_active_lowater)) {
		xbuf_dispatch(xap);
	}
	mutex_exit(&xap->xa_mutex);
}
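
/*
 * Completion path, as an illustrative sketch only ("my_intr", "my_unit"
 * and "un_xbuf_attr" are hypothetical driver names, not part of this
 * facility). The driver releases the xbuf just before biodone(9F):
 *
 *	static void
 *	my_intr(struct my_unit *un, struct buf *bp)
 *	{
 *		... harvest the transfer status into bp ...
 *		(void) ddi_xbuf_done(bp, un->un_xbuf_attr);
 *		biodone(bp);
 *	}
 *
 * For a buf that was broken up, bp here is one of the bioclone(9F)
 * segments; its biodone(9F) call runs xbuf_brk_done() above, which frees
 * the clone and issues biodone(9F) on the original buf once every segment
 * has completed.
 */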

/*
 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get()
 * unless we explicitly hide the xbuf pointer somewhere in the buf
 * during allocation, and then rely on the driver never changing it.
 * We can probably get away with using b_private for this for now,
 * though it really is somewhat gnarly.
 */

/* ARGSUSED */
DDII ddi_xbuf_t
ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap)
{
	return (bp->b_private);
}


/*
 * Initiate IOs for bufs on the queue. Called from kernel thread or taskq
 * thread context. May execute concurrently for the same ddi_xbuf_attr_t.
 */

static int
xbuf_iostart(ddi_xbuf_attr_t xap)
{
	struct buf *bp;
	ddi_xbuf_t xp;

	ASSERT(xap != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));
	ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

	/*
	 * For each request on the queue, attempt to allocate the specified
	 * xbuf extension area, and call the driver's iostart() routine.
	 * We process as many requests on the queue as we can, until either
	 * (1) we run out of requests; or
	 * (2) we run out of resources; or
	 * (3) we reach the maximum limit for the given ddi_xbuf_attr_t.
	 */
	for (;;) {
		mutex_enter(&xap->xa_mutex);

		if ((bp = xap->xa_headp) == NULL) {
			break;	/* queue empty */
		}

		if ((xap->xa_active_limit != 0) &&
		    (xap->xa_active_count >= xap->xa_active_limit)) {
			break;	/* allocation limit reached */
		}

		/*
		 * If the reserve_limit is non-zero then work with the
		 * reserve, else always allocate a new struct.
		 */
		if (xap->xa_reserve_limit != 0) {
			/*
			 * Don't penalize EVERY I/O by always allocating a new
			 * struct for the sake of maintaining an untouched
			 * reserve for a pathological condition that may never
			 * happen. Use the reserve entries first; this uses the
			 * reserve like a local pool rather than a reserve that
			 * goes untouched. Make sure it's re-populated whenever
			 * it gets fully depleted, just in case it really is
			 * needed. This is safe because under the pathological
			 * condition, when the system runs out of memory such
			 * that the below allocs fail, the reserve will still
			 * be available whether the entries are saved away on
			 * the queue unused or in-transport somewhere. Thus
			 * progress can still continue, however slowly.
			 */
			mutex_enter(&xap->xa_reserve_mutex);
			if (xap->xa_reserve_count != 0) {
				ASSERT(xap->xa_reserve_headp != NULL);
				/* Grab an xbuf from the reserve */
				xp = xap->xa_reserve_headp;
				xap->xa_reserve_headp = *((void **)xp);
				ASSERT(xap->xa_reserve_count > 0);
				xap->xa_reserve_count--;
			} else {
				/*
				 * Either this is the first time through,
				 * or the reserve has been totally depleted.
				 * Re-populate the reserve (pool). Excess
				 * structs get released in the done path.
				 */
				while (xap->xa_reserve_count <
				    xap->xa_reserve_limit) {
					xp = kmem_alloc(xap->xa_allocsize,
					    KM_NOSLEEP);
					if (xp == NULL) {
						break;
					}
					*((void **)xp) = xap->xa_reserve_headp;
					xap->xa_reserve_headp = xp;
					xap->xa_reserve_count++;
				}
				/* And one more to use right now. */
				xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
			}
			mutex_exit(&xap->xa_reserve_mutex);
		} else {
			/*
			 * Try to alloc a new xbuf struct. If this fails just
			 * exit for now. We'll get back here again either upon
			 * cmd completion or via the timer handler.
			 * Question: what if the allocation attempt for the
			 * very first cmd fails? There are no outstanding cmds,
			 * so how do we get back here?
			 * Should look at un_ncmds_in_transport; if it's zero,
			 * then schedule xbuf_restart_callback via the timer.
			 * Although that breaks the architecture by bringing
			 * softstate data into this code.
			 */
			xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
		}
		if (xp == NULL) {
			break;	/* Can't process a cmd right now. */
		}

		/*
		 * Always run the counter. It's used/needed when
		 * xa_active_limit is non-zero, which is the typical (and
		 * right now only) case.
		 */
		xap->xa_active_count++;

		if (bp->b_private) {
			struct xbuf_brk *brkp = bp->b_private;
			struct buf *bp0 = bp;

			brkp->active++;

			mutex_enter(&brkp->mutex);
			brkp->nbufs++;
			mutex_exit(&brkp->mutex);

			if (brkp->noff < bp0->b_bcount) {
				bp = bioclone(bp0, brkp->off, brkp->brksize,
				    bp0->b_edev, brkp->blkno, xbuf_brk_done,
				    NULL, KM_SLEEP);

				/* update xfer position */
				brkp->off = brkp->noff;
				brkp->noff += brkp->brksize;
				brkp->blkno += brkp->brkblk;
			} else {
				bp = bioclone(bp0, brkp->off,
				    bp0->b_bcount - brkp->off, bp0->b_edev,
				    brkp->blkno, xbuf_brk_done, NULL,
				    KM_SLEEP);

				/* unlink the buf from the list */
				xap->xa_headp = bp0->av_forw;
				bp0->av_forw = NULL;
			}
			bp->b_clone_private = (struct buf *)brkp;
		} else {
			/* unlink the buf from the list */
			xap->xa_headp = bp->av_forw;
			bp->av_forw = NULL;
		}

		/*
		 * Hack needed in the prototype so ddi_xbuf_get() will work.
		 * Here we can rely on the sd code not changing the value in
		 * b_private (in fact it wants it there). See ddi_xbuf_get().
		 */
		bp->b_private = xp;

		/* call the driver's iostart routine */
		mutex_exit(&xap->xa_mutex);
		(*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg);
	}

	ASSERT(xap->xa_pending > 0);
	xap->xa_pending--;
	mutex_exit(&xap->xa_mutex);
	return (0);
}
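
/*
 * Worked example of the breakup path above (the sizes are illustrative
 * assumptions, not sd defaults): with xa_brksize set to 256 KB, a 1 MB buf
 * queued via ddi_xbuf_qstrategy() gets an xbuf_brk with off = 0,
 * noff = 256 KB and blkno = bp->b_blkno. Each pass through xbuf_iostart()
 * then issues one bioclone(9F) segment:
 *
 *	segment 1: off 0,      length 256 KB	(noff advances to 512 KB)
 *	segment 2: off 256 KB, length 256 KB	(noff advances to 768 KB)
 *	segment 3: off 512 KB, length 256 KB	(noff advances to 1 MB)
 *	segment 4: off 768 KB, length 256 KB	(tail; bp0 is unlinked)
 *
 * The original buf stays at the head of the queue until the final segment
 * is generated, so the loop keeps finding it. biodone(9F) on bp0 is issued
 * from xbuf_brk_done() once all four segments have completed.
 */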

/*
 * Re-start IO processing if there is anything on the queue, AND if the
 * restart function is not already running/pending for this ddi_xbuf_attr_t.
 */
static void
xbuf_dispatch(ddi_xbuf_attr_t xap)
{
	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(mutex_owned(&xap->xa_mutex));

	if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) &&
	    (xap->xa_pending == 0)) {
		/*
		 * First try to see if we can dispatch the restart function
		 * immediately, in a taskq thread. If this fails, then
		 * schedule a timeout(9F) callback to try again later.
		 */
		if (taskq_dispatch(xap->xa_tq,
		    (void (*)(void *)) xbuf_iostart, xap, KM_NOSLEEP) == 0) {
			/*
			 * Unable to enqueue the request for the taskq thread,
			 * try again later. Note that this will keep re-trying
			 * until taskq_dispatch() succeeds.
			 */
			xap->xa_timeid = timeout(xbuf_restart_callback, xap,
			    XBUF_DISPATCH_DELAY);
		} else {
			/*
			 * This indicates that xbuf_iostart() will soon be
			 * run for this ddi_xbuf_attr_t, and we do not need to
			 * schedule another invocation via timeout/taskq.
			 */
			xap->xa_pending++;
		}
	}
}

/*
 * timeout(9F) callback routine for xbuf restart mechanism.
 */
static void
xbuf_restart_callback(void *arg)
{
	ddi_xbuf_attr_t xap = arg;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);
	xap->xa_timeid = NULL;
	xbuf_dispatch(xap);
	mutex_exit(&xap->xa_mutex);
}


DDII void
ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *))
{
	struct buf *bp;
	struct buf *next_bp;
	struct buf *prev_bp = NULL;

	ASSERT(xap != NULL);
	ASSERT(xap->xa_tq != NULL);
	ASSERT(!mutex_owned(&xap->xa_mutex));

	mutex_enter(&xap->xa_mutex);

	for (bp = xap->xa_headp; bp != NULL; bp = next_bp) {

		next_bp = bp->av_forw;	/* Save for next iteration */

		/*
		 * If the user-supplied function is non-NULL and returns
		 * FALSE, then just leave the current bp on the queue.
		 */
		if ((funcp != NULL) && (!(*funcp)(bp))) {
			prev_bp = bp;
			continue;
		}

		/* de-queue the bp */
		if (bp == xap->xa_headp) {
			xap->xa_headp = next_bp;
			if (xap->xa_headp == NULL) {
				xap->xa_tailp = NULL;
			}
		} else {
			ASSERT(xap->xa_headp != NULL);
			ASSERT(prev_bp != NULL);
			if (bp == xap->xa_tailp) {
				ASSERT(next_bp == NULL);
				xap->xa_tailp = prev_bp;
			}
			prev_bp->av_forw = next_bp;
		}
		bp->av_forw = NULL;

		/* Add the bp to the flush queue */
		if (xap->xa_flush_headp == NULL) {
			ASSERT(xap->xa_flush_tailp == NULL);
			xap->xa_flush_headp = xap->xa_flush_tailp = bp;
		} else {
			ASSERT(xap->xa_flush_tailp != NULL);
			xap->xa_flush_tailp->av_forw = bp;
			xap->xa_flush_tailp = bp;
		}
	}

	while ((bp = xap->xa_flush_headp) != NULL) {
		xap->xa_flush_headp = bp->av_forw;
		if (xap->xa_flush_headp == NULL) {
			xap->xa_flush_tailp = NULL;
		}
		mutex_exit(&xap->xa_mutex);
		bioerror(bp, EIO);
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		mutex_enter(&xap->xa_mutex);
	}

	mutex_exit(&xap->xa_mutex);
}
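
/*
 * Illustrative usage sketch for ddi_xbuf_flushq() (hypothetical names, not
 * actual sd code): when a device is being torn down or has failed, the
 * driver can error out everything still waiting on the xbuf queue. Passing
 * NULL for the callback flushes every queued buf; a non-NULL callback can
 * return 0 to leave selected bufs on the queue:
 *
 *	static int
 *	my_flush_this_buf(struct buf *bp)
 *	{
 *		return (1);	(flush, i.e. fail, every queued buf)
 *	}
 *
 *	...
 *	ddi_xbuf_flushq(un->un_xbuf_attr, my_flush_this_buf);
 *
 * Each flushed buf is completed with bioerror(bp, EIO), b_resid set to
 * b_bcount, and biodone(9F).
 */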