1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/scsi/scsi.h> 27 #include <sys/ddi.h> 28 #include <sys/sunddi.h> 29 #include <sys/thread.h> 30 #include <sys/var.h> 31 32 #include "sd_xbuf.h" 33 34 /* 35 * xbuf.c: buf(9s) extension facility. 36 * 37 * The buf(9S) extension facility is intended to allow block drivers to 38 * allocate additional memory that is associated with a particular buf(9S) 39 * struct. It is further intended to help in addressing the usual set of 40 * problems associated with such allocations, in particular those involving 41 * recovery from allocation failures, especially in code paths that the 42 * system relies on to free memory. 43 * 44 * CAVEAT: Currently this code is completely private to the sd driver and in 45 * NO WAY constitutes a public or supported interface of any kind. It is 46 * envisioned that this may one day migrate into the Solaris DDI, but until 47 * that time this ought to be considered completely unstable and is subject 48 * to change without notice. This code may NOT in any way be utilized by 49 * ANY code outside the sd driver. 50 */ 51 52 53 static int xbuf_iostart(ddi_xbuf_attr_t xap); 54 static void xbuf_dispatch(ddi_xbuf_attr_t xap); 55 static void xbuf_restart_callback(void *arg); 56 static int xbuf_brk_done(struct buf *bp); 57 58 59 /* 60 * Note: Should this be exposed to the caller.... do we want to give the 61 * caller the fexibility of specifying the parameters for the thread pool? 62 * Note: these values are just estimates at this time, based upon what 63 * seems reasonable for the sd driver. It may be preferable to make these 64 * parameters self-scaling in a real (future) implementation. 65 */ 66 #define XBUF_TQ_MINALLOC 64 67 #define XBUF_TQ_MAXALLOC 512 68 #define XBUF_DISPATCH_DELAY (drv_usectohz(50000)) /* 50 msec */ 69 70 static taskq_t *xbuf_tq = NULL; 71 static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC; 72 static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC; 73 74 static kmutex_t xbuf_mutex = { 0 }; 75 static uint32_t xbuf_refcount = 0; 76 77 /* 78 * Private wrapper for buf cloned via ddi_xbuf_qstrategy() 79 */ 80 struct xbuf_brk { 81 kmutex_t mutex; 82 struct buf *bp0; 83 uint8_t nbufs; /* number of buf allocated */ 84 uint8_t active; /* number of active xfer */ 85 86 size_t brksize; /* break size used for this buf */ 87 int brkblk; 88 89 /* xfer position */ 90 off_t off; 91 off_t noff; 92 daddr_t blkno; 93 }; 94 95 _NOTE(DATA_READABLE_WITHOUT_LOCK(xbuf_brk::off)) 96 97 /* 98 * Hack needed in the prototype so buf breakup will work. 99 * Here we can rely on the sd code not changing the value in 100 * b_forw. 101 */ 102 #define b_clone_private b_forw 103 104 105 /* ARGSUSED */ 106 DDII ddi_xbuf_attr_t 107 ddi_xbuf_attr_create(size_t xsize, 108 void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg), 109 void *attr_arg, uint32_t active_limit, uint32_t reserve_limit, 110 major_t major, int flags) 111 { 112 ddi_xbuf_attr_t xap; 113 114 xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP); 115 116 mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL); 117 mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL); 118 119 /* Future: Allow the caller to specify alignment requirements? */ 120 xap->xa_allocsize = max(xsize, sizeof (void *)); 121 xap->xa_active_limit = active_limit; 122 xap->xa_active_lowater = xap->xa_active_limit / 2; 123 xap->xa_reserve_limit = reserve_limit; 124 xap->xa_strategy = xa_strategy; 125 xap->xa_attr_arg = attr_arg; 126 127 mutex_enter(&xbuf_mutex); 128 if (xbuf_refcount == 0) { 129 ASSERT(xbuf_tq == NULL); 130 /* 131 * Note: Would be nice if: (1) #threads in the taskq pool (set 132 * to the value of 'ncpus' at the time the taskq is created) 133 * could adjust automatically with DR; (2) the taskq 134 * minalloc/maxalloc counts could be grown/shrunk on the fly. 135 */ 136 xbuf_tq = taskq_create("xbuf_taskq", ncpus, 137 (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc, 138 xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE); 139 } 140 xbuf_refcount++; 141 mutex_exit(&xbuf_mutex); 142 143 /* In this prototype we just always use the global system pool. */ 144 xap->xa_tq = xbuf_tq; 145 146 return (xap); 147 } 148 149 150 DDII void 151 ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap) 152 { 153 ddi_xbuf_t xp; 154 155 mutex_destroy(&xap->xa_mutex); 156 mutex_destroy(&xap->xa_reserve_mutex); 157 158 /* Free any xbufs on the reserve list */ 159 while (xap->xa_reserve_count != 0) { 160 xp = xap->xa_reserve_headp; 161 xap->xa_reserve_headp = *((void **)xp); 162 xap->xa_reserve_count--; 163 kmem_free(xp, xap->xa_allocsize); 164 } 165 ASSERT(xap->xa_reserve_headp == NULL); 166 167 mutex_enter(&xbuf_mutex); 168 ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL)); 169 xbuf_refcount--; 170 if (xbuf_refcount == 0) { 171 taskq_destroy(xbuf_tq); 172 xbuf_tq = NULL; 173 } 174 mutex_exit(&xbuf_mutex); 175 176 kmem_free(xap, sizeof (struct __ddi_xbuf_attr)); 177 } 178 179 180 /* ARGSUSED */ 181 DDII void 182 ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip) 183 { 184 /* Currently a no-op in this prototype */ 185 } 186 187 188 /* ARGSUSED */ 189 DDII void 190 ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip) 191 { 192 /* Currently a no-op in this prototype */ 193 } 194 195 DDII int 196 ddi_xbuf_attr_setup_brk(ddi_xbuf_attr_t xap, size_t size) 197 { 198 if (size < DEV_BSIZE) 199 return (0); 200 201 mutex_enter(&xap->xa_mutex); 202 xap->xa_brksize = size & ~(DEV_BSIZE - 1); 203 mutex_exit(&xap->xa_mutex); 204 return (1); 205 } 206 207 208 209 /* 210 * Enqueue the given buf and attempt to initiate IO. 211 * Called from the driver strategy(9E) routine. 212 */ 213 214 DDII int 215 ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap) 216 { 217 ASSERT(xap != NULL); 218 ASSERT(!mutex_owned(&xap->xa_mutex)); 219 ASSERT(!mutex_owned(&xap->xa_reserve_mutex)); 220 221 mutex_enter(&xap->xa_mutex); 222 223 ASSERT((bp->b_bcount & (DEV_BSIZE - 1)) == 0); 224 225 /* 226 * Breakup buf if necessary. bp->b_private is temporarily 227 * used to save xbuf_brk 228 */ 229 if (xap->xa_brksize && bp->b_bcount > xap->xa_brksize) { 230 struct xbuf_brk *brkp; 231 232 brkp = kmem_zalloc(sizeof (struct xbuf_brk), KM_SLEEP); 233 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*brkp)) 234 mutex_init(&brkp->mutex, NULL, MUTEX_DRIVER, NULL); 235 brkp->bp0 = bp; 236 brkp->brksize = xap->xa_brksize; 237 brkp->brkblk = btodt(xap->xa_brksize); 238 brkp->noff = xap->xa_brksize; 239 brkp->blkno = bp->b_blkno; 240 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*brkp)) 241 bp->b_private = brkp; 242 } else { 243 bp->b_private = NULL; 244 } 245 246 /* Enqueue buf */ 247 if (xap->xa_headp == NULL) { 248 xap->xa_headp = xap->xa_tailp = bp; 249 } else { 250 xap->xa_tailp->av_forw = bp; 251 xap->xa_tailp = bp; 252 } 253 bp->av_forw = NULL; 254 255 xap->xa_pending++; 256 mutex_exit(&xap->xa_mutex); 257 return (xbuf_iostart(xap)); 258 } 259 260 261 /* 262 * Drivers call this immediately before calling biodone(9F), to notify the 263 * framework that the indicated xbuf is no longer being used by the driver. 264 * May be called under interrupt context. 265 */ 266 267 DDII int 268 ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap) 269 { 270 ddi_xbuf_t xp; 271 int done; 272 273 ASSERT(bp != NULL); 274 ASSERT(xap != NULL); 275 ASSERT(!mutex_owned(&xap->xa_mutex)); 276 ASSERT(!mutex_owned(&xap->xa_reserve_mutex)); 277 278 xp = ddi_xbuf_get(bp, xap); 279 280 mutex_enter(&xap->xa_mutex); 281 282 #ifdef SDDEBUG 283 if (xap->xa_active_limit != 0) { 284 ASSERT(xap->xa_active_count > 0); 285 } 286 #endif 287 xap->xa_active_count--; 288 289 if (xap->xa_reserve_limit != 0) { 290 mutex_enter(&xap->xa_reserve_mutex); 291 if (xap->xa_reserve_count < xap->xa_reserve_limit) { 292 /* Put this xbuf onto the reserve list & exit */ 293 *((void **)xp) = xap->xa_reserve_headp; 294 xap->xa_reserve_headp = xp; 295 xap->xa_reserve_count++; 296 mutex_exit(&xap->xa_reserve_mutex); 297 goto done; 298 } 299 mutex_exit(&xap->xa_reserve_mutex); 300 } 301 302 kmem_free(xp, xap->xa_allocsize); /* return it to the system */ 303 304 done: 305 if (bp->b_iodone == xbuf_brk_done) { 306 struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private; 307 308 brkp->active--; 309 if (brkp->active || xap->xa_headp == brkp->bp0) { 310 done = 0; 311 } else { 312 brkp->off = -1; /* mark bp0 as completed */ 313 done = 1; 314 } 315 } else { 316 done = 1; 317 } 318 319 if ((xap->xa_active_limit == 0) || 320 (xap->xa_active_count <= xap->xa_active_lowater)) { 321 xbuf_dispatch(xap); 322 } 323 324 mutex_exit(&xap->xa_mutex); 325 return (done); 326 } 327 328 static int 329 xbuf_brk_done(struct buf *bp) 330 { 331 struct xbuf_brk *brkp = (struct xbuf_brk *)bp->b_clone_private; 332 struct buf *bp0 = brkp->bp0; 333 int done; 334 335 mutex_enter(&brkp->mutex); 336 if (bp->b_flags & B_ERROR && !(bp0->b_flags & B_ERROR)) { 337 bp0->b_flags |= B_ERROR; 338 bp0->b_error = bp->b_error; 339 } 340 if (bp->b_resid) 341 bp0->b_resid = bp0->b_bcount; 342 343 freerbuf(bp); 344 brkp->nbufs--; 345 346 done = (brkp->off == -1 && brkp->nbufs == 0); 347 mutex_exit(&brkp->mutex); 348 349 /* All buf segments done */ 350 if (done) { 351 mutex_destroy(&brkp->mutex); 352 kmem_free(brkp, sizeof (struct xbuf_brk)); 353 biodone(bp0); 354 } 355 return (0); 356 } 357 358 DDII void 359 ddi_xbuf_dispatch(ddi_xbuf_attr_t xap) 360 { 361 mutex_enter(&xap->xa_mutex); 362 if ((xap->xa_active_limit == 0) || 363 (xap->xa_active_count <= xap->xa_active_lowater)) { 364 xbuf_dispatch(xap); 365 } 366 mutex_exit(&xap->xa_mutex); 367 } 368 369 370 /* 371 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get() 372 * unless we explicitly hide the xbuf pointer somewhere in the buf 373 * during allocation, and then rely on the driver never changing it. 374 * We can probably get away with using b_private for this for now, 375 * tho it really is kinda gnarly..... 376 */ 377 378 /* ARGSUSED */ 379 DDII ddi_xbuf_t 380 ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap) 381 { 382 return (bp->b_private); 383 } 384 385 386 /* 387 * Initiate IOs for bufs on the queue. Called from kernel thread or taskq 388 * thread context. May execute concurrently for the same ddi_xbuf_attr_t. 389 */ 390 391 static int 392 xbuf_iostart(ddi_xbuf_attr_t xap) 393 { 394 struct buf *bp; 395 ddi_xbuf_t xp; 396 397 ASSERT(xap != NULL); 398 ASSERT(!mutex_owned(&xap->xa_mutex)); 399 ASSERT(!mutex_owned(&xap->xa_reserve_mutex)); 400 401 /* 402 * For each request on the queue, attempt to allocate the specified 403 * xbuf extension area, and call the driver's iostart() routine. 404 * We process as many requests on the queue as we can, until either 405 * (1) we run out of requests; or 406 * (2) we run out of resources; or 407 * (3) we reach the maximum limit for the given ddi_xbuf_attr_t. 408 */ 409 for (;;) { 410 mutex_enter(&xap->xa_mutex); 411 412 if ((bp = xap->xa_headp) == NULL) { 413 break; /* queue empty */ 414 } 415 416 if ((xap->xa_active_limit != 0) && 417 (xap->xa_active_count >= xap->xa_active_limit)) { 418 break; /* allocation limit reached */ 419 } 420 421 /* 422 * If the reserve_limit is non-zero then work with the 423 * reserve else always allocate a new struct. 424 */ 425 if (xap->xa_reserve_limit != 0) { 426 /* 427 * Don't penalize EVERY I/O by always allocating a new 428 * struct. for the sake of maintaining and not touching 429 * a reserve for a pathalogical condition that may never 430 * happen. Use the reserve entries first, this uses it 431 * like a local pool rather than a reserve that goes 432 * untouched. Make sure it's re-populated whenever it 433 * gets fully depleted just in case it really is needed. 434 * This is safe because under the pathalogical 435 * condition, when the system runs out of memory such 436 * that the below allocs fail, the reserve will still 437 * be available whether the entries are saved away on 438 * the queue unused or in-transport somewhere. Thus 439 * progress can still continue, however slowly. 440 */ 441 mutex_enter(&xap->xa_reserve_mutex); 442 if (xap->xa_reserve_count != 0) { 443 ASSERT(xap->xa_reserve_headp != NULL); 444 /* Grab an xbuf from the reserve */ 445 xp = xap->xa_reserve_headp; 446 xap->xa_reserve_headp = *((void **)xp); 447 ASSERT(xap->xa_reserve_count > 0); 448 xap->xa_reserve_count--; 449 } else { 450 /* 451 * Either this is the first time through, 452 * or the reserve has been totally depleted. 453 * Re-populate the reserve (pool). Excess 454 * structs. get released in the done path. 455 */ 456 while (xap->xa_reserve_count < 457 xap->xa_reserve_limit) { 458 xp = kmem_alloc(xap->xa_allocsize, 459 KM_NOSLEEP); 460 if (xp == NULL) { 461 break; 462 } 463 *((void **)xp) = xap->xa_reserve_headp; 464 xap->xa_reserve_headp = xp; 465 xap->xa_reserve_count++; 466 } 467 /* And one more to use right now. */ 468 xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP); 469 } 470 mutex_exit(&xap->xa_reserve_mutex); 471 } else { 472 /* 473 * Try to alloc a new xbuf struct. If this fails just 474 * exit for now. We'll get back here again either upon 475 * cmd completion or via the timer handler. 476 * Question: what if the allocation attempt for the very 477 * first cmd. fails? There are no outstanding cmds so 478 * how do we get back here? 479 * Should look at un_ncmds_in_transport, if it's zero 480 * then schedule xbuf_restart_callback via the timer. 481 * Athough that breaks the architecture by bringing 482 * softstate data into this code. 483 */ 484 xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP); 485 } 486 if (xp == NULL) { 487 break; /* Can't process a cmd. right now. */ 488 } 489 490 /* 491 * Always run the counter. It's used/needed when xa_active_limit 492 * is non-zero which is the typical (and right now only) case. 493 */ 494 xap->xa_active_count++; 495 496 if (bp->b_private) { 497 struct xbuf_brk *brkp = bp->b_private; 498 struct buf *bp0 = bp; 499 500 brkp->active++; 501 502 mutex_enter(&brkp->mutex); 503 brkp->nbufs++; 504 mutex_exit(&brkp->mutex); 505 506 if (brkp->noff < bp0->b_bcount) { 507 bp = bioclone(bp0, brkp->off, brkp->brksize, 508 bp0->b_edev, brkp->blkno, xbuf_brk_done, 509 NULL, KM_SLEEP); 510 511 /* update xfer position */ 512 brkp->off = brkp->noff; 513 brkp->noff += brkp->brksize; 514 brkp->blkno += brkp->brkblk; 515 } else { 516 bp = bioclone(bp0, brkp->off, 517 bp0->b_bcount - brkp->off, bp0->b_edev, 518 brkp->blkno, xbuf_brk_done, NULL, KM_SLEEP); 519 520 /* unlink the buf from the list */ 521 xap->xa_headp = bp0->av_forw; 522 bp0->av_forw = NULL; 523 } 524 bp->b_clone_private = (struct buf *)brkp; 525 } else { 526 /* unlink the buf from the list */ 527 xap->xa_headp = bp->av_forw; 528 bp->av_forw = NULL; 529 } 530 531 /* 532 * Hack needed in the prototype so ddi_xbuf_get() will work. 533 * Here we can rely on the sd code not changing the value in 534 * b_private (in fact it wants it there). See ddi_get_xbuf() 535 */ 536 bp->b_private = xp; 537 538 /* call the driver's iostart routine */ 539 mutex_exit(&xap->xa_mutex); 540 (*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg); 541 } 542 543 ASSERT(xap->xa_pending > 0); 544 xap->xa_pending--; 545 mutex_exit(&xap->xa_mutex); 546 return (0); 547 } 548 549 550 /* 551 * Re-start IO processing if there is anything on the queue, AND if the 552 * restart function is not already running/pending for this ddi_xbuf_attr_t 553 */ 554 static void 555 xbuf_dispatch(ddi_xbuf_attr_t xap) 556 { 557 ASSERT(xap != NULL); 558 ASSERT(xap->xa_tq != NULL); 559 ASSERT(mutex_owned(&xap->xa_mutex)); 560 561 if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) && 562 (xap->xa_pending == 0)) { 563 /* 564 * First try to see if we can dispatch the restart function 565 * immediately, in a taskq thread. If this fails, then 566 * schedule a timeout(9F) callback to try again later. 567 */ 568 if (taskq_dispatch(xap->xa_tq, 569 (void (*)(void *)) xbuf_iostart, xap, KM_NOSLEEP) == 570 TASKQID_INVALID) { 571 /* 572 * Unable to enqueue the request for the taskq thread, 573 * try again later. Note that this will keep re-trying 574 * until taskq_dispatch() succeeds. 575 */ 576 xap->xa_timeid = timeout(xbuf_restart_callback, xap, 577 XBUF_DISPATCH_DELAY); 578 } else { 579 /* 580 * This indicates that xbuf_iostart() will soon be 581 * run for this ddi_xbuf_attr_t, and we do not need to 582 * schedule another invocation via timeout/taskq 583 */ 584 xap->xa_pending++; 585 } 586 } 587 } 588 589 /* timeout(9F) callback routine for xbuf restart mechanism. */ 590 static void 591 xbuf_restart_callback(void *arg) 592 { 593 ddi_xbuf_attr_t xap = arg; 594 595 ASSERT(xap != NULL); 596 ASSERT(xap->xa_tq != NULL); 597 ASSERT(!mutex_owned(&xap->xa_mutex)); 598 599 mutex_enter(&xap->xa_mutex); 600 xap->xa_timeid = NULL; 601 xbuf_dispatch(xap); 602 mutex_exit(&xap->xa_mutex); 603 } 604 605 606 DDII void 607 ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *)) 608 { 609 struct buf *bp; 610 struct buf *next_bp; 611 struct buf *prev_bp = NULL; 612 613 ASSERT(xap != NULL); 614 ASSERT(xap->xa_tq != NULL); 615 ASSERT(!mutex_owned(&xap->xa_mutex)); 616 617 mutex_enter(&xap->xa_mutex); 618 619 for (bp = xap->xa_headp; bp != NULL; bp = next_bp) { 620 621 next_bp = bp->av_forw; /* Save for next iteration */ 622 623 /* 624 * If the user-supplied function is non-NULL and returns 625 * FALSE, then just leave the current bp on the queue. 626 */ 627 if ((funcp != NULL) && (!(*funcp)(bp))) { 628 prev_bp = bp; 629 continue; 630 } 631 632 /* de-queue the bp */ 633 if (bp == xap->xa_headp) { 634 xap->xa_headp = next_bp; 635 if (xap->xa_headp == NULL) { 636 xap->xa_tailp = NULL; 637 } 638 } else { 639 ASSERT(xap->xa_headp != NULL); 640 ASSERT(prev_bp != NULL); 641 if (bp == xap->xa_tailp) { 642 ASSERT(next_bp == NULL); 643 xap->xa_tailp = prev_bp; 644 } 645 prev_bp->av_forw = next_bp; 646 } 647 bp->av_forw = NULL; 648 649 /* Add the bp to the flush queue */ 650 if (xap->xa_flush_headp == NULL) { 651 ASSERT(xap->xa_flush_tailp == NULL); 652 xap->xa_flush_headp = xap->xa_flush_tailp = bp; 653 } else { 654 ASSERT(xap->xa_flush_tailp != NULL); 655 xap->xa_flush_tailp->av_forw = bp; 656 xap->xa_flush_tailp = bp; 657 } 658 } 659 660 while ((bp = xap->xa_flush_headp) != NULL) { 661 xap->xa_flush_headp = bp->av_forw; 662 if (xap->xa_flush_headp == NULL) { 663 xap->xa_flush_tailp = NULL; 664 } 665 mutex_exit(&xap->xa_mutex); 666 bioerror(bp, EIO); 667 bp->b_resid = bp->b_bcount; 668 biodone(bp); 669 mutex_enter(&xap->xa_mutex); 670 } 671 672 mutex_exit(&xap->xa_mutex); 673 } 674