/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/scsi/scsi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/var.h>

#include "sd_xbuf.h"

/*
 * xbuf.c: buf(9S) extension facility.
 *
 * The buf(9S) extension facility is intended to allow block drivers to
 * allocate additional memory that is associated with a particular buf(9S)
 * struct.  It is further intended to help in addressing the usual set of
 * problems associated with such allocations, in particular those involving
 * recovery from allocation failures, especially in code paths that the
 * system relies on to free memory.
 *
 * CAVEAT: Currently this code is completely private to the sd driver and in
 * NO WAY constitutes a public or supported interface of any kind.  It is
 * envisioned that this may one day migrate into the Solaris DDI, but until
 * that time this ought to be considered completely unstable and is subject
 * to change without notice.  This code may NOT in any way be utilized by
 * ANY code outside the sd driver.
 */


static int xbuf_iostart(ddi_xbuf_attr_t xap);
static void xbuf_dispatch(ddi_xbuf_attr_t xap);
static void xbuf_restart_callback(void *arg);


/*
 * Note: Should this be exposed to the caller, i.e. do we want to give the
 * caller the flexibility of specifying the parameters for the thread pool?
 * Note: these values are just estimates at this time, based upon what
 * seems reasonable for the sd driver.  It may be preferable to make these
 * parameters self-scaling in a real (future) implementation.
 */
#define XBUF_TQ_MINALLOC        64
#define XBUF_TQ_MAXALLOC        512
#define XBUF_DISPATCH_DELAY     (drv_usectohz(50000))   /* 50 msec */

static taskq_t *xbuf_tq = NULL;
static int xbuf_attr_tq_minalloc = XBUF_TQ_MINALLOC;
static int xbuf_attr_tq_maxalloc = XBUF_TQ_MAXALLOC;

static kmutex_t xbuf_mutex = { 0 };
static uint32_t xbuf_refcount = 0;
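
/*
 * Illustrative sketch only: one plausible way a client driver could hook
 * into this facility at attach(9E) time.  None of the xx_* names, the
 * xx_xbuf_private_t type, or the limit values below exist in this file or
 * in the sd driver; they are hypothetical and the block is never compiled.
 */
#if 0
typedef struct xx_xbuf_private {
        int     xx_retry_count;         /* per-command scratch state */
} xx_xbuf_private_t;

static ddi_xbuf_attr_t xx_xattr;

static void
xx_xbuf_strategy(struct buf *bp, ddi_xbuf_t xp, void *attr_arg)
{
        /* Called back by xbuf_iostart() once an xbuf is bound to bp */
        ((xx_xbuf_private_t *)xp)->xx_retry_count = 0;
        /* ... build and issue the command described by bp ... */
}

static int
xx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        /* Allow up to 256 active commands; keep a reserve of 16 xbufs */
        xx_xattr = ddi_xbuf_attr_create(sizeof (xx_xbuf_private_t),
            xx_xbuf_strategy, NULL, 256, 16, ddi_driver_major(dip), 0);
        ddi_xbuf_attr_register_devinfo(xx_xattr, dip);
        return (DDI_SUCCESS);
}
#endif
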
/* ARGSUSED */
DDII ddi_xbuf_attr_t
ddi_xbuf_attr_create(size_t xsize,
    void (*xa_strategy)(struct buf *bp, ddi_xbuf_t xp, void *attr_arg),
    void *attr_arg, uint32_t active_limit, uint32_t reserve_limit,
    major_t major, int flags)
{
        ddi_xbuf_attr_t xap;

        xap = kmem_zalloc(sizeof (struct __ddi_xbuf_attr), KM_SLEEP);

        mutex_init(&xap->xa_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&xap->xa_reserve_mutex, NULL, MUTEX_DRIVER, NULL);

        /* Future: Allow the caller to specify alignment requirements? */
        xap->xa_allocsize = max(xsize, sizeof (void *));
        xap->xa_active_limit = active_limit;
        xap->xa_active_lowater = xap->xa_active_limit / 2;
        xap->xa_reserve_limit = reserve_limit;
        xap->xa_strategy = xa_strategy;
        xap->xa_attr_arg = attr_arg;

        mutex_enter(&xbuf_mutex);
        if (xbuf_refcount == 0) {
                ASSERT(xbuf_tq == NULL);
                /*
                 * Note: Would be nice if: (1) #threads in the taskq pool (set
                 * to the value of 'ncpus' at the time the taskq is created)
                 * could adjust automatically with DR; (2) the taskq
                 * minalloc/maxalloc counts could be grown/shrunk on the fly.
                 */
                xbuf_tq = taskq_create("xbuf_taskq", ncpus,
                    (v.v_maxsyspri - 2), xbuf_attr_tq_minalloc,
                    xbuf_attr_tq_maxalloc, TASKQ_PREPOPULATE);
        }
        xbuf_refcount++;
        mutex_exit(&xbuf_mutex);

        /* In this prototype we just always use the global system pool. */
        xap->xa_tq = xbuf_tq;

        return (xap);
}


DDII void
ddi_xbuf_attr_destroy(ddi_xbuf_attr_t xap)
{
        ddi_xbuf_t xp;

        mutex_destroy(&xap->xa_mutex);
        mutex_destroy(&xap->xa_reserve_mutex);

        /* Free any xbufs on the reserve list */
        while (xap->xa_reserve_count != 0) {
                xp = xap->xa_reserve_headp;
                xap->xa_reserve_headp = *((void **)xp);
                xap->xa_reserve_count--;
                kmem_free(xp, xap->xa_allocsize);
        }
        ASSERT(xap->xa_reserve_headp == NULL);

        mutex_enter(&xbuf_mutex);
        ASSERT((xbuf_refcount != 0) && (xbuf_tq != NULL));
        xbuf_refcount--;
        if (xbuf_refcount == 0) {
                taskq_destroy(xbuf_tq);
                xbuf_tq = NULL;
        }
        mutex_exit(&xbuf_mutex);

        kmem_free(xap, sizeof (struct __ddi_xbuf_attr));
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_register_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
        /* Currently a no-op in this prototype */
}


/* ARGSUSED */
DDII void
ddi_xbuf_attr_unregister_devinfo(ddi_xbuf_attr_t xbuf_attr, dev_info_t *dip)
{
        /* Currently a no-op in this prototype */
}


/*
 * Enqueue the given buf and attempt to initiate IO.
 * Called from the driver strategy(9E) routine.
 */

DDII int
ddi_xbuf_qstrategy(struct buf *bp, ddi_xbuf_attr_t xap)
{
        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        mutex_enter(&xap->xa_mutex);

        if (xap->xa_headp == NULL) {
                xap->xa_headp = xap->xa_tailp = bp;
        } else {
                xap->xa_tailp->av_forw = bp;
                xap->xa_tailp = bp;
        }
        bp->av_forw = NULL;

        xap->xa_pending++;
        mutex_exit(&xap->xa_mutex);
        return (xbuf_iostart(xap));
}
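
/*
 * Illustrative sketch only (hypothetical xx_* names from the sketch above):
 * a client driver's strategy(9E) entry point would simply queue the buf
 * here; the real per-command work happens later in the driver's
 * xa_strategy callback, once an xbuf extension has been allocated.
 */
#if 0
static int
xx_strategy(struct buf *bp)
{
        return (ddi_xbuf_qstrategy(bp, xx_xattr));
}
#endif
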
/*
 * Drivers call this immediately before calling biodone(9F), to notify the
 * framework that the indicated xbuf is no longer being used by the driver.
 * May be called under interrupt context.
 */

DDII void
ddi_xbuf_done(struct buf *bp, ddi_xbuf_attr_t xap)
{
        ddi_xbuf_t xp;

        ASSERT(bp != NULL);
        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        xp = ddi_xbuf_get(bp, xap);

        mutex_enter(&xap->xa_mutex);

#ifdef SDDEBUG
        if (xap->xa_active_limit != 0) {
                ASSERT(xap->xa_active_count > 0);
        }
#endif
        xap->xa_active_count--;

        if (xap->xa_reserve_limit != 0) {
                mutex_enter(&xap->xa_reserve_mutex);
                if (xap->xa_reserve_count < xap->xa_reserve_limit) {
                        /* Put this xbuf onto the reserve list & exit */
                        *((void **)xp) = xap->xa_reserve_headp;
                        xap->xa_reserve_headp = xp;
                        xap->xa_reserve_count++;
                        mutex_exit(&xap->xa_reserve_mutex);
                        goto done;
                }
                mutex_exit(&xap->xa_reserve_mutex);
        }

        kmem_free(xp, xap->xa_allocsize);       /* return it to the system */

done:
        if ((xap->xa_active_limit == 0) ||
            (xap->xa_active_count <= xap->xa_active_lowater)) {
                xbuf_dispatch(xap);
        }

        mutex_exit(&xap->xa_mutex);
}

DDII void
ddi_xbuf_dispatch(ddi_xbuf_attr_t xap)
{
        mutex_enter(&xap->xa_mutex);
        if ((xap->xa_active_limit == 0) ||
            (xap->xa_active_count <= xap->xa_active_lowater)) {
                xbuf_dispatch(xap);
        }
        mutex_exit(&xap->xa_mutex);
}


/*
 * ISSUE: in this prototype we cannot really implement ddi_xbuf_get()
 * unless we explicitly hide the xbuf pointer somewhere in the buf
 * during allocation, and then rely on the driver never changing it.
 * We can probably get away with using b_private for this for now,
 * though it really is kinda gnarly.
 */

/* ARGSUSED */
DDII ddi_xbuf_t
ddi_xbuf_get(struct buf *bp, ddi_xbuf_attr_t xap)
{
        return (bp->b_private);
}
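
/*
 * Illustrative sketch only (hypothetical xx_* names and xx_xbuf_private_t
 * type from the earlier sketch): on the completion side the driver looks
 * up its extension with ddi_xbuf_get(), releases it with ddi_xbuf_done(),
 * and only then calls biodone(9F), per the comment above ddi_xbuf_done().
 */
#if 0
static void
xx_command_done(struct buf *bp)
{
        xx_xbuf_private_t *priv =
            (xx_xbuf_private_t *)ddi_xbuf_get(bp, xx_xattr);

        /* ... consult priv to decide whether to retry or complete ... */
        ddi_xbuf_done(bp, xx_xattr);    /* hand the xbuf back first */
        biodone(bp);                    /* then complete the buf */
}
#endif
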
/*
 * Initiate IOs for bufs on the queue.  Called from kernel thread or taskq
 * thread context.  May execute concurrently for the same ddi_xbuf_attr_t.
 */

static int
xbuf_iostart(ddi_xbuf_attr_t xap)
{
        struct buf *bp;
        ddi_xbuf_t xp;

        ASSERT(xap != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));
        ASSERT(!mutex_owned(&xap->xa_reserve_mutex));

        /*
         * For each request on the queue, attempt to allocate the specified
         * xbuf extension area, and call the driver's iostart() routine.
         * We process as many requests on the queue as we can, until either
         * (1) we run out of requests; or
         * (2) we run out of resources; or
         * (3) we reach the maximum limit for the given ddi_xbuf_attr_t.
         */
        for (;;) {
                mutex_enter(&xap->xa_mutex);

                if ((bp = xap->xa_headp) == NULL) {
                        break;  /* queue empty */
                }

                if ((xap->xa_active_limit != 0) &&
                    (xap->xa_active_count >= xap->xa_active_limit)) {
                        break;  /* allocation limit reached */
                }

                /*
                 * If the reserve_limit is non-zero then work with the
                 * reserve, else always allocate a new struct.
                 */
                if (xap->xa_reserve_limit != 0) {
                        /*
                         * Don't penalize EVERY I/O by always allocating a new
                         * struct, for the sake of maintaining and not touching
                         * a reserve for a pathological condition that may
                         * never happen.  Use the reserve entries first; this
                         * uses it like a local pool rather than a reserve that
                         * goes untouched.  Make sure it's re-populated
                         * whenever it gets fully depleted, just in case it
                         * really is needed.  This is safe because under the
                         * pathological condition, when the system runs out of
                         * memory such that the allocs below fail, the reserve
                         * will still be available whether the entries are
                         * saved away on the queue unused or in-transport
                         * somewhere.  Thus progress can still continue,
                         * however slowly.
                         */
                        mutex_enter(&xap->xa_reserve_mutex);
                        if (xap->xa_reserve_count != 0) {
                                ASSERT(xap->xa_reserve_headp != NULL);
                                /* Grab an xbuf from the reserve */
                                xp = xap->xa_reserve_headp;
                                xap->xa_reserve_headp = *((void **)xp);
                                ASSERT(xap->xa_reserve_count > 0);
                                xap->xa_reserve_count--;
                        } else {
                                /*
                                 * Either this is the first time through,
                                 * or the reserve has been totally depleted.
                                 * Re-populate the reserve (pool).  Excess
                                 * structs get released in the done path.
                                 */
                                while (xap->xa_reserve_count <
                                    xap->xa_reserve_limit) {
                                        xp = kmem_alloc(xap->xa_allocsize,
                                            KM_NOSLEEP);
                                        if (xp == NULL) {
                                                break;
                                        }
                                        *((void **)xp) =
                                            xap->xa_reserve_headp;
                                        xap->xa_reserve_headp = xp;
                                        xap->xa_reserve_count++;
                                }
                                /* And one more to use right now. */
                                xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
                        }
                        mutex_exit(&xap->xa_reserve_mutex);
                } else {
                        /*
                         * Try to alloc a new xbuf struct.  If this fails just
                         * exit for now.  We'll get back here again either upon
                         * cmd completion or via the timer handler.
                         * Question: what if the allocation attempt for the
                         * very first cmd fails?  There are no outstanding cmds
                         * so how do we get back here?
                         * Should look at un_ncmds_in_transport; if it's zero
                         * then schedule xbuf_restart_callback via the timer.
                         * Although that breaks the architecture by bringing
                         * softstate data into this code.
                         */
                        xp = kmem_alloc(xap->xa_allocsize, KM_NOSLEEP);
                }
                if (xp == NULL) {
                        break;  /* Can't process a cmd right now. */
                }

                /*
                 * Always run the counter.  It's used/needed when
                 * xa_active_limit is non-zero, which is the typical (and
                 * right now only) case.
                 */
                xap->xa_active_count++;

                /* unlink the buf from the list */
                xap->xa_headp = bp->av_forw;
                bp->av_forw = NULL;

                /*
                 * Hack needed in the prototype so ddi_xbuf_get() will work.
                 * Here we can rely on the sd code not changing the value in
                 * b_private (in fact it wants it there).  See ddi_xbuf_get().
                 */
                bp->b_private = xp;

                /* call the driver's iostart routine */
                mutex_exit(&xap->xa_mutex);
                (*(xap->xa_strategy))(bp, xp, xap->xa_attr_arg);
        }

        ASSERT(xap->xa_pending > 0);
        xap->xa_pending--;
        mutex_exit(&xap->xa_mutex);
        return (0);
}
/*
 * Re-start IO processing if there is anything on the queue, AND if the
 * restart function is not already running/pending for this ddi_xbuf_attr_t.
 */
static void
xbuf_dispatch(ddi_xbuf_attr_t xap)
{
        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(mutex_owned(&xap->xa_mutex));

        if ((xap->xa_headp != NULL) && (xap->xa_timeid == NULL) &&
            (xap->xa_pending == 0)) {
                /*
                 * First try to see if we can dispatch the restart function
                 * immediately, in a taskq thread.  If this fails, then
                 * schedule a timeout(9F) callback to try again later.
                 */
                if (taskq_dispatch(xap->xa_tq,
                    (void (*)(void *))xbuf_iostart, xap, KM_NOSLEEP) == 0) {
                        /*
                         * Unable to enqueue the request for the taskq thread;
                         * try again later.  Note that this will keep
                         * re-trying until taskq_dispatch() succeeds.
                         */
                        xap->xa_timeid = timeout(xbuf_restart_callback, xap,
                            XBUF_DISPATCH_DELAY);
                } else {
                        /*
                         * This indicates that xbuf_iostart() will soon be
                         * run for this ddi_xbuf_attr_t, and we do not need to
                         * schedule another invocation via timeout/taskq.
                         */
                        xap->xa_pending++;
                }
        }
}

/* timeout(9F) callback routine for xbuf restart mechanism. */
static void
xbuf_restart_callback(void *arg)
{
        ddi_xbuf_attr_t xap = arg;

        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));

        mutex_enter(&xap->xa_mutex);
        xap->xa_timeid = NULL;
        xbuf_dispatch(xap);
        mutex_exit(&xap->xa_mutex);
}


/*
 * Remove bufs from the wait queue and fail them.  If the user-supplied
 * funcp is non-NULL it is called for each queued buf, and a buf is only
 * removed (and then failed with EIO via biodone(9F)) if funcp returns
 * TRUE for it; a NULL funcp flushes every queued buf.
 */
DDII void
ddi_xbuf_flushq(ddi_xbuf_attr_t xap, int (*funcp)(struct buf *))
{
        struct buf *bp;
        struct buf *next_bp;
        struct buf *prev_bp = NULL;

        ASSERT(xap != NULL);
        ASSERT(xap->xa_tq != NULL);
        ASSERT(!mutex_owned(&xap->xa_mutex));

        mutex_enter(&xap->xa_mutex);

        for (bp = xap->xa_headp; bp != NULL; bp = next_bp) {

                next_bp = bp->av_forw;  /* Save for next iteration */

                /*
                 * If the user-supplied function is non-NULL and returns
                 * FALSE, then just leave the current bp on the queue.
                 */
                if ((funcp != NULL) && (!(*funcp)(bp))) {
                        prev_bp = bp;
                        continue;
                }

                /* de-queue the bp */
                if (bp == xap->xa_headp) {
                        xap->xa_headp = next_bp;
                        if (xap->xa_headp == NULL) {
                                xap->xa_tailp = NULL;
                        }
                } else {
                        ASSERT(xap->xa_headp != NULL);
                        ASSERT(prev_bp != NULL);
                        if (bp == xap->xa_tailp) {
                                ASSERT(next_bp == NULL);
                                xap->xa_tailp = prev_bp;
                        }
                        prev_bp->av_forw = next_bp;
                }
                bp->av_forw = NULL;

                /* Add the bp to the flush queue */
                if (xap->xa_flush_headp == NULL) {
                        ASSERT(xap->xa_flush_tailp == NULL);
                        xap->xa_flush_headp = xap->xa_flush_tailp = bp;
                } else {
                        ASSERT(xap->xa_flush_tailp != NULL);
                        xap->xa_flush_tailp->av_forw = bp;
                        xap->xa_flush_tailp = bp;
                }
        }

        while ((bp = xap->xa_flush_headp) != NULL) {
                xap->xa_flush_headp = bp->av_forw;
                if (xap->xa_flush_headp == NULL) {
                        xap->xa_flush_tailp = NULL;
                }
                mutex_exit(&xap->xa_mutex);
                bioerror(bp, EIO);
                bp->b_resid = bp->b_bcount;
                biodone(bp);
                mutex_enter(&xap->xa_mutex);
        }

        mutex_exit(&xap->xa_mutex);
}
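
/*
 * Illustrative sketch only (hypothetical xx_* names from the earlier
 * sketches): teardown at detach(9E) time.  Any bufs still waiting in the
 * framework are failed via ddi_xbuf_flushq() before the attribute handle
 * is destroyed.
 */
#if 0
static int
xx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        ddi_xbuf_flushq(xx_xattr, NULL);        /* fail anything still queued */
        ddi_xbuf_attr_unregister_devinfo(xx_xattr, dip);
        ddi_xbuf_attr_destroy(xx_xattr);
        return (DDI_SUCCESS);
}
#endif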