// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if the first allocation attempt fails, the other shared-tag users will
 * still reserve budget for it.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (blk_mq_is_sbitmap_shared(hctx->flags)) {
                struct request_queue *q = hctx->queue;
                struct blk_mq_tag_set *set = q->tag_set;

                if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
                    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        atomic_inc(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
                    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        atomic_inc(&hctx->tags->active_queues);
        }

        return true;
}

/*
 * Wake up all potential waiters sleeping on tags.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(tags->bitmap_tags);
        if (include_reserve)
                sbitmap_queue_wake_all(tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->tags;
        struct request_queue *q = hctx->queue;
        struct blk_mq_tag_set *set = q->tag_set;

        if (blk_mq_is_sbitmap_shared(hctx->flags)) {
                if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                        &q->queue_flags))
                        return;
                atomic_dec(&set->active_queues_shared_sbitmap);
        } else {
                if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return;
                atomic_dec(&tags->active_queues);
        }

        blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
            !hctx_may_queue(data->hctx, bt))
                return BLK_MQ_NO_TAG;

        if (data->shallow_depth)
                return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
                return __sbitmap_queue_get(bt);
}

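/*
 * blk_mq_get_tag - allocate a tag for @data on its hardware queue
 *
 * If no tag is immediately available and BLK_MQ_REQ_NOWAIT is not set, the
 * caller kicks the hardware queue and sleeps until a tag is freed. Returns
 * a tag in the combined tag space (reserved tags first, regular tags offset
 * by the reserved count), or BLK_MQ_NO_TAG if allocation is not possible.
 */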
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_SBQ_WAIT(wait);
        unsigned int tag_offset;
        int tag;

        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_NO_TAG;
                }
                bt = tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }

        tag = __blk_mq_get_tag(data, bt);
        if (tag != BLK_MQ_NO_TAG)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_NO_TAG;

        ws = bt_wait_ptr(bt, data->hctx);
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                bt_prev = bt;
                io_schedule();

                sbitmap_finish_wait(bt, ws, &wait);

                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
                                              data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = tags->breserved_tags;
                else
                        bt = tags->bitmap_tags;

                /*
                 * If the destination hw queue has changed, issue a fake
                 * wakeup on the previous queue to compensate for the missed
                 * wakeup, so other allocations on the previous queue won't
                 * be starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        sbitmap_finish_wait(bt, ws, &wait);

found_tag:
        /*
         * Give up this allocation if the hctx is inactive. The caller will
         * retry on an active hctx.
         */
        if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
                blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
                return BLK_MQ_NO_TAG;
        }
        return tag + tag_offset;
}

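/*
 * blk_mq_put_tag - release a tag previously allocated by blk_mq_get_tag
 *
 * @tag is in the combined tag space: reserved tags come first and regular
 * tags are offset by tags->nr_reserved_tags.
 */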
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                    unsigned int tag)
{
        if (!blk_mq_tag_is_reserved(tags, tag)) {
                const int real_tag = tag - tags->nr_reserved_tags;

                BUG_ON(real_tag >= tags->nr_tags);
                sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
                sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
        }
}

struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
        void *data;
        bool reserved;
};

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct blk_mq_tags *tags = hctx->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;
        rq = tags->rqs[bitnr];

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
                return iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:	Hardware queue to examine.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each request
 *		associated with @hctx that has been assigned a driver tag.
 *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *		where rq is a pointer to a request. Return true to continue
 *		iterating tags, false to stop.
 * @data:	Will be passed as third argument to @fn.
 * @reserved:	Indicates whether @bt is the breserved_tags member or the
 *		bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
{
        struct bt_iter_data iter_data = {
                .hctx = hctx,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
        struct blk_mq_tags *tags;
        busy_tag_iter_fn *fn;
        void *data;
        unsigned int flags;
};

#define BT_TAG_ITER_RESERVED		(1 << 0)
#define BT_TAG_ITER_STARTED		(1 << 1)
#define BT_TAG_ITER_STATIC_RQS		(1 << 2)

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *iter_data = data;
        struct blk_mq_tags *tags = iter_data->tags;
        bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
                rq = tags->static_rqs[bitnr];
        else
                rq = tags->rqs[bitnr];
        if (!rq)
                return true;
        if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
            !blk_mq_request_started(rq))
                return true;
        return iter_data->fn(rq, iter_data->data, reserved);
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:	Tag map to iterate over.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @data,
 *		@reserved) where rq is a pointer to a request. Return true
 *		to continue iterating tags, false to stop.
 * @data:	Will be passed as second argument to @fn.
 * @flags:	BT_TAG_ITER_*
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
        struct bt_tags_iter_data iter_data = {
                .tags = tags,
                .fn = fn,
                .data = data,
                .flags = flags,
        };

        if (tags->rqs)
                sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

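/*
 * Iterate over the requests in both the reserved and the regular bitmaps of
 * @tags. Used by blk_mq_all_tag_iter() and blk_mq_tagset_busy_iter().
 */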
static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
                                  busy_tag_iter_fn *fn, void *priv,
                                  unsigned int flags)
{
        WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

        if (tags->nr_reserved_tags)
                bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
                                 flags | BT_TAG_ITER_RESERVED);
        bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:	Tag map to iterate over.
 * @fn:		Pointer to the function that will be called for each
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * Caller has to pass the tag map from which requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                void *priv)
{
        __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:	Tag set to iterate over.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
{
        int i;

        for (i = 0; i < tagset->nr_hw_queues; i++) {
                if (tagset->tags && tagset->tags[i])
                        __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
                                              BT_TAG_ITER_STARTED);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
                void *data, bool reserved)
{
        unsigned *count = data;

        if (blk_mq_request_completed(rq))
                (*count)++;
        return true;
}

/**
 * blk_mq_tagset_wait_completed_request - wait until the completion function
 * of every completed request has run
 * @tagset:	Tag set to drain completed requests from
 *
 * Note: This function has to be run after all IO queues are shutdown
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
        while (true) {
                unsigned count = 0;

                blk_mq_tagset_busy_iter(tagset,
                                blk_mq_tagset_count_completed_rqs, &count);
                if (!count)
                        break;
                msleep(5);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:		Request queue to examine.
 * @fn:		Pointer to the function that will be called for each request
 *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *		reserved) where rq is a pointer to a request and hctx points
 *		to the hardware queue associated with the request. 'reserved'
 *		indicates whether or not @rq is a reserved request.
 * @priv:	Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
         * while the queue is frozen. So we can use q_usage_counter to avoid
         * racing with it.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        queue_for_each_hw_ctx(q, hctx, i) {
                struct blk_mq_tags *tags = hctx->tags;

                /*
                 * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check.
                 */
                if (!blk_mq_hw_queue_mapped(hctx))
                        continue;

                if (tags->nr_reserved_tags)
                        bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
                bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
                                       node);
}

static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                                   int node, int alloc_policy)
{
        unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
        bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

        if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
                return -ENOMEM;
        if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
                     round_robin, node))
                goto free_bitmap_tags;

        tags->bitmap_tags = &tags->__bitmap_tags;
        tags->breserved_tags = &tags->__breserved_tags;

        return 0;
free_bitmap_tags:
        sbitmap_queue_free(&tags->__bitmap_tags);
        return -ENOMEM;
}

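/*
 * Set up the tag set wide bitmaps that are shared by all hardware queues
 * when BLK_MQ_F_TAG_HCTX_SHARED is set. Each hctx's tags then point at the
 * shared bitmaps instead of per-hctx ones.
 */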
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
{
        unsigned int depth = set->queue_depth - set->reserved_tags;
        int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
        bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
        int i, node = set->numa_node;

        if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
                return -ENOMEM;
        if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
                     round_robin, node))
                goto free_bitmap_tags;

        for (i = 0; i < set->nr_hw_queues; i++) {
                struct blk_mq_tags *tags = set->tags[i];

                tags->bitmap_tags = &set->__bitmap_tags;
                tags->breserved_tags = &set->__breserved_tags;
        }

        return 0;
free_bitmap_tags:
        sbitmap_queue_free(&set->__bitmap_tags);
        return -ENOMEM;
}

void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
        sbitmap_queue_free(&set->__bitmap_tags);
        sbitmap_queue_free(&set->__breserved_tags);
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, unsigned int flags)
{
        int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
        struct blk_mq_tags *tags;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        tags->nr_tags = total_tags;
        tags->nr_reserved_tags = reserved_tags;

        if (flags & BLK_MQ_F_TAG_HCTX_SHARED)
                return tags;

        if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
                kfree(tags);
                return NULL;
        }
        return tags;
}

void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
        if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
                sbitmap_queue_free(tags->bitmap_tags);
                sbitmap_queue_free(tags->breserved_tags);
        }
        kfree(tags);
}

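/*
 * blk_mq_tag_update_depth - update the queue depth of a tag map
 *
 * Shrinking below the reserved tag count is rejected. Growing beyond the
 * current nr_tags is only allowed when @can_grow is set (scheduler tags),
 * in which case a new rq map is allocated and the old one freed; otherwise
 * only the regular bitmap is resized.
 */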
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
         */
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                /* Only sched tags can grow, so clear HCTX_SHARED flag */
                unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
                struct blk_mq_tags *new;
                bool ret;

                if (!can_grow)
                        return -EINVAL;

                /*
                 * We need some sort of upper limit, set it high enough that
                 * no valid use cases should require more.
                 */
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;

                new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
                                          tags->nr_reserved_tags, flags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                if (ret) {
                        blk_mq_free_rq_map(new, flags);
                        return -ENOMEM;
                }

                blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
                blk_mq_free_rq_map(*tagsptr, flags);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(tags->bitmap_tags,
                                     tdepth - tags->nr_reserved_tags);
        }

        return 0;
}

void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
        sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq:		request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function that returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
                (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
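
/*
 * Example (sketch): a driver that stored the value returned by
 * blk_mq_unique_tag() can recover the hardware queue index and the
 * per-queue tag with the helpers declared in include/linux/blk-mq.h:
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);
 */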