/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		struct request_queue *q = hctx->queue;

		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			atomic_inc(&q->shared_hctx_restart);
	} else
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return false;

	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		struct request_queue *q = hctx->queue;

		if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			atomic_dec(&q->shared_hctx_restart);
	} else
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	return blk_mq_run_hw_queue(hctx, true);
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (e->type->ops.mq.has_work &&
				!e->type->ops.mq.has_work(hctx))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
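		 * The request is added to a single-entry list here because
		 * blk_mq_dispatch_rq_list() operates on request lists.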
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}

static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned idx = ctx->index_hw;

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	do {
		struct request *rq;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	WRITE_ONCE(hctx->dispatch_from, ctx);
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for fairer dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (q->mq_ops->get_budget) {
		/*
		 * If we need to get a budget before queueing a request,
		 * dequeue requests one by one from the sw queue to avoid
		 * messing up I/O merging when dispatch runs out of resources.
		 *
		 * TODO: get more budgets, and dequeue more requests at
		 * a time.
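		 * (Right now blk_mq_do_dispatch_ctx() takes one budget and
		 * dequeues a single request per loop iteration.)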
		 */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	lockdep_assert_held(&ctx->lock);

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		if (merged)
			ctx->rq_merged++;
		return merged;
	}

	return false;
}

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.mq.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/* dispatch flush rq directly */
	if (rq->rq_flags & RQF_FLUSH_SEQ) {
		spin_lock(&hctx->lock);
		list_add(&rq->queuelist, &hctx->dispatch);
		spin_unlock(&hctx->lock);
		return true;
	}

	if (has_sched)
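		/* RQF_SORTED: the elevator knows about this request */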
		rq->rq_flags |= RQF_SORTED;

	return false;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
	for ((pos) = (skip);						\
	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
			(pos)->member.next, typeof(*pos), member) :	\
			list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
	     (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
	struct blk_mq_tags *const tags = hctx->tags;
	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
	struct request_queue *const queue = hctx->queue, *q;
	struct blk_mq_hw_ctx *hctx2;
	unsigned int i, j;

	if (set->flags & BLK_MQ_F_TAG_SHARED) {
		/*
		 * If this is 0, then we know that no hardware queues
		 * have RESTART marked. We're done.
		 */
		if (!atomic_read(&queue->shared_hctx_restart))
			return;

		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			queue_for_each_hw_ctx(q, hctx2, i)
				if (hctx2->tags == tags &&
				    blk_mq_sched_restart_hctx(hctx2))
					goto done;
		}
		j = hctx->queue_num + 1;
		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
			if (j == queue->nr_hw_queues)
				j = 0;
			hctx2 = queue->queue_hw_ctx[j];
			if (hctx2->tags == tags &&
			    blk_mq_sched_restart_hctx(hctx2))
				break;
		}
done:
		rcu_read_unlock();
	} else {
		blk_mq_sched_restart_hctx(hctx);
	}
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	/* a flush rq in the flush machinery needs to be dispatched directly */
	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
		blk_insert_flush(rq);
		goto run;
	}

	WARN_ON(e && (rq->tag != -1));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else
		blk_mq_insert_requests(hctx, ctx, list);

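	/* kick the hardware queue so the newly inserted requests get dispatched */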
	blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;
	int ret;

	if (!e)
		return 0;

	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (ret)
		return ret;

	if (e->type->ops.mq.init_hctx) {
		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (ret) {
			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
			return ret;
		}
	}

	blk_mq_debugfs_register_sched_hctx(q, hctx);

	return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;

	if (!e)
		return;

	blk_mq_debugfs_unregister_sched_hctx(hctx);

	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
		hctx->sched_data = NULL;
	}

	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to twice the smaller of the hardware queue depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
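	 * For example, a hardware queue depth of 32 gives nr_requests = 64,
	 * while depths of BLKDEV_MAX_RQ (128) or more are capped at 256.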
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.mq.init_hctx) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
			e->type->ops.mq.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}

int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}