/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
static void blk_mq_run_queues(struct request_queue *q);

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->ctx_map.map_size; i++)
		if (hctx->ctx_map.map[i].word)
			return true;

	return false;
}

static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
					      struct blk_mq_ctx *ctx)
{
	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
}

#define CTX_TO_BIT(hctx, ctx)	\
	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static int blk_mq_queue_enter(struct request_queue *q)
{
	while (true) {
		int ret;

		if (percpu_ref_tryget_live(&q->mq_usage_counter))
			return 0;

		ret = wait_event_interruptible(q->mq_freeze_wq,
				!q->mq_freeze_depth || blk_queue_dying(q));
		if (blk_queue_dying(q))
			return -ENODEV;
		if (ret)
			return ret;
	}
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->mq_usage_counter);
}

static void blk_mq_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, mq_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

void blk_mq_freeze_queue_start(struct request_queue *q)
{
	bool freeze;

	spin_lock_irq(q->queue_lock);
	freeze = !q->mq_freeze_depth++;
	spin_unlock_irq(q->queue_lock);

	if (freeze) {
		percpu_ref_kill(&q->mq_usage_counter);
		blk_mq_run_queues(q);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);

static void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
	blk_mq_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake;

	spin_lock_irq(q->queue_lock);
	wake = !--q->mq_freeze_depth;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake) {
		percpu_ref_reinit(&q->mq_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
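/*
 * Illustrative sketch (not part of this file): a driver that needs to
 * change queue-wide data structures would typically bracket the update
 * with the freeze/unfreeze helpers above. The "mydrv_*" names are
 * hypothetical.
 *
 *	blk_mq_freeze_queue(q);		// wait until no request is in use
 *	mydrv_reconfigure(mydrv);	// safe to touch queue-wide state here
 *	blk_mq_unfreeze_queue(q);	// let new requests enter again
 */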
void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);

	/*
	 * If we are called because the queue has now been marked as
	 * dying, we need to ensure that processes currently waiting on
	 * the queue are notified as well.
	 */
	wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	INIT_LIST_HEAD(&rq->queuelist);
	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = q;
	rq->mq_ctx = ctx;
	rq->cmd_flags |= rw_flags;
	/* do not touch atomic flags, it needs atomic ops against the timer */
	rq->cpu = -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
	rq->start_time = jiffies;
#ifdef CONFIG_BLK_CGROUP
	rq->rl = NULL;
	set_start_time_ns(rq);
	rq->io_start_time_ns = 0;
#endif
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->special = NULL;
	/* tag was already set */
	rq->errors = 0;

	rq->cmd = rq->__cmd;

	rq->extra_len = 0;
	rq->sense_len = 0;
	rq->resid_len = 0;
	rq->sense = NULL;

	INIT_LIST_HEAD(&rq->timeout_list);
	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;
	rq->next_rq = NULL;

	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *
__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(data);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = data->hctx->tags->rqs[tag];

		if (blk_mq_tag_busy(data->hctx)) {
			rq->cmd_flags = REQ_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}

		rq->tag = tag;
		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
		return rq;
	}

	return NULL;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
		bool reserved)
{
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	struct blk_mq_alloc_data alloc_data;
	int ret;

	ret = blk_mq_queue_enter(q);
	if (ret)
		return ERR_PTR(ret);

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
			reserved, ctx, hctx);

	rq = __blk_mq_alloc_request(&alloc_data, rw);
	if (!rq && (gfp & __GFP_WAIT)) {
		__blk_mq_run_hw_queue(hctx);
		blk_mq_put_ctx(ctx);

		ctx = blk_mq_get_ctx(q);
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
				hctx);
		rq = __blk_mq_alloc_request(&alloc_data, rw);
		ctx = alloc_data.ctx;
	}
	blk_mq_put_ctx(ctx);
	if (!rq) {
		blk_mq_queue_exit(q);
		return ERR_PTR(-EWOULDBLOCK);
	}
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);
	rq->cmd_flags = 0;

	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
	blk_mq_queue_exit(q);
}

void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	ctx->rq_completed[rq_is_sync(rq)]++;
	__blk_mq_free_request(hctx, ctx, rq);

}
EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
	blk_mq_free_hctx_request(hctx, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, int error)
{
	blk_account_io_done(rq);

	if (rq->end_io) {
		rq->end_io(rq, error);
	} else {
		if (unlikely(blk_bidi_rq(rq)))
			blk_mq_free_request(rq->next_rq);
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, int error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;

	rq->q->softirq_done_fn(rq);
}

static void blk_mq_ipi_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	bool shared = false;
	int cpu;

	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

void __blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (!q->softirq_done_fn)
		blk_mq_end_request(rq, rq->errors);
	else
		blk_mq_ipi_complete_request(rq);
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
void blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (unlikely(blk_should_fake_timeout(q)))
		return;
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
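/*
 * Illustrative sketch (not part of this file): a typical completion path.
 * The driver's IRQ handler only marks the request complete; the final
 * end-of-request work runs through the ->complete callback wired up via
 * blk_queue_softirq_done(). The "mydrv_*" names are hypothetical.
 *
 *	static irqreturn_t mydrv_irq(int irq, void *data)
 *	{
 *		struct request *rq = mydrv_pop_completed(data);
 *
 *		blk_mq_complete_request(rq);	// may IPI the submitting CPU
 *		return IRQ_HANDLED;
 *	}
 *
 *	static void mydrv_complete_rq(struct request *rq)
 *	{
 *		blk_mq_end_request(rq, rq->errors);	// accounting + free
 *	}
 */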
int blk_mq_request_started(struct request *rq)
{
	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	rq->resid_len = blk_rq_bytes(rq);
	if (unlikely(blk_bidi_rq(rq)))
		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);

	blk_add_timer(rq);

	/*
	 * Ensure that ->deadline is visible before we set the started
	 * flag and clear the completed flag.
	 */
	smp_mb__before_atomic();

	/*
	 * Mark us as started and clear complete. Complete might have been
	 * set if requeue raced with timeout, which then marked it as
	 * complete. So be sure to clear complete again when we start
	 * the request, otherwise we'll ignore the completion event.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}
}
EXPORT_SYMBOL(blk_mq_start_request);

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);

	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq)
{
	__blk_mq_requeue_request(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;
	unsigned long flags;

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->cmd_flags & REQ_SOFTBARRIER))
			continue;

		rq->cmd_flags &= ~REQ_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		blk_mq_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_insert_request(rq, false, false, false);
	}

	/*
	 * Use the start variant of queue running here, so that running
	 * the requeue work will kick stopped queues.
	 */
	blk_mq_start_hw_queues(q);
}
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);

void blk_mq_cancel_requeue_work(struct request_queue *q)
{
	cancel_work_sync(&q->requeue_work);
}
EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_schedule_work(&q->requeue_work);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_abort_requeue_list(struct request_queue *q)
{
	unsigned long flags;
	LIST_HEAD(rq_list);

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	while (!list_empty(&rq_list)) {
		struct request *rq;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->errors = -EIO;
		blk_mq_end_request(rq, rq->errors);
	}
}
EXPORT_SYMBOL(blk_mq_abort_requeue_list);

static inline bool is_flush_request(struct request *rq,
		struct blk_flush_queue *fq, unsigned int tag)
{
	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
			fq->flush_rq->tag == tag);
}

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	struct request *rq = tags->rqs[tag];
	/* mq_ctx of flush rq is always cloned from the corresponding req */
	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);

	if (!is_flush_request(rq, fq, tag))
		return rq;

	return fq->flush_rq;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

struct blk_mq_timeout_data {
	unsigned long next;
	unsigned int next_set;
};

void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	struct blk_mq_ops *ops = req->q->mq_ops;
	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	/*
	 * We know that complete is set at this point. If STARTED isn't set
	 * anymore, then the request isn't active and the "timeout" should
	 * just be ignored. This can happen due to the bitflag ordering.
	 * Timeout first checks if STARTED is set, and if it is, assumes
	 * the request is active. But if we race with completion, then
	 * both flags will get cleared. So check here again, and ignore
	 * a timeout event with a request that isn't active.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
		return;

	if (ops->timeout)
		ret = ops->timeout(req, reserved);

	switch (ret) {
	case BLK_EH_HANDLED:
		__blk_mq_complete_request(req);
		break;
	case BLK_EH_RESET_TIMER:
		blk_add_timer(req);
		blk_clear_rq_complete(req);
		break;
	case BLK_EH_NOT_HANDLED:
		break;
	default:
		printk(KERN_ERR "block: bad eh return: %d\n", ret);
		break;
	}
}
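/*
 * Illustrative sketch (not part of this file): a driver ->timeout()
 * callback as dispatched from blk_mq_rq_timed_out() above. Returning
 * BLK_EH_RESET_TIMER re-arms the timer; BLK_EH_HANDLED tells the core
 * to finish the completion itself. The "mydrv_*" names are hypothetical.
 *
 *	static enum blk_eh_timer_return mydrv_timeout(struct request *rq,
 *						      bool reserved)
 *	{
 *		if (mydrv_still_in_flight(rq))
 *			return BLK_EH_RESET_TIMER;	// give it more time
 *
 *		rq->errors = -ETIMEDOUT;
 *		return BLK_EH_HANDLED;			// core completes rq
 *	}
 */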
static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	struct blk_mq_timeout_data *data = priv;

	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		/*
		 * If a request wasn't started before the queue was
		 * marked dying, kill it here or it'll go unnoticed.
		 */
		if (unlikely(blk_queue_dying(rq->q))) {
			rq->errors = -EIO;
			blk_mq_complete_request(rq);
		}
		return;
	}
	if (rq->cmd_flags & REQ_NO_TIMEOUT)
		return;

	if (time_after_eq(jiffies, rq->deadline)) {
		if (!blk_mark_rq_complete(rq))
			blk_mq_rq_timed_out(rq, reserved);
	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
		data->next = rq->deadline;
		data->next_set = 1;
	}
}

static void blk_mq_rq_timer(unsigned long priv)
{
	struct request_queue *q = (struct request_queue *)priv;
	struct blk_mq_timeout_data data = {
		.next		= 0,
		.next_set	= 0,
	};
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
	}

	if (data.next_set) {
		data.next = blk_rq_timeout(round_jiffies_up(data.next));
		mod_timer(&q->timeout, data.next);
	} else {
		queue_for_each_hw_ctx(q, hctx, i)
			blk_mq_tag_idle(hctx);
	}
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list
 */
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct blk_mq_ctx *ctx;
	int i;

	for (i = 0; i < hctx->ctx_map.map_size; i++) {
		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
		unsigned int off, bit;

		if (!bm->word)
			continue;

		bit = 0;
		off = i * hctx->ctx_map.bits_per_word;
		do {
			bit = find_next_bit(&bm->word, bm->depth, bit);
			if (bit >= bm->depth)
				break;

			ctx = hctx->ctxs[bit + off];
			clear_bit(bit, &bm->word);
			spin_lock(&ctx->lock);
			list_splice_tail_init(&ctx->rq_list, list);
			spin_unlock(&ctx->lock);

			bit++;
		} while (1);
	}
}
/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct request *rq;
	LIST_HEAD(rq_list);
	LIST_HEAD(driver_list);
	struct list_head *dptr;
	int queued;

	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	flush_busy_ctxs(hctx, &rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Start off with dptr being NULL, so we start the first request
	 * immediately, even if we have more pending.
	 */
	dptr = NULL;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	queued = 0;
	while (!list_empty(&rq_list)) {
		struct blk_mq_queue_data bd;
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);

		bd.rq = rq;
		bd.list = dptr;
		bd.last = list_empty(&rq_list);

		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			list_add(&rq->queuelist, &rq_list);
			__blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			rq->errors = -EIO;
			blk_mq_end_request(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;

		/*
		 * We've done the first request. If we have more than 1
		 * left in the list, set dptr to defer issue.
		 */
		if (!dptr && rq_list.next != rq_list.prev)
			dptr = &driver_list;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}
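/*
 * Illustrative sketch (not part of this file): the ->queue_rq() contract
 * that the dispatch loop above relies on. The driver starts the request
 * and reports whether it was accepted; BLK_MQ_RQ_QUEUE_BUSY puts it back
 * on hctx->dispatch for a later run. The "mydrv_*" names are hypothetical.
 *
 *	static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *				  const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);	// arms the timeout
 *		if (!mydrv_hw_has_room(hctx))
 *			return BLK_MQ_RQ_QUEUE_BUSY;	// retried later
 *		if (mydrv_submit(rq, bd->last))		// bd->last: no more coming
 *			return BLK_MQ_RQ_QUEUE_ERROR;	// rq ended with -EIO
 *		return BLK_MQ_RQ_QUEUE_OK;
 *	}
 */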
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	if (--hctx->next_cpu_batch <= 0) {
		int cpu = hctx->next_cpu, next_cpu;

		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
		if (next_cpu >= nr_cpu_ids)
			next_cpu = cpumask_first(hctx->cpumask);

		hctx->next_cpu = next_cpu;
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;

		return cpu;
	}

	return hctx->next_cpu;
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
	    !blk_mq_hw_queue_mapped(hctx)))
		return;

	if (!async) {
		int cpu = get_cpu();
		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
			__blk_mq_run_hw_queue(hctx);
			put_cpu();
			return;
		}

		put_cpu();
	}

	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
			&hctx->run_work, 0);
}

static void blk_mq_run_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, false);
	}
}

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);
	cancel_delayed_work(&hctx->delay_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);

	__blk_mq_run_hw_queue(hctx);
}

static void blk_mq_delay_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);

	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
		__blk_mq_run_hw_queue(hctx);
}

void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
		return;

	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
			&hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq, bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	if (at_head)
		list_add(&rq->queuelist, &ctx->rq_list);
	else
		list_add_tail(&rq->queuelist, &ctx->rq_list);

	blk_mq_hctx_mark_pending(hctx, ctx);
}

void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
		bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);
	if (!cpu_online(ctx->cpu))
		rq->mq_ctx = ctx = current_ctx;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq, at_head);
	spin_unlock(&ctx->lock);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);

	blk_mq_put_ctx(current_ctx);
}

static void blk_mq_insert_requests(struct request_queue *q,
				     struct blk_mq_ctx *ctx,
				     struct list_head *list,
				     int depth,
				     bool from_schedule)

{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * Preemption doesn't flush the plug list, so it's possible that
	 * ctx->cpu is offline now.
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq, false);
	}
	spin_unlock(&ctx->lock);

	blk_mq_run_hw_queue(hctx, from_schedule);
	blk_mq_put_ctx(current_ctx);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);

	if (blk_do_io_stat(rq))
		blk_account_io_start(rq, 1);
}

static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
{
	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
		!blk_queue_nomerges(hctx->queue);
}

static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
					 struct blk_mq_ctx *ctx,
					 struct request *rq, struct bio *bio)
{
	if (!hctx_allow_merges(hctx)) {
		blk_mq_bio_to_request(rq, bio);
		spin_lock(&ctx->lock);
insert_rq:
		__blk_mq_insert_request(hctx, rq, false);
		spin_unlock(&ctx->lock);
		return false;
	} else {
		struct request_queue *q = hctx->queue;

		spin_lock(&ctx->lock);
		if (!blk_mq_attempt_merge(q, ctx, bio)) {
			blk_mq_bio_to_request(rq, bio);
			goto insert_rq;
		}

		spin_unlock(&ctx->lock);
		__blk_mq_free_request(hctx, ctx, rq);
		return true;
	}
}

struct blk_map_ctx {
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
};

static struct request *blk_mq_map_request(struct request_queue *q,
					  struct bio *bio,
					  struct blk_map_ctx *data)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	int rw = bio_data_dir(bio);
	struct blk_mq_alloc_data alloc_data;

	if (unlikely(blk_mq_queue_enter(q))) {
		bio_endio(bio, -EIO);
		return NULL;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rw_is_sync(bio->bi_rw))
		rw |= REQ_SYNC;

	trace_block_getrq(q, bio, rw);
	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
			hctx);
	rq = __blk_mq_alloc_request(&alloc_data, rw);
	if (unlikely(!rq)) {
		__blk_mq_run_hw_queue(hctx);
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);

		ctx = blk_mq_get_ctx(q);
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(&alloc_data, q,
				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
		rq = __blk_mq_alloc_request(&alloc_data, rw);
		ctx = alloc_data.ctx;
		hctx = alloc_data.hctx;
	}

	hctx->queued++;
	data->hctx = hctx;
	data->ctx = ctx;
	return rq;
}
/*
 * Multiple hardware queue variant. This will not use per-process plugs,
 * but will attempt to bypass the hctx queueing if we can go straight to
 * hardware for SYNC IO.
 */
static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	struct blk_map_ctx data;
	struct request *rq;

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	rq = blk_mq_map_request(q, bio, &data);
	if (unlikely(!rq))
		return;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * If the driver supports deferred issue based on 'last', then
	 * queue it up like normal since we can potentially save some
	 * CPU this way.
	 */
	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
		struct blk_mq_queue_data bd = {
			.rq = rq,
			.list = NULL,
			.last = 1
		};
		int ret;

		blk_mq_bio_to_request(rq, bio);

		/*
		 * For an OK return, we are done. For an error, kill it.
		 * Anything else (busy) is added to our list as we previously
		 * would have done.
		 */
		ret = q->mq_ops->queue_rq(data.hctx, &bd);
		if (ret == BLK_MQ_RQ_QUEUE_OK)
			goto done;
		else {
			__blk_mq_requeue_request(rq);

			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
				rq->errors = -EIO;
				blk_mq_end_request(rq, rq->errors);
				goto done;
			}
		}
	}

	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
		/*
		 * For a SYNC request, send it to the hardware immediately. For
		 * an ASYNC request, just ensure that we run it later on. The
		 * latter allows for merging opportunities and more efficient
		 * dispatching.
		 */
run_queue:
		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
	}
done:
	blk_mq_put_ctx(data.ctx);
}

/*
 * Single hardware queue variant. This will attempt to use any per-process
 * plug for merging and IO deferral.
 */
static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	unsigned int use_plug, request_count = 0;
	struct blk_map_ctx data;
	struct request *rq;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && !is_sync;

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (use_plug && !blk_queue_nomerges(q) &&
	    blk_attempt_plug_merge(q, bio, &request_count))
		return;

	rq = blk_mq_map_request(q, bio, &data);
	if (unlikely(!rq))
		return;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(data.ctx);
			return;
		}
	}

	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
		/*
		 * For a SYNC request, send it to the hardware immediately. For
		 * an ASYNC request, just ensure that we run it later on. The
		 * latter allows for merging opportunities and more efficient
		 * dispatching.
		 */
run_queue:
		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
	}

	blk_mq_put_ctx(data.ctx);
}
/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
		struct blk_mq_tags *tags, unsigned int hctx_idx)
{
	struct page *page;

	if (tags->rqs && set->ops->exit_request) {
		int i;

		for (i = 0; i < tags->nr_tags; i++) {
			if (!tags->rqs[i])
				continue;
			set->ops->exit_request(set->driver_data, tags->rqs[i],
						hctx_idx, i);
			tags->rqs[i] = NULL;
		}
	}

	while (!list_empty(&tags->page_list)) {
		page = list_first_entry(&tags->page_list, struct page, lru);
		list_del_init(&page->lru);
		__free_pages(page, page->private);
	}

	kfree(tags->rqs);

	blk_mq_free_tags(tags);
}

static size_t order_to_size(unsigned int order)
{
	return (size_t)PAGE_SIZE << order;
}

static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
		unsigned int hctx_idx)
{
	struct blk_mq_tags *tags;
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
				set->numa_node,
				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
	if (!tags)
		return NULL;

	INIT_LIST_HEAD(&tags->page_list);

	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
				 set->numa_node);
	if (!tags->rqs) {
		blk_mq_free_tags(tags);
		return NULL;
	}

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				cache_line_size());
	left = rq_size * set->queue_depth;

	for (i = 0; i < set->queue_depth; ) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(set->numa_node,
				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
				this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			goto fail;

		page->private = this_order;
		list_add_tail(&page->lru, &tags->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, set->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			tags->rqs[i] = p;
			if (set->ops->init_request) {
				if (set->ops->init_request(set->driver_data,
						tags->rqs[i], hctx_idx, i,
						set->numa_node)) {
					tags->rqs[i] = NULL;
					goto fail;
				}
			}

			p += rq_size;
			i++;
		}
	}

	return tags;

fail:
	blk_mq_free_rq_map(set, tags, hctx_idx);
	return NULL;
}

static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
{
	kfree(bitmap->map);
}

static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
{
	unsigned int bpw = 8, total, num_maps, i;

	bitmap->bits_per_word = bpw;

	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
					GFP_KERNEL, node);
	if (!bitmap->map)
		return -ENOMEM;

	bitmap->map_size = num_maps;

	total = nr_cpu_ids;
	for (i = 0; i < num_maps; i++) {
		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
		total -= bitmap->map[i].depth;
	}

	return 0;
}
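/*
 * Worked example (illustrative, not part of this file): with
 * bits_per_word == 8 and nr_cpu_ids == 20, the map allocated above has
 * three words with depths 8, 8 and 4. A software queue with
 * ctx->index_hw == 13 then lives in map[13 / 8] == map[1], and
 * CTX_TO_BIT() selects bit 13 & 7 == 5 inside that word; flush_busy_ctxs()
 * walks exactly these per-word bits when draining the software queues.
 */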
static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(q, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		blk_mq_hctx_clear_pending(hctx, ctx);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return NOTIFY_OK;

	ctx = blk_mq_get_ctx(q);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);

	blk_mq_run_hw_queue(hctx, true);
	blk_mq_put_ctx(ctx);
	return NOTIFY_OK;
}

static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_tag_set *set = q->tag_set;

	if (set->tags[hctx->queue_num])
		return NOTIFY_OK;

	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
	if (!set->tags[hctx->queue_num])
		return NOTIFY_STOP;

	hctx->tags = set->tags[hctx->queue_num];
	return NOTIFY_OK;
}

static int blk_mq_hctx_notify(void *data, unsigned long action,
			      unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;

	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		return blk_mq_hctx_cpu_offline(hctx, cpu);
	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		return blk_mq_hctx_cpu_online(hctx, cpu);

	return NOTIFY_OK;
}

static void blk_mq_exit_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	unsigned flush_start_tag = set->queue_depth;

	blk_mq_tag_idle(hctx);

	if (set->ops->exit_request)
		set->ops->exit_request(set->driver_data,
				       hctx->fq->flush_rq, hctx_idx,
				       flush_start_tag + hctx_idx);

	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);

	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
	blk_free_flush_queue(hctx->fq);
	kfree(hctx->ctxs);
	blk_mq_free_bitmap(&hctx->ctx_map);
}

static void blk_mq_exit_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set, int nr_queue)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (i == nr_queue)
			break;
		blk_mq_exit_hctx(q, set, hctx, i);
	}
}

static void blk_mq_free_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		free_cpumask_var(hctx->cpumask);
}
static int blk_mq_init_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
	int node;
	unsigned flush_start_tag = set->queue_depth;

	node = hctx->numa_node;
	if (node == NUMA_NO_NODE)
		node = hctx->numa_node = set->numa_node;

	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
	spin_lock_init(&hctx->lock);
	INIT_LIST_HEAD(&hctx->dispatch);
	hctx->queue = q;
	hctx->queue_num = hctx_idx;
	hctx->flags = set->flags;

	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
					blk_mq_hctx_notify, hctx);
	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

	hctx->tags = set->tags[hctx_idx];

	/*
	 * Allocate space for all possible cpus to avoid allocation at
	 * runtime
	 */
	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
					GFP_KERNEL, node);
	if (!hctx->ctxs)
		goto unregister_cpu_notifier;

	if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
		goto free_ctxs;

	hctx->nr_ctx = 0;

	if (set->ops->init_hctx &&
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
		goto free_bitmap;

	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
	if (!hctx->fq)
		goto exit_hctx;

	if (set->ops->init_request &&
	    set->ops->init_request(set->driver_data,
				   hctx->fq->flush_rq, hctx_idx,
				   flush_start_tag + hctx_idx, node))
		goto free_fq;

	return 0;

 free_fq:
	kfree(hctx->fq);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
 free_bitmap:
	blk_mq_free_bitmap(&hctx->ctx_map);
 free_ctxs:
	kfree(hctx->ctxs);
 unregister_cpu_notifier:
	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);

	return -1;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_init_hctx(q, set, hctx, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	blk_mq_exit_hw_queues(q, set, i);

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		if (!cpu_online(i))
			continue;

		hctx = q->mq_ops->map_queue(q, i);
		cpumask_set_cpu(i, hctx->cpumask);
		hctx->nr_ctx++;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}
static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		if (!cpu_online(i))
			continue;

		hctx = q->mq_ops->map_queue(q, i);
		cpumask_set_cpu(i, hctx->cpumask);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			struct blk_mq_tag_set *set = q->tag_set;

			if (set->tags[i]) {
				blk_mq_free_rq_map(set, set->tags[i], i);
				set->tags[i] = NULL;
				hctx->tags = NULL;
			}
			continue;
		}

		/*
		 * Initialize batch round-robin counts
		 */
		hctx->next_cpu = cpumask_first(hctx->cpumask);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}

static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q;
	bool shared;
	int i;

	if (set->tag_list.next == set->tag_list.prev)
		shared = false;
	else
		shared = true;

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_freeze_queue(q);

		queue_for_each_hw_ctx(q, hctx, i) {
			if (shared)
				hctx->flags |= BLK_MQ_F_TAG_SHARED;
			else
				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
		}
		blk_mq_unfreeze_queue(q);
	}
}

static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	mutex_lock(&set->tag_list_lock);
	list_del_init(&q->tag_set_list);
	blk_mq_update_tag_set_depth(set);
	mutex_unlock(&set->tag_list_lock);
}

static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
				     struct request_queue *q)
{
	q->tag_set = set;

	mutex_lock(&set->tag_list_lock);
	list_add_tail(&q->tag_set_list, &set->tag_list);
	blk_mq_update_tag_set_depth(set);
	mutex_unlock(&set->tag_list_lock);
}
/*
 * This is the actual release handler for mq, but we do it from the
 * request queue's release handler to avoid use-after-free and other
 * headaches: q->mq_kobj shouldn't have been introduced, but we can't
 * group the ctx/kctx kobjs without it.
 */
void blk_mq_release(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	/* hctx kobj stays in hctx */
	queue_for_each_hw_ctx(q, hctx, i)
		kfree(hctx);

	kfree(q->queue_hw_ctx);

	/* ctx kobj stays in queue_ctx */
	free_percpu(q->queue_ctx);
}

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx __percpu *ctx;
	struct request_queue *q;
	unsigned int *map;
	int i;

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			set->numa_node);

	if (!hctxs)
		goto err_percpu;

	map = blk_mq_make_queue_map(set);
	if (!map)
		goto err_map;

	for (i = 0; i < set->nr_hw_queues; i++) {
		int node = blk_mq_hw_queue_to_node(map, i);

		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
					GFP_KERNEL, node);
		if (!hctxs[i])
			goto err_hctxs;

		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
						node))
			goto err_hctxs;

		atomic_set(&hctxs[i]->nr_active, 0);
		hctxs[i]->numa_node = node;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
	if (!q)
		goto err_hctxs;

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
		goto err_mq_usage;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = set->nr_hw_queues;
	q->mq_map = map;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = set->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	if (!(set->flags & BLK_MQ_F_SG_MERGE))
		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

	q->sg_reserved_size = INT_MAX;

	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	if (q->nr_hw_queues > 1)
		blk_queue_make_request(q, blk_mq_make_request);
	else
		blk_queue_make_request(q, blk_sq_make_request);

	if (set->timeout)
		blk_queue_rq_timeout(q, set->timeout);
	/*
	 * Do this after blk_queue_make_request() overrides it...
	 */
	q->nr_requests = set->queue_depth;

	if (set->ops->complete)
		blk_queue_softirq_done(q, set->ops->complete);

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);

	if (blk_mq_init_hw_queues(q, set))
		goto err_mq_usage;

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	blk_mq_add_queue_tag_set(set, q);

	blk_mq_map_swqueue(q);

	return q;

err_mq_usage:
	blk_cleanup_queue(q);
err_hctxs:
	kfree(map);
	for (i = 0; i < set->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		free_cpumask_var(hctxs[i]->cpumask);
		kfree(hctxs[i]);
	}
err_map:
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	blk_mq_del_queue_tag_set(q);

	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
	blk_mq_free_hw_queues(q, set);

	percpu_ref_exit(&q->mq_usage_counter);

	kfree(q->mq_map);

	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	WARN_ON_ONCE(!q->mq_freeze_depth);

	blk_mq_sysfs_unregister(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory; is it worth doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_sysfs_register(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before new mappings are established, a hotadded CPU might already
	 * have started handling requests. This doesn't break anything as we
	 * map offline CPUs to the first hardware queue. We will re-init the
	 * queue below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);

	/*
	 * We need to freeze and reinit all existing queues.  Freezing
	 * involves synchronous wait for an RCU grace period and doing it
	 * one by one may take a long time.  Start freezing all queues in
	 * one swoop and then wait for the completions so that freezing can
	 * take place in parallel.
	 */
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_start(q);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_wait(q);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_unfreeze_queue(q);

	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++) {
		set->tags[i] = blk_mq_init_rq_map(set, i);
		if (!set->tags[i])
			goto out_unwind;
	}

	return 0;

out_unwind:
	while (--i >= 0)
		blk_mq_free_rq_map(set, set->tags[i], i);

	return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
						depth, set->queue_depth);

	return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the set
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}

	set->tags = kmalloc_node(set->nr_hw_queues *
				 sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
	if (!set->tags)
		return -ENOMEM;

	if (blk_mq_alloc_rq_maps(set))
		goto enomem;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;
enomem:
	kfree(set->tags);
	set->tags = NULL;
	return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
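/*
 * Illustrative sketch (not part of this file): minimal driver-side setup
 * of a tag set and queue using blk_mq_alloc_tag_set() above and
 * blk_mq_init_queue(). The "mydrv_*" names are hypothetical; error
 * handling is trimmed.
 *
 *	static struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *		.map_queue	= blk_mq_map_queue,	// default CPU mapping
 *		.complete	= mydrv_complete_rq,
 *		.timeout	= mydrv_timeout,
 *	};
 *
 *	mydrv->tag_set.ops = &mydrv_mq_ops;
 *	mydrv->tag_set.nr_hw_queues = 1;
 *	mydrv->tag_set.queue_depth = 64;
 *	mydrv->tag_set.numa_node = NUMA_NO_NODE;
 *	mydrv->tag_set.cmd_size = sizeof(struct mydrv_cmd);	// per-rq payload
 *	mydrv->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *	if (blk_mq_alloc_tag_set(&mydrv->tag_set))
 *		goto out;
 *	q = blk_mq_init_queue(&mydrv->tag_set);	// ERR_PTR() on failure
 */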
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++) {
		if (set->tags[i])
			blk_mq_free_rq_map(set, set->tags[i], i);
	}

	kfree(set->tags);
	set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i, ret;

	if (!set || nr > set->queue_depth)
		return -EINVAL;

	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_tag_update_depth(hctx->tags, nr);
		if (ret)
			break;
	}

	if (!ret)
		q->nr_requests = nr;

	return ret;
}

void blk_mq_disable_hotplug(void)
{
	mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
	mutex_unlock(&all_q_mutex);
}

static int __init blk_mq_init(void)
{
	blk_mq_cpu_init();

	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);

	return 0;
}
subsys_initcall(blk_mq_init);