/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->ctx_map.size; i++)
		if (hctx->ctx_map.map[i].word)
			return true;

	return false;
}

static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
					      struct blk_mq_ctx *ctx)
{
	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
}

#define CTX_TO_BIT(hctx, ctx)	\
	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
{
	while (true) {
		int ret;

		if (percpu_ref_tryget_live(&q->mq_usage_counter))
			return 0;

		if (!(gfp & __GFP_WAIT))
			return -EBUSY;

		ret = wait_event_interruptible(q->mq_freeze_wq,
				!q->mq_freeze_depth || blk_queue_dying(q));
		if (blk_queue_dying(q))
			return -ENODEV;
		if (ret)
			return ret;
	}
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	percpu_ref_put(&q->mq_usage_counter);
}

static void blk_mq_usage_counter_release(struct percpu_ref *ref)
{
	struct request_queue *q =
		container_of(ref, struct request_queue, mq_usage_counter);

	wake_up_all(&q->mq_freeze_wq);
}

void blk_mq_freeze_queue_start(struct request_queue *q)
{
	bool freeze;

	spin_lock_irq(q->queue_lock);
	freeze = !q->mq_freeze_depth++;
	spin_unlock_irq(q->queue_lock);

	if (freeze) {
		percpu_ref_kill(&q->mq_usage_counter);
		blk_mq_run_hw_queues(q, false);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);

static void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
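 *
 * Illustrative use only (a sketch, mirroring what blk_mq_update_tag_set_depth()
 * below does): bracket a structural update with a freeze/unfreeze pair, e.g.
 *
 *	blk_mq_freeze_queue(q);
 *	... modify queue or tag data structures ...
 *	blk_mq_unfreeze_queue(q);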
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
	blk_mq_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake;

	spin_lock_irq(q->queue_lock);
	wake = !--q->mq_freeze_depth;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake) {
		percpu_ref_reinit(&q->mq_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);

	/*
	 * If we are called because the queue has now been marked as
	 * dying, we need to ensure that processes currently waiting on
	 * the queue are notified as well.
	 */
	wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	INIT_LIST_HEAD(&rq->queuelist);
	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = q;
	rq->mq_ctx = ctx;
	rq->cmd_flags |= rw_flags;
	/* do not touch atomic flags, it needs atomic ops against the timer */
	rq->cpu = -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
	rq->start_time = jiffies;
#ifdef CONFIG_BLK_CGROUP
	rq->rl = NULL;
	set_start_time_ns(rq);
	rq->io_start_time_ns = 0;
#endif
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->special = NULL;
	/* tag was already set */
	rq->errors = 0;

	rq->cmd = rq->__cmd;

	rq->extra_len = 0;
	rq->sense_len = 0;
	rq->resid_len = 0;
	rq->sense = NULL;

	INIT_LIST_HEAD(&rq->timeout_list);
	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;
	rq->next_rq = NULL;

	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *
__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(data);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = data->hctx->tags->rqs[tag];

		if (blk_mq_tag_busy(data->hctx)) {
			rq->cmd_flags = REQ_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}

		rq->tag = tag;
		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
		return rq;
	}

	return NULL;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
		bool reserved)
{
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	struct blk_mq_alloc_data alloc_data;
	int ret;

	ret = blk_mq_queue_enter(q, gfp);
	if (ret)
		return ERR_PTR(ret);

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
			reserved, ctx, hctx);

	rq = __blk_mq_alloc_request(&alloc_data, rw);
	if (!rq && (gfp & __GFP_WAIT)) {
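		/*
		 * No tag was available without waiting. Run the hardware
		 * queue to push out pending work so tags get recycled, then
		 * retry on a (possibly different) ctx with __GFP_WAIT kept,
		 * allowing blk_mq_get_tag() to block for a free tag.
		 */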
		__blk_mq_run_hw_queue(hctx);
		blk_mq_put_ctx(ctx);

		ctx = blk_mq_get_ctx(q);
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
				hctx);
		rq = __blk_mq_alloc_request(&alloc_data, rw);
		ctx = alloc_data.ctx;
	}
	blk_mq_put_ctx(ctx);
	if (!rq) {
		blk_mq_queue_exit(q);
		return ERR_PTR(-EWOULDBLOCK);
	}
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);
	rq->cmd_flags = 0;

	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
	blk_mq_queue_exit(q);
}

void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	ctx->rq_completed[rq_is_sync(rq)]++;
	__blk_mq_free_request(hctx, ctx, rq);

}
EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
	blk_mq_free_hctx_request(hctx, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, int error)
{
	blk_account_io_done(rq);

	if (rq->end_io) {
		rq->end_io(rq, error);
	} else {
		if (unlikely(blk_bidi_rq(rq)))
			blk_mq_free_request(rq->next_rq);
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, int error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;

	rq->q->softirq_done_fn(rq);
}

static void blk_mq_ipi_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	bool shared = false;
	int cpu;

	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

void __blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (!q->softirq_done_fn)
		blk_mq_end_request(rq, rq->errors);
	else
		blk_mq_ipi_complete_request(rq);
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
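 *
 *	A driver's IRQ/completion path would typically call this once per
 *	finished request; the queue's softirq_done_fn (installed via
 *	blk_queue_softirq_done() from the driver's ->complete hook, see
 *	blk_mq_init_allocated_queue()) then finishes the request, usually
 *	by calling blk_mq_end_request(). Illustrative, not a requirement.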
 **/
void blk_mq_complete_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (unlikely(blk_should_fake_timeout(q)))
		return;
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

int blk_mq_request_started(struct request *rq)
{
	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	rq->resid_len = blk_rq_bytes(rq);
	if (unlikely(blk_bidi_rq(rq)))
		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);

	blk_add_timer(rq);

	/*
	 * Ensure that ->deadline is visible before we set the started
	 * flag and clear the completed flag.
	 */
	smp_mb__before_atomic();

	/*
	 * Mark us as started and clear complete. Complete might have been
	 * set if requeue raced with timeout, which then marked it as
	 * complete. So be sure to clear complete again when we start
	 * the request, otherwise we'll ignore the completion event.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}
}
EXPORT_SYMBOL(blk_mq_start_request);

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);

	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq)
{
	__blk_mq_requeue_request(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;
	unsigned long flags;

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->cmd_flags & REQ_SOFTBARRIER))
			continue;

		rq->cmd_flags &= ~REQ_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		blk_mq_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_insert_request(rq, false, false, false);
	}

	/*
	 * Use the start variant of queue running here, so that running
	 * the requeue work will kick stopped queues.
	 */
	blk_mq_start_hw_queues(q);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
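	 *
	 * Drivers normally reach this via blk_mq_requeue_request() and then
	 * call blk_mq_kick_requeue_list() once they are done requeueing,
	 * roughly:
	 *
	 *	blk_mq_requeue_request(rq);
	 *	...
	 *	blk_mq_kick_requeue_list(q);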
	 */
	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);
}
EXPORT_SYMBOL(blk_mq_add_to_requeue_list);

void blk_mq_cancel_requeue_work(struct request_queue *q)
{
	cancel_work_sync(&q->requeue_work);
}
EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_schedule_work(&q->requeue_work);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_abort_requeue_list(struct request_queue *q)
{
	unsigned long flags;
	LIST_HEAD(rq_list);

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	while (!list_empty(&rq_list)) {
		struct request *rq;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->errors = -EIO;
		blk_mq_end_request(rq, rq->errors);
	}
}
EXPORT_SYMBOL(blk_mq_abort_requeue_list);

static inline bool is_flush_request(struct request *rq,
		struct blk_flush_queue *fq, unsigned int tag)
{
	return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
			fq->flush_rq->tag == tag);
}

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	struct request *rq = tags->rqs[tag];
	/* mq_ctx of flush rq is always cloned from the corresponding req */
	struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);

	if (!is_flush_request(rq, fq, tag))
		return rq;

	return fq->flush_rq;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

struct blk_mq_timeout_data {
	unsigned long next;
	unsigned int next_set;
};

void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	struct blk_mq_ops *ops = req->q->mq_ops;
	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;

	/*
	 * We know that complete is set at this point. If STARTED isn't set
	 * anymore, then the request isn't active and the "timeout" should
	 * just be ignored. This can happen due to the bitflag ordering.
	 * Timeout first checks if STARTED is set, and if it is, assumes
	 * the request is active. But if we race with completion, then
	 * both flags will get cleared. So check here again, and ignore
	 * a timeout event with a request that isn't active.
	 */
	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
		return;

	if (ops->timeout)
		ret = ops->timeout(req, reserved);

	switch (ret) {
	case BLK_EH_HANDLED:
		__blk_mq_complete_request(req);
		break;
	case BLK_EH_RESET_TIMER:
		blk_add_timer(req);
		blk_clear_rq_complete(req);
		break;
	case BLK_EH_NOT_HANDLED:
		break;
	default:
		printk(KERN_ERR "block: bad eh return: %d\n", ret);
		break;
	}
}

static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	struct blk_mq_timeout_data *data = priv;

	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		/*
		 * If a request wasn't started before the queue was
		 * marked dying, kill it here or it'll go unnoticed.
		 */
		if (unlikely(blk_queue_dying(rq->q))) {
			rq->errors = -EIO;
			blk_mq_complete_request(rq);
		}
		return;
	}
	if (rq->cmd_flags & REQ_NO_TIMEOUT)
		return;

	if (time_after_eq(jiffies, rq->deadline)) {
		if (!blk_mark_rq_complete(rq))
			blk_mq_rq_timed_out(rq, reserved);
	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
		data->next = rq->deadline;
		data->next_set = 1;
	}
}

static void blk_mq_rq_timer(unsigned long priv)
{
	struct request_queue *q = (struct request_queue *)priv;
	struct blk_mq_timeout_data data = {
		.next		= 0,
		.next_set	= 0,
	};
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
	}

	if (data.next_set) {
		data.next = blk_rq_timeout(round_jiffies_up(data.next));
		mod_timer(&q->timeout, data.next);
	} else {
		queue_for_each_hw_ctx(q, hctx, i)
			blk_mq_tag_idle(hctx);
	}
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct blk_mq_ctx *ctx;
	int i;

	for (i = 0; i < hctx->ctx_map.size; i++) {
		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
		unsigned int off, bit;

		if (!bm->word)
			continue;

		bit = 0;
		off = i * hctx->ctx_map.bits_per_word;
		do {
			bit = find_next_bit(&bm->word, bm->depth, bit);
			if (bit >= bm->depth)
				break;

			ctx = hctx->ctxs[bit + off];
			clear_bit(bit, &bm->word);
			spin_lock(&ctx->lock);
			list_splice_tail_init(&ctx->rq_list, list);
			spin_unlock(&ctx->lock);

			bit++;
		} while (1);
	}
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct request *rq;
	LIST_HEAD(rq_list);
	LIST_HEAD(driver_list);
	struct list_head *dptr;
	int queued;

	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	flush_busy_ctxs(hctx, &rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Start off with dptr being NULL, so we start the first request
	 * immediately, even if we have more pending.
	 */
	dptr = NULL;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	queued = 0;
	while (!list_empty(&rq_list)) {
		struct blk_mq_queue_data bd;
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);

		bd.rq = rq;
		bd.list = dptr;
		bd.last = list_empty(&rq_list);

		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			list_add(&rq->queuelist, &rq_list);
			__blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			rq->errors = -EIO;
			blk_mq_end_request(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;

		/*
		 * We've done the first request. If we have more than 1
		 * left in the list, set dptr to defer issue.
		 */
		if (!dptr && rq_list.next != rq_list.prev)
			dptr = &driver_list;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
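 *
 * For example (with illustrative numbers): if hctx->cpumask spans CPUs 0-3
 * and BLK_MQ_CPU_WORK_BATCH is 8, the first eight punted queue runs land on
 * CPU 0, the next eight on CPU 1, and so on, wrapping back to CPU 0.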
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	if (--hctx->next_cpu_batch <= 0) {
		int cpu = hctx->next_cpu, next_cpu;

		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
		if (next_cpu >= nr_cpu_ids)
			next_cpu = cpumask_first(hctx->cpumask);

		hctx->next_cpu = next_cpu;
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;

		return cpu;
	}

	return hctx->next_cpu;
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
	    !blk_mq_hw_queue_mapped(hctx)))
		return;

	if (!async) {
		int cpu = get_cpu();
		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
			__blk_mq_run_hw_queue(hctx);
			put_cpu();
			return;
		}

		put_cpu();
	}

	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
			&hctx->run_work, 0);
}

void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);
	cancel_delayed_work(&hctx->delay_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);

	__blk_mq_run_hw_queue(hctx);
}

static void blk_mq_delay_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);

	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
		__blk_mq_run_hw_queue(hctx);
}

void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
		return;

	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
			&hctx->delay_work, msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);

static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq, bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	if (at_head)
		list_add(&rq->queuelist, &ctx->rq_list);
	else
		list_add_tail(&rq->queuelist, &ctx->rq_list);

	blk_mq_hctx_mark_pending(hctx, ctx);
}

void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
		bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);
	if (!cpu_online(ctx->cpu))
		rq->mq_ctx = ctx = current_ctx;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq, at_head);
	spin_unlock(&ctx->lock);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);

	blk_mq_put_ctx(current_ctx);
}

static void blk_mq_insert_requests(struct request_queue *q,
				     struct blk_mq_ctx *ctx,
				     struct list_head *list,
				     int depth,
				     bool from_schedule)

{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
	 * offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq, false);
	}
	spin_unlock(&ctx->lock);

	blk_mq_run_hw_queue(hctx, from_schedule);
	blk_mq_put_ctx(current_ctx);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);

	if (blk_do_io_stat(rq))
		blk_account_io_start(rq, 1);
}

static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
{
	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
		!blk_queue_nomerges(hctx->queue);
}

static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
					 struct blk_mq_ctx *ctx,
					 struct request *rq, struct bio *bio)
{
	if (!hctx_allow_merges(hctx)) {
		blk_mq_bio_to_request(rq, bio);
		spin_lock(&ctx->lock);
insert_rq:
		__blk_mq_insert_request(hctx, rq, false);
		spin_unlock(&ctx->lock);
		return false;
	} else {
		struct request_queue *q = hctx->queue;

		spin_lock(&ctx->lock);
		if (!blk_mq_attempt_merge(q, ctx, bio)) {
			blk_mq_bio_to_request(rq, bio);
			goto insert_rq;
		}

		spin_unlock(&ctx->lock);
		__blk_mq_free_request(hctx, ctx, rq);
		return true;
	}
}

struct blk_map_ctx {
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
};

static struct request *blk_mq_map_request(struct request_queue *q,
					  struct bio *bio,
					  struct blk_map_ctx *data)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	int rw = bio_data_dir(bio);
	struct blk_mq_alloc_data alloc_data;

	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
		bio_endio(bio, -EIO);
		return NULL;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rw_is_sync(bio->bi_rw))
		rw |= REQ_SYNC;

	trace_block_getrq(q, bio, rw);
	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
			hctx);
	rq = __blk_mq_alloc_request(&alloc_data, rw);
	if (unlikely(!rq)) {
		__blk_mq_run_hw_queue(hctx);
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);

		ctx = blk_mq_get_ctx(q);
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
		blk_mq_set_alloc_data(&alloc_data, q,
				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
		rq = __blk_mq_alloc_request(&alloc_data, rw);
		ctx = alloc_data.ctx;
		hctx = alloc_data.hctx;
	}

	hctx->queued++;
	data->hctx = hctx;
	data->ctx = ctx;
	return rq;
}

/*
 * Multiple hardware queue variant. This will not use per-process plugs,
 * but will attempt to bypass the hctx queueing if we can go straight to
 * hardware for SYNC IO.
 */
static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	struct blk_map_ctx data;
	struct request *rq;

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	rq = blk_mq_map_request(q, bio, &data);
	if (unlikely(!rq))
		return;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * If the driver supports deferred issue based on 'last', then
	 * queue it up like normal since we can potentially save some
	 * CPU this way.
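	 *
	 * Otherwise, for sync IO we attempt a direct ->queue_rq() below
	 * with bd.last set, bypassing the software queues entirely.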
	 */
	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
		struct blk_mq_queue_data bd = {
			.rq = rq,
			.list = NULL,
			.last = 1
		};
		int ret;

		blk_mq_bio_to_request(rq, bio);

		/*
		 * For OK queue, we are done. For error, kill it. Any other
		 * error (busy), just add it to our list as we previously
		 * would have done
		 */
		ret = q->mq_ops->queue_rq(data.hctx, &bd);
		if (ret == BLK_MQ_RQ_QUEUE_OK)
			goto done;
		else {
			__blk_mq_requeue_request(rq);

			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
				rq->errors = -EIO;
				blk_mq_end_request(rq, rq->errors);
				goto done;
			}
		}
	}

	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
		/*
		 * For a SYNC request, send it to the hardware immediately. For
		 * an ASYNC request, just ensure that we run it later on. The
		 * latter allows for merging opportunities and more efficient
		 * dispatching.
		 */
run_queue:
		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
	}
done:
	blk_mq_put_ctx(data.ctx);
}

/*
 * Single hardware queue variant. This will attempt to use any per-process
 * plug for merging and IO deferral.
 */
static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	unsigned int use_plug, request_count = 0;
	struct blk_map_ctx data;
	struct request *rq;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && !is_sync;

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (use_plug && !blk_queue_nomerges(q) &&
	    blk_attempt_plug_merge(q, bio, &request_count))
		return;

	rq = blk_mq_map_request(q, bio, &data);
	if (unlikely(!rq))
		return;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(data.ctx);
			return;
		}
	}

	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
		/*
		 * For a SYNC request, send it to the hardware immediately. For
		 * an ASYNC request, just ensure that we run it later on. The
		 * latter allows for merging opportunities and more efficient
		 * dispatching.
		 */
run_queue:
		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
	}

	blk_mq_put_ctx(data.ctx);
}

/*
 * Default mapping to a software queue, since we use one per CPU.
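 *
 * Drivers with no special queue-mapping requirements typically point their
 * blk_mq_ops->map_queue at this helper.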
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
		struct blk_mq_tags *tags, unsigned int hctx_idx)
{
	struct page *page;

	if (tags->rqs && set->ops->exit_request) {
		int i;

		for (i = 0; i < tags->nr_tags; i++) {
			if (!tags->rqs[i])
				continue;
			set->ops->exit_request(set->driver_data, tags->rqs[i],
						hctx_idx, i);
			tags->rqs[i] = NULL;
		}
	}

	while (!list_empty(&tags->page_list)) {
		page = list_first_entry(&tags->page_list, struct page, lru);
		list_del_init(&page->lru);
		__free_pages(page, page->private);
	}

	kfree(tags->rqs);

	blk_mq_free_tags(tags);
}

static size_t order_to_size(unsigned int order)
{
	return (size_t)PAGE_SIZE << order;
}

static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
		unsigned int hctx_idx)
{
	struct blk_mq_tags *tags;
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
				set->numa_node,
				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
	if (!tags)
		return NULL;

	INIT_LIST_HEAD(&tags->page_list);

	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
				 set->numa_node);
	if (!tags->rqs) {
		blk_mq_free_tags(tags);
		return NULL;
	}

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				cache_line_size());
	left = rq_size * set->queue_depth;

	for (i = 0; i < set->queue_depth; ) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(set->numa_node,
				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
				this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			goto fail;

		page->private = this_order;
		list_add_tail(&page->lru, &tags->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, set->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			tags->rqs[i] = p;
			if (set->ops->init_request) {
				if (set->ops->init_request(set->driver_data,
						tags->rqs[i], hctx_idx, i,
						set->numa_node)) {
					tags->rqs[i] = NULL;
					goto fail;
				}
			}

			p += rq_size;
			i++;
		}
	}

	return tags;

fail:
	blk_mq_free_rq_map(set, tags, hctx_idx);
	return NULL;
}

static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
{
	kfree(bitmap->map);
}

static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
{
	unsigned int bpw = 8, total, num_maps, i;

	bitmap->bits_per_word = bpw;

	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
					GFP_KERNEL, node);
	if (!bitmap->map)
		return -ENOMEM;

	total = nr_cpu_ids;
	for (i = 0; i < num_maps; i++) {
		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
		total -= bitmap->map[i].depth;
	}

	return 0;
}

static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(q, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		blk_mq_hctx_clear_pending(hctx, ctx);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return NOTIFY_OK;

	ctx = blk_mq_get_ctx(q);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);

	blk_mq_run_hw_queue(hctx, true);
	blk_mq_put_ctx(ctx);
	return NOTIFY_OK;
}

static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_tag_set *set = q->tag_set;

	if (set->tags[hctx->queue_num])
		return NOTIFY_OK;

	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
	if (!set->tags[hctx->queue_num])
		return NOTIFY_STOP;

	hctx->tags = set->tags[hctx->queue_num];
	return NOTIFY_OK;
}

static int blk_mq_hctx_notify(void *data, unsigned long action,
			      unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;

	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		return blk_mq_hctx_cpu_offline(hctx, cpu);
	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		return blk_mq_hctx_cpu_online(hctx, cpu);

	return NOTIFY_OK;
}

static void blk_mq_exit_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	unsigned flush_start_tag = set->queue_depth;

	blk_mq_tag_idle(hctx);

	if (set->ops->exit_request)
		set->ops->exit_request(set->driver_data,
				       hctx->fq->flush_rq, hctx_idx,
				       flush_start_tag + hctx_idx);

	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);

	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
	blk_free_flush_queue(hctx->fq);
	kfree(hctx->ctxs);
	blk_mq_free_bitmap(&hctx->ctx_map);
}

static void blk_mq_exit_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set, int nr_queue)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (i == nr_queue)
			break;
		blk_mq_exit_hctx(q, set, hctx, i);
	}
}

static void blk_mq_free_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		free_cpumask_var(hctx->cpumask);
}

static int blk_mq_init_hctx(struct request_queue *q,
		struct blk_mq_tag_set *set,
		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
	int node;
	unsigned flush_start_tag = set->queue_depth;

	node = hctx->numa_node;
	if (node == NUMA_NO_NODE)
		node = hctx->numa_node = set->numa_node;

	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
	spin_lock_init(&hctx->lock);
	INIT_LIST_HEAD(&hctx->dispatch);
	hctx->queue = q;
	hctx->queue_num = hctx_idx;
	hctx->flags = set->flags;

	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
					blk_mq_hctx_notify, hctx);
	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

	hctx->tags = set->tags[hctx_idx];

	/*
	 * Allocate space for all possible cpus to avoid allocation at
	 * runtime
	 */
	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
					GFP_KERNEL, node);
	if (!hctx->ctxs)
		goto unregister_cpu_notifier;

	if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
		goto free_ctxs;

	hctx->nr_ctx = 0;

	if (set->ops->init_hctx &&
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
		goto free_bitmap;

	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
	if (!hctx->fq)
		goto exit_hctx;

	if (set->ops->init_request &&
	    set->ops->init_request(set->driver_data,
				   hctx->fq->flush_rq, hctx_idx,
				   flush_start_tag + hctx_idx, node))
		goto free_fq;

	return 0;

 free_fq:
	kfree(hctx->fq);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
 free_bitmap:
	blk_mq_free_bitmap(&hctx->ctx_map);
 free_ctxs:
	kfree(hctx->ctxs);
 unregister_cpu_notifier:
	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);

	return -1;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_init_hctx(q, set, hctx, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	blk_mq_exit_hw_queues(q, set, i);

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		if (!cpu_online(i))
			continue;

		hctx = q->mq_ops->map_queue(q, i);

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		if (!cpu_online(i))
			continue;

		hctx = q->mq_ops->map_queue(q, i);
		cpumask_set_cpu(i, hctx->cpumask);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_ctxmap *map = &hctx->ctx_map;

		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			struct blk_mq_tag_set *set = q->tag_set;

			if (set->tags[i]) {
				blk_mq_free_rq_map(set, set->tags[i], i);
				set->tags[i] = NULL;
				hctx->tags = NULL;
			}
			continue;
		}

		/*
		 * Set the map size to the number of mapped software queues.
		 * This is more accurate and more efficient than looping
		 * over all possibly mapped software queues.
		 */
		map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);

		/*
		 * Initialize batch roundrobin counts
		 */
		hctx->next_cpu = cpumask_first(hctx->cpumask);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}

static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q;
	bool shared;
	int i;

	if (set->tag_list.next == set->tag_list.prev)
		shared = false;
	else
		shared = true;

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_freeze_queue(q);

		queue_for_each_hw_ctx(q, hctx, i) {
			if (shared)
				hctx->flags |= BLK_MQ_F_TAG_SHARED;
			else
				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
		}
		blk_mq_unfreeze_queue(q);
	}
}

static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	mutex_lock(&set->tag_list_lock);
	list_del_init(&q->tag_set_list);
	blk_mq_update_tag_set_depth(set);
	mutex_unlock(&set->tag_list_lock);
}

static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
				     struct request_queue *q)
{
	q->tag_set = set;

	mutex_lock(&set->tag_list_lock);
	list_add_tail(&q->tag_set_list, &set->tag_list);
	blk_mq_update_tag_set_depth(set);
	mutex_unlock(&set->tag_list_lock);
}

/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't have been introduced,
 * but we can't group ctx/kctx kobj without it.
 */
void blk_mq_release(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	/* hctx kobj stays in hctx */
	queue_for_each_hw_ctx(q, hctx, i)
		kfree(hctx);

	kfree(q->queue_hw_ctx);

	/* ctx kobj stays in queue_ctx */
	free_percpu(q->queue_ctx);
}

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
	if (!uninit_q)
		return ERR_PTR(-ENOMEM);

	q = blk_mq_init_allocated_queue(set, uninit_q);
	if (IS_ERR(q))
		blk_cleanup_queue(uninit_q);

	return q;
}
EXPORT_SYMBOL(blk_mq_init_queue);

struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx __percpu *ctx;
	unsigned int *map;
	int i;

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			set->numa_node);

	if (!hctxs)
		goto err_percpu;

	map = blk_mq_make_queue_map(set);
	if (!map)
		goto err_map;

	for (i = 0; i < set->nr_hw_queues; i++) {
		int node = blk_mq_hw_queue_to_node(map, i);

		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
					GFP_KERNEL, node);
		if (!hctxs[i])
			goto err_hctxs;

		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
						node))
			goto err_hctxs;

		atomic_set(&hctxs[i]->nr_active, 0);
		hctxs[i]->numa_node = node;
		hctxs[i]->queue_num = i;
	}

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
		goto err_hctxs;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = set->nr_hw_queues;
	q->mq_map = map;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = set->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	if (!(set->flags & BLK_MQ_F_SG_MERGE))
		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

	q->sg_reserved_size = INT_MAX;

	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	if (q->nr_hw_queues > 1)
		blk_queue_make_request(q, blk_mq_make_request);
	else
		blk_queue_make_request(q, blk_sq_make_request);

	/*
	 * Do this after blk_queue_make_request() overrides it...
	 */
	q->nr_requests = set->queue_depth;

	if (set->ops->complete)
		blk_queue_softirq_done(q, set->ops->complete);

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);

	if (blk_mq_init_hw_queues(q, set))
		goto err_hctxs;

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	blk_mq_add_queue_tag_set(set, q);

	blk_mq_map_swqueue(q);

	return q;

err_hctxs:
	kfree(map);
	for (i = 0; i < set->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		free_cpumask_var(hctxs[i]->cpumask);
		kfree(hctxs[i]);
	}
err_map:
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	blk_mq_del_queue_tag_set(q);

	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
	blk_mq_free_hw_queues(q, set);

	percpu_ref_exit(&q->mq_usage_counter);

	kfree(q->mq_map);

	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	WARN_ON_ONCE(!q->mq_freeze_depth);

	blk_mq_sysfs_unregister(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to new topology (this
	 * involves free and re-allocate memory, worthy doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_sysfs_register(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before new mappings are established, hotadded cpu might already
	 * start handling requests. This doesn't break anything as we map
	 * offline CPUs to first hardware queue. We will re-init the queue
	 * below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);

	/*
	 * We need to freeze and reinit all existing queues. Freezing
	 * involves synchronous wait for an RCU grace period and doing it
	 * one by one may take a long time. Start freezing all queues in
	 * one swoop and then wait for the completions so that freezing can
	 * take place in parallel.
	 */
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_start(q);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_wait(q);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_unfreeze_queue(q);

	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++) {
		set->tags[i] = blk_mq_init_rq_map(set, i);
		if (!set->tags[i])
			goto out_unwind;
	}

	return 0;

out_unwind:
	while (--i >= 0)
		blk_mq_free_rq_map(set, set->tags[i], i);

	return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
						depth, set->queue_depth);

	return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the set
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->ops->queue_rq || !set->ops->map_queue)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}

	set->tags = kmalloc_node(set->nr_hw_queues *
				 sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
	if (!set->tags)
		return -ENOMEM;

	if (blk_mq_alloc_rq_maps(set))
		goto enomem;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;
enomem:
	kfree(set->tags);
	set->tags = NULL;
	return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++) {
		if (set->tags[i])
			blk_mq_free_rq_map(set, set->tags[i], i);
	}

	kfree(set->tags);
	set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i, ret;

	if (!set || nr > set->queue_depth)
		return -EINVAL;

	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_tag_update_depth(hctx->tags, nr);
		if (ret)
			break;
	}

	if (!ret)
		q->nr_requests = nr;

	return ret;
}

void blk_mq_disable_hotplug(void)
{
	mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
	mutex_unlock(&all_q_mutex);
}

static int __init blk_mq_init(void)
{
	blk_mq_cpu_init();

	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);

	return 0;
}
subsys_initcall(blk_mq_init);
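
/*
 * Illustrative sketch (not compiled here) of the driver-side setup this
 * core serves, assuming a hypothetical driver "foo" that provides its own
 * struct blk_mq_ops with .queue_rq/.map_queue/.complete and a per-command
 * struct foo_cmd:
 *
 *	set->ops = &foo_mq_ops;
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 64;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct foo_cmd);
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q)) {
 *		blk_mq_free_tag_set(set);
 *		return PTR_ERR(q);
 *	}
 */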