1 #include <linux/kernel.h> 2 #include <linux/module.h> 3 #include <linux/backing-dev.h> 4 #include <linux/bio.h> 5 #include <linux/blkdev.h> 6 #include <linux/mm.h> 7 #include <linux/init.h> 8 #include <linux/slab.h> 9 #include <linux/workqueue.h> 10 #include <linux/smp.h> 11 #include <linux/llist.h> 12 #include <linux/list_sort.h> 13 #include <linux/cpu.h> 14 #include <linux/cache.h> 15 #include <linux/sched/sysctl.h> 16 #include <linux/delay.h> 17 18 #include <trace/events/block.h> 19 20 #include <linux/blk-mq.h> 21 #include "blk.h" 22 #include "blk-mq.h" 23 #include "blk-mq-tag.h" 24 25 static DEFINE_MUTEX(all_q_mutex); 26 static LIST_HEAD(all_q_list); 27 28 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 29 30 static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, 31 unsigned int cpu) 32 { 33 return per_cpu_ptr(q->queue_ctx, cpu); 34 } 35 36 /* 37 * This assumes per-cpu software queueing queues. They could be per-node 38 * as well, for instance. For now this is hardcoded as-is. Note that we don't 39 * care about preemption, since we know the ctx's are persistent. This does 40 * mean that we can't rely on ctx always matching the currently running CPU. 41 */ 42 static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) 43 { 44 return __blk_mq_get_ctx(q, get_cpu()); 45 } 46 47 static void blk_mq_put_ctx(struct blk_mq_ctx *ctx) 48 { 49 put_cpu(); 50 } 51 52 /* 53 * Check if any of the ctx's have pending work in this hardware queue 54 */ 55 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 56 { 57 unsigned int i; 58 59 for (i = 0; i < hctx->nr_ctx_map; i++) 60 if (hctx->ctx_map[i]) 61 return true; 62 63 return false; 64 } 65 66 /* 67 * Mark this ctx as having pending work in this hardware queue 68 */ 69 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 70 struct blk_mq_ctx *ctx) 71 { 72 if (!test_bit(ctx->index_hw, hctx->ctx_map)) 73 set_bit(ctx->index_hw, hctx->ctx_map); 74 } 75 76 static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp, 77 bool reserved) 78 { 79 struct request *rq; 80 unsigned int tag; 81 82 tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 83 if (tag != BLK_MQ_TAG_FAIL) { 84 rq = hctx->rqs[tag]; 85 rq->tag = tag; 86 87 return rq; 88 } 89 90 return NULL; 91 } 92 93 static int blk_mq_queue_enter(struct request_queue *q) 94 { 95 int ret; 96 97 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 98 smp_wmb(); 99 /* we have problems to freeze the queue if it's initializing */ 100 if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) 101 return 0; 102 103 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 104 105 spin_lock_irq(q->queue_lock); 106 ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq, 107 !blk_queue_bypass(q) || blk_queue_dying(q), 108 *q->queue_lock); 109 /* inc usage with lock hold to avoid freeze_queue runs here */ 110 if (!ret && !blk_queue_dying(q)) 111 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000); 112 else if (blk_queue_dying(q)) 113 ret = -ENODEV; 114 spin_unlock_irq(q->queue_lock); 115 116 return ret; 117 } 118 119 static void blk_mq_queue_exit(struct request_queue *q) 120 { 121 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000); 122 } 123 124 static void __blk_mq_drain_queue(struct request_queue *q) 125 { 126 while (true) { 127 s64 count; 128 129 spin_lock_irq(q->queue_lock); 130 count = percpu_counter_sum(&q->mq_usage_counter); 131 spin_unlock_irq(q->queue_lock); 132 133 if (count == 0) 134 break; 135 blk_mq_run_queues(q, false); 136 msleep(10); 137 } 138 } 139 140 /* 141 * Guarantee no request is in use, so we can change any data structure of 142 * the queue afterward. 143 */ 144 static void blk_mq_freeze_queue(struct request_queue *q) 145 { 146 bool drain; 147 148 spin_lock_irq(q->queue_lock); 149 drain = !q->bypass_depth++; 150 queue_flag_set(QUEUE_FLAG_BYPASS, q); 151 spin_unlock_irq(q->queue_lock); 152 153 if (drain) 154 __blk_mq_drain_queue(q); 155 } 156 157 void blk_mq_drain_queue(struct request_queue *q) 158 { 159 __blk_mq_drain_queue(q); 160 } 161 162 static void blk_mq_unfreeze_queue(struct request_queue *q) 163 { 164 bool wake = false; 165 166 spin_lock_irq(q->queue_lock); 167 if (!--q->bypass_depth) { 168 queue_flag_clear(QUEUE_FLAG_BYPASS, q); 169 wake = true; 170 } 171 WARN_ON_ONCE(q->bypass_depth < 0); 172 spin_unlock_irq(q->queue_lock); 173 if (wake) 174 wake_up_all(&q->mq_freeze_wq); 175 } 176 177 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 178 { 179 return blk_mq_has_free_tags(hctx->tags); 180 } 181 EXPORT_SYMBOL(blk_mq_can_queue); 182 183 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 184 struct request *rq, unsigned int rw_flags) 185 { 186 if (blk_queue_io_stat(q)) 187 rw_flags |= REQ_IO_STAT; 188 189 rq->mq_ctx = ctx; 190 rq->cmd_flags = rw_flags; 191 rq->start_time = jiffies; 192 set_start_time_ns(rq); 193 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 194 } 195 196 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 197 gfp_t gfp, bool reserved) 198 { 199 return blk_mq_alloc_rq(hctx, gfp, reserved); 200 } 201 202 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 203 int rw, gfp_t gfp, 204 bool reserved) 205 { 206 struct request *rq; 207 208 do { 209 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 210 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 211 212 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 213 if (rq) { 214 blk_mq_rq_ctx_init(q, ctx, rq, rw); 215 break; 216 } 217 218 blk_mq_put_ctx(ctx); 219 if (!(gfp & __GFP_WAIT)) 220 break; 221 222 __blk_mq_run_hw_queue(hctx); 223 blk_mq_wait_for_tags(hctx->tags); 224 } while (1); 225 226 return rq; 227 } 228 229 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 230 gfp_t gfp, bool reserved) 231 { 232 struct request *rq; 233 234 if (blk_mq_queue_enter(q)) 235 return NULL; 236 237 rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); 238 if (rq) 239 blk_mq_put_ctx(rq->mq_ctx); 240 return rq; 241 } 242 243 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, 244 gfp_t gfp) 245 { 246 struct request *rq; 247 248 if (blk_mq_queue_enter(q)) 249 return NULL; 250 251 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 252 if (rq) 253 blk_mq_put_ctx(rq->mq_ctx); 254 return rq; 255 } 256 EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 257 258 /* 259 * Re-init and set pdu, if we have it 260 */ 261 static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) 262 { 263 blk_rq_init(hctx->queue, rq); 264 265 if (hctx->cmd_size) 266 rq->special = blk_mq_rq_to_pdu(rq); 267 } 268 269 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 270 struct blk_mq_ctx *ctx, struct request *rq) 271 { 272 const int tag = rq->tag; 273 struct request_queue *q = rq->q; 274 275 blk_mq_rq_init(hctx, rq); 276 blk_mq_put_tag(hctx->tags, tag); 277 278 blk_mq_queue_exit(q); 279 } 280 281 void blk_mq_free_request(struct request *rq) 282 { 283 struct blk_mq_ctx *ctx = rq->mq_ctx; 284 struct blk_mq_hw_ctx *hctx; 285 struct request_queue *q = rq->q; 286 287 ctx->rq_completed[rq_is_sync(rq)]++; 288 289 hctx = q->mq_ops->map_queue(q, ctx->cpu); 290 __blk_mq_free_request(hctx, ctx, rq); 291 } 292 293 static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) 294 { 295 if (error) 296 clear_bit(BIO_UPTODATE, &bio->bi_flags); 297 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 298 error = -EIO; 299 300 if (unlikely(rq->cmd_flags & REQ_QUIET)) 301 set_bit(BIO_QUIET, &bio->bi_flags); 302 303 /* don't actually finish bio if it's part of flush sequence */ 304 if (!(rq->cmd_flags & REQ_FLUSH_SEQ)) 305 bio_endio(bio, error); 306 } 307 308 void blk_mq_complete_request(struct request *rq, int error) 309 { 310 struct bio *bio = rq->bio; 311 unsigned int bytes = 0; 312 313 trace_block_rq_complete(rq->q, rq); 314 315 while (bio) { 316 struct bio *next = bio->bi_next; 317 318 bio->bi_next = NULL; 319 bytes += bio->bi_iter.bi_size; 320 blk_mq_bio_endio(rq, bio, error); 321 bio = next; 322 } 323 324 blk_account_io_completion(rq, bytes); 325 326 blk_account_io_done(rq); 327 328 if (rq->end_io) 329 rq->end_io(rq, error); 330 else 331 blk_mq_free_request(rq); 332 } 333 334 void __blk_mq_end_io(struct request *rq, int error) 335 { 336 if (!blk_mark_rq_complete(rq)) 337 blk_mq_complete_request(rq, error); 338 } 339 340 static void blk_mq_end_io_remote(void *data) 341 { 342 struct request *rq = data; 343 344 __blk_mq_end_io(rq, rq->errors); 345 } 346 347 /* 348 * End IO on this request on a multiqueue enabled driver. We'll either do 349 * it directly inline, or punt to a local IPI handler on the matching 350 * remote CPU. 351 */ 352 void blk_mq_end_io(struct request *rq, int error) 353 { 354 struct blk_mq_ctx *ctx = rq->mq_ctx; 355 int cpu; 356 357 if (!ctx->ipi_redirect) 358 return __blk_mq_end_io(rq, error); 359 360 cpu = get_cpu(); 361 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 362 rq->errors = error; 363 rq->csd.func = blk_mq_end_io_remote; 364 rq->csd.info = rq; 365 rq->csd.flags = 0; 366 __smp_call_function_single(ctx->cpu, &rq->csd, 0); 367 } else { 368 __blk_mq_end_io(rq, error); 369 } 370 put_cpu(); 371 } 372 EXPORT_SYMBOL(blk_mq_end_io); 373 374 static void blk_mq_start_request(struct request *rq) 375 { 376 struct request_queue *q = rq->q; 377 378 trace_block_rq_issue(q, rq); 379 380 /* 381 * Just mark start time and set the started bit. Due to memory 382 * ordering, we know we'll see the correct deadline as long as 383 * REQ_ATOMIC_STARTED is seen. 384 */ 385 rq->deadline = jiffies + q->rq_timeout; 386 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 387 } 388 389 static void blk_mq_requeue_request(struct request *rq) 390 { 391 struct request_queue *q = rq->q; 392 393 trace_block_rq_requeue(q, rq); 394 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 395 } 396 397 struct blk_mq_timeout_data { 398 struct blk_mq_hw_ctx *hctx; 399 unsigned long *next; 400 unsigned int *next_set; 401 }; 402 403 static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) 404 { 405 struct blk_mq_timeout_data *data = __data; 406 struct blk_mq_hw_ctx *hctx = data->hctx; 407 unsigned int tag; 408 409 /* It may not be in flight yet (this is where 410 * the REQ_ATOMIC_STARTED flag comes in). The requests are 411 * statically allocated, so we know it's always safe to access the 412 * memory associated with a bit offset into ->rqs[]. 413 */ 414 tag = 0; 415 do { 416 struct request *rq; 417 418 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 419 if (tag >= hctx->queue_depth) 420 break; 421 422 rq = hctx->rqs[tag++]; 423 424 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 425 continue; 426 427 blk_rq_check_expired(rq, data->next, data->next_set); 428 } while (1); 429 } 430 431 static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, 432 unsigned long *next, 433 unsigned int *next_set) 434 { 435 struct blk_mq_timeout_data data = { 436 .hctx = hctx, 437 .next = next, 438 .next_set = next_set, 439 }; 440 441 /* 442 * Ask the tagging code to iterate busy requests, so we can 443 * check them for timeout. 444 */ 445 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 446 } 447 448 static void blk_mq_rq_timer(unsigned long data) 449 { 450 struct request_queue *q = (struct request_queue *) data; 451 struct blk_mq_hw_ctx *hctx; 452 unsigned long next = 0; 453 int i, next_set = 0; 454 455 queue_for_each_hw_ctx(q, hctx, i) 456 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 457 458 if (next_set) 459 mod_timer(&q->timeout, round_jiffies_up(next)); 460 } 461 462 /* 463 * Reverse check our software queue for entries that we could potentially 464 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 465 * too much time checking for merges. 466 */ 467 static bool blk_mq_attempt_merge(struct request_queue *q, 468 struct blk_mq_ctx *ctx, struct bio *bio) 469 { 470 struct request *rq; 471 int checked = 8; 472 473 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 474 int el_ret; 475 476 if (!checked--) 477 break; 478 479 if (!blk_rq_merge_ok(rq, bio)) 480 continue; 481 482 el_ret = blk_try_merge(rq, bio); 483 if (el_ret == ELEVATOR_BACK_MERGE) { 484 if (bio_attempt_back_merge(q, rq, bio)) { 485 ctx->rq_merged++; 486 return true; 487 } 488 break; 489 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 490 if (bio_attempt_front_merge(q, rq, bio)) { 491 ctx->rq_merged++; 492 return true; 493 } 494 break; 495 } 496 } 497 498 return false; 499 } 500 501 void blk_mq_add_timer(struct request *rq) 502 { 503 __blk_add_timer(rq, NULL); 504 } 505 506 /* 507 * Run this hardware queue, pulling any software queues mapped to it in. 508 * Note that this function currently has various problems around ordering 509 * of IO. In particular, we'd like FIFO behaviour on handling existing 510 * items on the hctx->dispatch list. Ignore that for now. 511 */ 512 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 513 { 514 struct request_queue *q = hctx->queue; 515 struct blk_mq_ctx *ctx; 516 struct request *rq; 517 LIST_HEAD(rq_list); 518 int bit, queued; 519 520 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 521 return; 522 523 hctx->run++; 524 525 /* 526 * Touch any software queue that has pending entries. 527 */ 528 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 529 clear_bit(bit, hctx->ctx_map); 530 ctx = hctx->ctxs[bit]; 531 BUG_ON(bit != ctx->index_hw); 532 533 spin_lock(&ctx->lock); 534 list_splice_tail_init(&ctx->rq_list, &rq_list); 535 spin_unlock(&ctx->lock); 536 } 537 538 /* 539 * If we have previous entries on our dispatch list, grab them 540 * and stuff them at the front for more fair dispatch. 541 */ 542 if (!list_empty_careful(&hctx->dispatch)) { 543 spin_lock(&hctx->lock); 544 if (!list_empty(&hctx->dispatch)) 545 list_splice_init(&hctx->dispatch, &rq_list); 546 spin_unlock(&hctx->lock); 547 } 548 549 /* 550 * Delete and return all entries from our dispatch list 551 */ 552 queued = 0; 553 554 /* 555 * Now process all the entries, sending them to the driver. 556 */ 557 while (!list_empty(&rq_list)) { 558 int ret; 559 560 rq = list_first_entry(&rq_list, struct request, queuelist); 561 list_del_init(&rq->queuelist); 562 blk_mq_start_request(rq); 563 564 /* 565 * Last request in the series. Flag it as such, this 566 * enables drivers to know when IO should be kicked off, 567 * if they don't do it on a per-request basis. 568 * 569 * Note: the flag isn't the only condition drivers 570 * should do kick off. If drive is busy, the last 571 * request might not have the bit set. 572 */ 573 if (list_empty(&rq_list)) 574 rq->cmd_flags |= REQ_END; 575 576 ret = q->mq_ops->queue_rq(hctx, rq); 577 switch (ret) { 578 case BLK_MQ_RQ_QUEUE_OK: 579 queued++; 580 continue; 581 case BLK_MQ_RQ_QUEUE_BUSY: 582 /* 583 * FIXME: we should have a mechanism to stop the queue 584 * like blk_stop_queue, otherwise we will waste cpu 585 * time 586 */ 587 list_add(&rq->queuelist, &rq_list); 588 blk_mq_requeue_request(rq); 589 break; 590 default: 591 pr_err("blk-mq: bad return on queue: %d\n", ret); 592 rq->errors = -EIO; 593 case BLK_MQ_RQ_QUEUE_ERROR: 594 blk_mq_end_io(rq, rq->errors); 595 break; 596 } 597 598 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 599 break; 600 } 601 602 if (!queued) 603 hctx->dispatched[0]++; 604 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 605 hctx->dispatched[ilog2(queued) + 1]++; 606 607 /* 608 * Any items that need requeuing? Stuff them into hctx->dispatch, 609 * that is where we will continue on next queue run. 610 */ 611 if (!list_empty(&rq_list)) { 612 spin_lock(&hctx->lock); 613 list_splice(&rq_list, &hctx->dispatch); 614 spin_unlock(&hctx->lock); 615 } 616 } 617 618 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 619 { 620 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags))) 621 return; 622 623 if (!async) 624 __blk_mq_run_hw_queue(hctx); 625 else { 626 struct request_queue *q = hctx->queue; 627 628 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 629 } 630 } 631 632 void blk_mq_run_queues(struct request_queue *q, bool async) 633 { 634 struct blk_mq_hw_ctx *hctx; 635 int i; 636 637 queue_for_each_hw_ctx(q, hctx, i) { 638 if ((!blk_mq_hctx_has_pending(hctx) && 639 list_empty_careful(&hctx->dispatch)) || 640 test_bit(BLK_MQ_S_STOPPED, &hctx->flags)) 641 continue; 642 643 blk_mq_run_hw_queue(hctx, async); 644 } 645 } 646 EXPORT_SYMBOL(blk_mq_run_queues); 647 648 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 649 { 650 cancel_delayed_work(&hctx->delayed_work); 651 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 652 } 653 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 654 655 void blk_mq_stop_hw_queues(struct request_queue *q) 656 { 657 struct blk_mq_hw_ctx *hctx; 658 int i; 659 660 queue_for_each_hw_ctx(q, hctx, i) 661 blk_mq_stop_hw_queue(hctx); 662 } 663 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 664 665 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 666 { 667 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 668 __blk_mq_run_hw_queue(hctx); 669 } 670 EXPORT_SYMBOL(blk_mq_start_hw_queue); 671 672 void blk_mq_start_stopped_hw_queues(struct request_queue *q) 673 { 674 struct blk_mq_hw_ctx *hctx; 675 int i; 676 677 queue_for_each_hw_ctx(q, hctx, i) { 678 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 679 continue; 680 681 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 682 blk_mq_run_hw_queue(hctx, true); 683 } 684 } 685 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 686 687 static void blk_mq_work_fn(struct work_struct *work) 688 { 689 struct blk_mq_hw_ctx *hctx; 690 691 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 692 __blk_mq_run_hw_queue(hctx); 693 } 694 695 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 696 struct request *rq) 697 { 698 struct blk_mq_ctx *ctx = rq->mq_ctx; 699 700 trace_block_rq_insert(hctx->queue, rq); 701 702 list_add_tail(&rq->queuelist, &ctx->rq_list); 703 blk_mq_hctx_mark_pending(hctx, ctx); 704 705 /* 706 * We do this early, to ensure we are on the right CPU. 707 */ 708 blk_mq_add_timer(rq); 709 } 710 711 void blk_mq_insert_request(struct request_queue *q, struct request *rq, 712 bool run_queue) 713 { 714 struct blk_mq_hw_ctx *hctx; 715 struct blk_mq_ctx *ctx, *current_ctx; 716 717 ctx = rq->mq_ctx; 718 hctx = q->mq_ops->map_queue(q, ctx->cpu); 719 720 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) { 721 blk_insert_flush(rq); 722 } else { 723 current_ctx = blk_mq_get_ctx(q); 724 725 if (!cpu_online(ctx->cpu)) { 726 ctx = current_ctx; 727 hctx = q->mq_ops->map_queue(q, ctx->cpu); 728 rq->mq_ctx = ctx; 729 } 730 spin_lock(&ctx->lock); 731 __blk_mq_insert_request(hctx, rq); 732 spin_unlock(&ctx->lock); 733 734 blk_mq_put_ctx(current_ctx); 735 } 736 737 if (run_queue) 738 __blk_mq_run_hw_queue(hctx); 739 } 740 EXPORT_SYMBOL(blk_mq_insert_request); 741 742 /* 743 * This is a special version of blk_mq_insert_request to bypass FLUSH request 744 * check. Should only be used internally. 745 */ 746 void blk_mq_run_request(struct request *rq, bool run_queue, bool async) 747 { 748 struct request_queue *q = rq->q; 749 struct blk_mq_hw_ctx *hctx; 750 struct blk_mq_ctx *ctx, *current_ctx; 751 752 current_ctx = blk_mq_get_ctx(q); 753 754 ctx = rq->mq_ctx; 755 if (!cpu_online(ctx->cpu)) { 756 ctx = current_ctx; 757 rq->mq_ctx = ctx; 758 } 759 hctx = q->mq_ops->map_queue(q, ctx->cpu); 760 761 /* ctx->cpu might be offline */ 762 spin_lock(&ctx->lock); 763 __blk_mq_insert_request(hctx, rq); 764 spin_unlock(&ctx->lock); 765 766 blk_mq_put_ctx(current_ctx); 767 768 if (run_queue) 769 blk_mq_run_hw_queue(hctx, async); 770 } 771 772 static void blk_mq_insert_requests(struct request_queue *q, 773 struct blk_mq_ctx *ctx, 774 struct list_head *list, 775 int depth, 776 bool from_schedule) 777 778 { 779 struct blk_mq_hw_ctx *hctx; 780 struct blk_mq_ctx *current_ctx; 781 782 trace_block_unplug(q, depth, !from_schedule); 783 784 current_ctx = blk_mq_get_ctx(q); 785 786 if (!cpu_online(ctx->cpu)) 787 ctx = current_ctx; 788 hctx = q->mq_ops->map_queue(q, ctx->cpu); 789 790 /* 791 * preemption doesn't flush plug list, so it's possible ctx->cpu is 792 * offline now 793 */ 794 spin_lock(&ctx->lock); 795 while (!list_empty(list)) { 796 struct request *rq; 797 798 rq = list_first_entry(list, struct request, queuelist); 799 list_del_init(&rq->queuelist); 800 rq->mq_ctx = ctx; 801 __blk_mq_insert_request(hctx, rq); 802 } 803 spin_unlock(&ctx->lock); 804 805 blk_mq_put_ctx(current_ctx); 806 807 blk_mq_run_hw_queue(hctx, from_schedule); 808 } 809 810 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 811 { 812 struct request *rqa = container_of(a, struct request, queuelist); 813 struct request *rqb = container_of(b, struct request, queuelist); 814 815 return !(rqa->mq_ctx < rqb->mq_ctx || 816 (rqa->mq_ctx == rqb->mq_ctx && 817 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 818 } 819 820 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 821 { 822 struct blk_mq_ctx *this_ctx; 823 struct request_queue *this_q; 824 struct request *rq; 825 LIST_HEAD(list); 826 LIST_HEAD(ctx_list); 827 unsigned int depth; 828 829 list_splice_init(&plug->mq_list, &list); 830 831 list_sort(NULL, &list, plug_ctx_cmp); 832 833 this_q = NULL; 834 this_ctx = NULL; 835 depth = 0; 836 837 while (!list_empty(&list)) { 838 rq = list_entry_rq(list.next); 839 list_del_init(&rq->queuelist); 840 BUG_ON(!rq->q); 841 if (rq->mq_ctx != this_ctx) { 842 if (this_ctx) { 843 blk_mq_insert_requests(this_q, this_ctx, 844 &ctx_list, depth, 845 from_schedule); 846 } 847 848 this_ctx = rq->mq_ctx; 849 this_q = rq->q; 850 depth = 0; 851 } 852 853 depth++; 854 list_add_tail(&rq->queuelist, &ctx_list); 855 } 856 857 /* 858 * If 'this_ctx' is set, we know we have entries to complete 859 * on 'ctx_list'. Do those. 860 */ 861 if (this_ctx) { 862 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 863 from_schedule); 864 } 865 } 866 867 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 868 { 869 init_request_from_bio(rq, bio); 870 blk_account_io_start(rq, 1); 871 } 872 873 static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 874 { 875 struct blk_mq_hw_ctx *hctx; 876 struct blk_mq_ctx *ctx; 877 const int is_sync = rw_is_sync(bio->bi_rw); 878 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 879 int rw = bio_data_dir(bio); 880 struct request *rq; 881 unsigned int use_plug, request_count = 0; 882 883 /* 884 * If we have multiple hardware queues, just go directly to 885 * one of those for sync IO. 886 */ 887 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 888 889 blk_queue_bounce(q, &bio); 890 891 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 892 return; 893 894 if (blk_mq_queue_enter(q)) { 895 bio_endio(bio, -EIO); 896 return; 897 } 898 899 ctx = blk_mq_get_ctx(q); 900 hctx = q->mq_ops->map_queue(q, ctx->cpu); 901 902 trace_block_getrq(q, bio, rw); 903 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); 904 if (likely(rq)) 905 blk_mq_rq_ctx_init(q, ctx, rq, rw); 906 else { 907 blk_mq_put_ctx(ctx); 908 trace_block_sleeprq(q, bio, rw); 909 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, 910 false); 911 ctx = rq->mq_ctx; 912 hctx = q->mq_ops->map_queue(q, ctx->cpu); 913 } 914 915 hctx->queued++; 916 917 if (unlikely(is_flush_fua)) { 918 blk_mq_bio_to_request(rq, bio); 919 blk_mq_put_ctx(ctx); 920 blk_insert_flush(rq); 921 goto run_queue; 922 } 923 924 /* 925 * A task plug currently exists. Since this is completely lockless, 926 * utilize that to temporarily store requests until the task is 927 * either done or scheduled away. 928 */ 929 if (use_plug) { 930 struct blk_plug *plug = current->plug; 931 932 if (plug) { 933 blk_mq_bio_to_request(rq, bio); 934 if (list_empty(&plug->mq_list)) 935 trace_block_plug(q); 936 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 937 blk_flush_plug_list(plug, false); 938 trace_block_plug(q); 939 } 940 list_add_tail(&rq->queuelist, &plug->mq_list); 941 blk_mq_put_ctx(ctx); 942 return; 943 } 944 } 945 946 spin_lock(&ctx->lock); 947 948 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 949 blk_mq_attempt_merge(q, ctx, bio)) 950 __blk_mq_free_request(hctx, ctx, rq); 951 else { 952 blk_mq_bio_to_request(rq, bio); 953 __blk_mq_insert_request(hctx, rq); 954 } 955 956 spin_unlock(&ctx->lock); 957 blk_mq_put_ctx(ctx); 958 959 /* 960 * For a SYNC request, send it to the hardware immediately. For an 961 * ASYNC request, just ensure that we run it later on. The latter 962 * allows for merging opportunities and more efficient dispatching. 963 */ 964 run_queue: 965 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); 966 } 967 968 /* 969 * Default mapping to a software queue, since we use one per CPU. 970 */ 971 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 972 { 973 return q->queue_hw_ctx[q->mq_map[cpu]]; 974 } 975 EXPORT_SYMBOL(blk_mq_map_queue); 976 977 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 978 unsigned int hctx_index) 979 { 980 return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 981 GFP_KERNEL | __GFP_ZERO, reg->numa_node); 982 } 983 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); 984 985 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 986 unsigned int hctx_index) 987 { 988 kfree(hctx); 989 } 990 EXPORT_SYMBOL(blk_mq_free_single_hw_queue); 991 992 static void blk_mq_hctx_notify(void *data, unsigned long action, 993 unsigned int cpu) 994 { 995 struct blk_mq_hw_ctx *hctx = data; 996 struct blk_mq_ctx *ctx; 997 LIST_HEAD(tmp); 998 999 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1000 return; 1001 1002 /* 1003 * Move ctx entries to new CPU, if this one is going away. 1004 */ 1005 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1006 1007 spin_lock(&ctx->lock); 1008 if (!list_empty(&ctx->rq_list)) { 1009 list_splice_init(&ctx->rq_list, &tmp); 1010 clear_bit(ctx->index_hw, hctx->ctx_map); 1011 } 1012 spin_unlock(&ctx->lock); 1013 1014 if (list_empty(&tmp)) 1015 return; 1016 1017 ctx = blk_mq_get_ctx(hctx->queue); 1018 spin_lock(&ctx->lock); 1019 1020 while (!list_empty(&tmp)) { 1021 struct request *rq; 1022 1023 rq = list_first_entry(&tmp, struct request, queuelist); 1024 rq->mq_ctx = ctx; 1025 list_move_tail(&rq->queuelist, &ctx->rq_list); 1026 } 1027 1028 blk_mq_hctx_mark_pending(hctx, ctx); 1029 1030 spin_unlock(&ctx->lock); 1031 blk_mq_put_ctx(ctx); 1032 } 1033 1034 static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1035 void (*init)(void *, struct blk_mq_hw_ctx *, 1036 struct request *, unsigned int), 1037 void *data) 1038 { 1039 unsigned int i; 1040 1041 for (i = 0; i < hctx->queue_depth; i++) { 1042 struct request *rq = hctx->rqs[i]; 1043 1044 init(data, hctx, rq, i); 1045 } 1046 } 1047 1048 void blk_mq_init_commands(struct request_queue *q, 1049 void (*init)(void *, struct blk_mq_hw_ctx *, 1050 struct request *, unsigned int), 1051 void *data) 1052 { 1053 struct blk_mq_hw_ctx *hctx; 1054 unsigned int i; 1055 1056 queue_for_each_hw_ctx(q, hctx, i) 1057 blk_mq_init_hw_commands(hctx, init, data); 1058 } 1059 EXPORT_SYMBOL(blk_mq_init_commands); 1060 1061 static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1062 { 1063 struct page *page; 1064 1065 while (!list_empty(&hctx->page_list)) { 1066 page = list_first_entry(&hctx->page_list, struct page, lru); 1067 list_del_init(&page->lru); 1068 __free_pages(page, page->private); 1069 } 1070 1071 kfree(hctx->rqs); 1072 1073 if (hctx->tags) 1074 blk_mq_free_tags(hctx->tags); 1075 } 1076 1077 static size_t order_to_size(unsigned int order) 1078 { 1079 size_t ret = PAGE_SIZE; 1080 1081 while (order--) 1082 ret *= 2; 1083 1084 return ret; 1085 } 1086 1087 static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1088 unsigned int reserved_tags, int node) 1089 { 1090 unsigned int i, j, entries_per_page, max_order = 4; 1091 size_t rq_size, left; 1092 1093 INIT_LIST_HEAD(&hctx->page_list); 1094 1095 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), 1096 GFP_KERNEL, node); 1097 if (!hctx->rqs) 1098 return -ENOMEM; 1099 1100 /* 1101 * rq_size is the size of the request plus driver payload, rounded 1102 * to the cacheline size 1103 */ 1104 rq_size = round_up(sizeof(struct request) + hctx->cmd_size, 1105 cache_line_size()); 1106 left = rq_size * hctx->queue_depth; 1107 1108 for (i = 0; i < hctx->queue_depth;) { 1109 int this_order = max_order; 1110 struct page *page; 1111 int to_do; 1112 void *p; 1113 1114 while (left < order_to_size(this_order - 1) && this_order) 1115 this_order--; 1116 1117 do { 1118 page = alloc_pages_node(node, GFP_KERNEL, this_order); 1119 if (page) 1120 break; 1121 if (!this_order--) 1122 break; 1123 if (order_to_size(this_order) < rq_size) 1124 break; 1125 } while (1); 1126 1127 if (!page) 1128 break; 1129 1130 page->private = this_order; 1131 list_add_tail(&page->lru, &hctx->page_list); 1132 1133 p = page_address(page); 1134 entries_per_page = order_to_size(this_order) / rq_size; 1135 to_do = min(entries_per_page, hctx->queue_depth - i); 1136 left -= to_do * rq_size; 1137 for (j = 0; j < to_do; j++) { 1138 hctx->rqs[i] = p; 1139 blk_mq_rq_init(hctx, hctx->rqs[i]); 1140 p += rq_size; 1141 i++; 1142 } 1143 } 1144 1145 if (i < (reserved_tags + BLK_MQ_TAG_MIN)) 1146 goto err_rq_map; 1147 else if (i != hctx->queue_depth) { 1148 hctx->queue_depth = i; 1149 pr_warn("%s: queue depth set to %u because of low memory\n", 1150 __func__, i); 1151 } 1152 1153 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1154 if (!hctx->tags) { 1155 err_rq_map: 1156 blk_mq_free_rq_map(hctx); 1157 return -ENOMEM; 1158 } 1159 1160 return 0; 1161 } 1162 1163 static int blk_mq_init_hw_queues(struct request_queue *q, 1164 struct blk_mq_reg *reg, void *driver_data) 1165 { 1166 struct blk_mq_hw_ctx *hctx; 1167 unsigned int i, j; 1168 1169 /* 1170 * Initialize hardware queues 1171 */ 1172 queue_for_each_hw_ctx(q, hctx, i) { 1173 unsigned int num_maps; 1174 int node; 1175 1176 node = hctx->numa_node; 1177 if (node == NUMA_NO_NODE) 1178 node = hctx->numa_node = reg->numa_node; 1179 1180 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1181 spin_lock_init(&hctx->lock); 1182 INIT_LIST_HEAD(&hctx->dispatch); 1183 hctx->queue = q; 1184 hctx->queue_num = i; 1185 hctx->flags = reg->flags; 1186 hctx->queue_depth = reg->queue_depth; 1187 hctx->cmd_size = reg->cmd_size; 1188 1189 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1190 blk_mq_hctx_notify, hctx); 1191 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1192 1193 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1194 break; 1195 1196 /* 1197 * Allocate space for all possible cpus to avoid allocation in 1198 * runtime 1199 */ 1200 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1201 GFP_KERNEL, node); 1202 if (!hctx->ctxs) 1203 break; 1204 1205 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1206 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), 1207 GFP_KERNEL, node); 1208 if (!hctx->ctx_map) 1209 break; 1210 1211 hctx->nr_ctx_map = num_maps; 1212 hctx->nr_ctx = 0; 1213 1214 if (reg->ops->init_hctx && 1215 reg->ops->init_hctx(hctx, driver_data, i)) 1216 break; 1217 } 1218 1219 if (i == q->nr_hw_queues) 1220 return 0; 1221 1222 /* 1223 * Init failed 1224 */ 1225 queue_for_each_hw_ctx(q, hctx, j) { 1226 if (i == j) 1227 break; 1228 1229 if (reg->ops->exit_hctx) 1230 reg->ops->exit_hctx(hctx, j); 1231 1232 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1233 blk_mq_free_rq_map(hctx); 1234 kfree(hctx->ctxs); 1235 } 1236 1237 return 1; 1238 } 1239 1240 static void blk_mq_init_cpu_queues(struct request_queue *q, 1241 unsigned int nr_hw_queues) 1242 { 1243 unsigned int i; 1244 1245 for_each_possible_cpu(i) { 1246 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1247 struct blk_mq_hw_ctx *hctx; 1248 1249 memset(__ctx, 0, sizeof(*__ctx)); 1250 __ctx->cpu = i; 1251 spin_lock_init(&__ctx->lock); 1252 INIT_LIST_HEAD(&__ctx->rq_list); 1253 __ctx->queue = q; 1254 1255 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1256 hctx = q->mq_ops->map_queue(q, i); 1257 hctx->nr_ctx++; 1258 1259 if (!cpu_online(i)) 1260 continue; 1261 1262 /* 1263 * Set local node, IFF we have more than one hw queue. If 1264 * not, we remain on the home node of the device 1265 */ 1266 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1267 hctx->numa_node = cpu_to_node(i); 1268 } 1269 } 1270 1271 static void blk_mq_map_swqueue(struct request_queue *q) 1272 { 1273 unsigned int i; 1274 struct blk_mq_hw_ctx *hctx; 1275 struct blk_mq_ctx *ctx; 1276 1277 queue_for_each_hw_ctx(q, hctx, i) { 1278 hctx->nr_ctx = 0; 1279 } 1280 1281 /* 1282 * Map software to hardware queues 1283 */ 1284 queue_for_each_ctx(q, ctx, i) { 1285 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1286 hctx = q->mq_ops->map_queue(q, i); 1287 ctx->index_hw = hctx->nr_ctx; 1288 hctx->ctxs[hctx->nr_ctx++] = ctx; 1289 } 1290 } 1291 1292 struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1293 void *driver_data) 1294 { 1295 struct blk_mq_hw_ctx **hctxs; 1296 struct blk_mq_ctx *ctx; 1297 struct request_queue *q; 1298 int i; 1299 1300 if (!reg->nr_hw_queues || 1301 !reg->ops->queue_rq || !reg->ops->map_queue || 1302 !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1303 return ERR_PTR(-EINVAL); 1304 1305 if (!reg->queue_depth) 1306 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1307 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1308 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1309 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1310 } 1311 1312 /* 1313 * Set aside a tag for flush requests. It will only be used while 1314 * another flush request is in progress but outside the driver. 1315 * 1316 * TODO: only allocate if flushes are supported 1317 */ 1318 reg->queue_depth++; 1319 reg->reserved_tags++; 1320 1321 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1322 return ERR_PTR(-EINVAL); 1323 1324 ctx = alloc_percpu(struct blk_mq_ctx); 1325 if (!ctx) 1326 return ERR_PTR(-ENOMEM); 1327 1328 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1329 reg->numa_node); 1330 1331 if (!hctxs) 1332 goto err_percpu; 1333 1334 for (i = 0; i < reg->nr_hw_queues; i++) { 1335 hctxs[i] = reg->ops->alloc_hctx(reg, i); 1336 if (!hctxs[i]) 1337 goto err_hctxs; 1338 1339 hctxs[i]->numa_node = NUMA_NO_NODE; 1340 hctxs[i]->queue_num = i; 1341 } 1342 1343 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1344 if (!q) 1345 goto err_hctxs; 1346 1347 q->mq_map = blk_mq_make_queue_map(reg); 1348 if (!q->mq_map) 1349 goto err_map; 1350 1351 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1352 blk_queue_rq_timeout(q, 30000); 1353 1354 q->nr_queues = nr_cpu_ids; 1355 q->nr_hw_queues = reg->nr_hw_queues; 1356 1357 q->queue_ctx = ctx; 1358 q->queue_hw_ctx = hctxs; 1359 1360 q->mq_ops = reg->ops; 1361 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1362 1363 blk_queue_make_request(q, blk_mq_make_request); 1364 blk_queue_rq_timed_out(q, reg->ops->timeout); 1365 if (reg->timeout) 1366 blk_queue_rq_timeout(q, reg->timeout); 1367 1368 blk_mq_init_flush(q); 1369 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1370 1371 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1372 goto err_hw; 1373 1374 blk_mq_map_swqueue(q); 1375 1376 mutex_lock(&all_q_mutex); 1377 list_add_tail(&q->all_q_node, &all_q_list); 1378 mutex_unlock(&all_q_mutex); 1379 1380 return q; 1381 err_hw: 1382 kfree(q->mq_map); 1383 err_map: 1384 blk_cleanup_queue(q); 1385 err_hctxs: 1386 for (i = 0; i < reg->nr_hw_queues; i++) { 1387 if (!hctxs[i]) 1388 break; 1389 reg->ops->free_hctx(hctxs[i], i); 1390 } 1391 kfree(hctxs); 1392 err_percpu: 1393 free_percpu(ctx); 1394 return ERR_PTR(-ENOMEM); 1395 } 1396 EXPORT_SYMBOL(blk_mq_init_queue); 1397 1398 void blk_mq_free_queue(struct request_queue *q) 1399 { 1400 struct blk_mq_hw_ctx *hctx; 1401 int i; 1402 1403 queue_for_each_hw_ctx(q, hctx, i) { 1404 kfree(hctx->ctx_map); 1405 kfree(hctx->ctxs); 1406 blk_mq_free_rq_map(hctx); 1407 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1408 if (q->mq_ops->exit_hctx) 1409 q->mq_ops->exit_hctx(hctx, i); 1410 q->mq_ops->free_hctx(hctx, i); 1411 } 1412 1413 free_percpu(q->queue_ctx); 1414 kfree(q->queue_hw_ctx); 1415 kfree(q->mq_map); 1416 1417 q->queue_ctx = NULL; 1418 q->queue_hw_ctx = NULL; 1419 q->mq_map = NULL; 1420 1421 mutex_lock(&all_q_mutex); 1422 list_del_init(&q->all_q_node); 1423 mutex_unlock(&all_q_mutex); 1424 } 1425 1426 /* Basically redo blk_mq_init_queue with queue frozen */ 1427 static void blk_mq_queue_reinit(struct request_queue *q) 1428 { 1429 blk_mq_freeze_queue(q); 1430 1431 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1432 1433 /* 1434 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1435 * we should change hctx numa_node according to new topology (this 1436 * involves free and re-allocate memory, worthy doing?) 1437 */ 1438 1439 blk_mq_map_swqueue(q); 1440 1441 blk_mq_unfreeze_queue(q); 1442 } 1443 1444 static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 1445 unsigned long action, void *hcpu) 1446 { 1447 struct request_queue *q; 1448 1449 /* 1450 * Before new mapping is established, hotadded cpu might already start 1451 * handling requests. This doesn't break anything as we map offline 1452 * CPUs to first hardware queue. We will re-init queue below to get 1453 * optimal settings. 1454 */ 1455 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1456 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1457 return NOTIFY_OK; 1458 1459 mutex_lock(&all_q_mutex); 1460 list_for_each_entry(q, &all_q_list, all_q_node) 1461 blk_mq_queue_reinit(q); 1462 mutex_unlock(&all_q_mutex); 1463 return NOTIFY_OK; 1464 } 1465 1466 static int __init blk_mq_init(void) 1467 { 1468 blk_mq_cpu_init(); 1469 1470 /* Must be called after percpu_counter_hotcpu_callback() */ 1471 hotcpu_notifier(blk_mq_queue_reinit_notify, -10); 1472 1473 return 0; 1474 } 1475 subsys_initcall(blk_mq_init); 1476