#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

DEFINE_PER_CPU(struct llist_head, ipi_lists);

static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
					   unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());
}

static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
			return true;

	return false;
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
}

static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
				       bool reserved)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		rq->tag = tag;

		return rq;
	}

	return NULL;
}

static int blk_mq_queue_enter(struct request_queue *q)
{
	int ret;

	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	smp_wmb();
	/* we have trouble freezing the queue while it's still initializing */
	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
		return 0;

	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

	spin_lock_irq(q->queue_lock);
	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
		!blk_queue_bypass(q), *q->queue_lock);
	/* increment usage with the lock held, so freeze_queue can't run here */
	if (!ret)
		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	spin_unlock_irq(q->queue_lock);

	return ret;
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
static void blk_mq_freeze_queue(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (!drain)
		return;

	while (true) {
		s64 count;

		spin_lock_irq(q->queue_lock);
		count = percpu_counter_sum(&q->mq_usage_counter);
		spin_unlock_irq(q->queue_lock);

		if (count == 0)
			break;
		blk_mq_run_queues(q, false);
		msleep(10);
	}
}

static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake = false;

	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth) {
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
		wake = true;
	}
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake)
		wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	rq->mq_ctx = ctx;
	rq->cmd_flags = rw_flags;
	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
					      gfp_t gfp, bool reserved)
{
	return blk_mq_alloc_rq(hctx, gfp, reserved);
}

static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
						   int rw, gfp_t gfp,
						   bool reserved)
{
	struct request *rq;

	do {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);

		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
		if (rq) {
			blk_mq_rq_ctx_init(q, ctx, rq, rw);
			break;
		}

		/* drop the ctx before a potential failure return */
		blk_mq_put_ctx(ctx);
		if (!(gfp & __GFP_WAIT))
			break;

		__blk_mq_run_hw_queue(hctx);
		blk_mq_wait_for_tags(hctx->tags);
	} while (1);

	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
				     gfp_t gfp, bool reserved)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}

struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
					      gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

/*
 * Re-init and set pdu, if we have it
 */
static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_rq_init(hctx->queue, rq);

	if (hctx->cmd_size)
		rq->special = blk_mq_rq_to_pdu(rq);
}

static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	blk_mq_rq_init(hctx, rq);
	blk_mq_put_tag(hctx->tags, tag);

	blk_mq_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	ctx->rq_completed[rq_is_sync(rq)]++;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	__blk_mq_free_request(hctx, ctx, rq);
}

static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	/* don't actually finish bio if it's part of flush sequence */
	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_mq_complete_request(struct request *rq, int error)
{
	struct bio *bio = rq->bio;
	unsigned int bytes = 0;

	trace_block_rq_complete(rq->q, rq);

	while (bio) {
		struct bio *next = bio->bi_next;

		bio->bi_next = NULL;
		bytes += bio->bi_size;
		blk_mq_bio_endio(rq, bio, error);
		bio = next;
	}

	blk_account_io_completion(rq, bytes);

	if (rq->end_io)
		rq->end_io(rq, error);
	else
		blk_mq_free_request(rq);

	blk_account_io_done(rq);
}

void __blk_mq_end_io(struct request *rq, int error)
{
	if (!blk_mark_rq_complete(rq))
		blk_mq_complete_request(rq, error);
}

#if defined(CONFIG_SMP)

/*
 * Called with interrupts disabled.
 */
static void ipi_end_io(void *data)
{
	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
	struct llist_node *entry, *next;
	struct request *rq;

	entry = llist_del_all(list);

	while (entry) {
		next = entry->next;
		rq = llist_entry(entry, struct request, ll_list);
		__blk_mq_end_io(rq, rq->errors);
		entry = next;
	}
}

static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	struct call_single_data *data = &rq->csd;

	rq->errors = error;
	rq->ll_list.next = NULL;

	/*
	 * If the list is non-empty, an existing IPI must already
	 * be "in flight". If that is the case, we need not schedule
	 * a new one.
	 */
	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
		data->func = ipi_end_io;
		data->flags = 0;
		__smp_call_function_single(ctx->cpu, data, 0);
	}

	return true;
}
#else /* CONFIG_SMP */
static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	return false;
}
#endif

/*
 * End IO on this request on a multiqueue enabled driver. We'll either do
 * it directly inline, or punt to a local IPI handler on the matching
 * remote CPU.
 */
void blk_mq_end_io(struct request *rq, int error)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	int cpu;

	if (!ctx->ipi_redirect)
		return __blk_mq_end_io(rq, error);

	cpu = get_cpu();

	if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
	    !ipi_remote_cpu(ctx, cpu, rq, error))
		__blk_mq_end_io(rq, error);

	put_cpu();
}
EXPORT_SYMBOL(blk_mq_end_io);

static void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	/*
	 * Just mark start time and set the started bit. Due to memory
	 * ordering, we know we'll see the correct deadline as long as
	 * REQ_ATOM_STARTED is seen.
	 */
	rq->deadline = jiffies + q->rq_timeout;
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

static void blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

struct blk_mq_timeout_data {
	struct blk_mq_hw_ctx *hctx;
	unsigned long *next;
	unsigned int *next_set;
};

static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/* It may not be in flight yet (this is where
	 * the REQ_ATOM_STARTED flag comes in). The requests are
	 * statically allocated, so we know it's always safe to access the
	 * memory associated with a bit offset into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
		if (tag >= hctx->queue_depth)
			break;

		rq = hctx->rqs[tag++];

		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}

static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data = {
		.hctx		= hctx,
		.next		= next,
		.next_set	= next_set,
	};

	/*
	 * Ask the tagging code to iterate busy requests, so we can
	 * check them for timeout.
	 */
	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}

static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	int i, next_set = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);

	if (next_set)
		mod_timer(&q->timeout, round_jiffies_up(next));
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

void blk_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];
		BUG_ON(bit != ctx->index_hw);

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	while (!list_empty(&rq_list)) {
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_start_request(rq);

		/*
		 * Last request in the series. Flag it as such, this
		 * enables drivers to know when IO should be kicked off,
		 * if they don't do it on a per-request basis.
		 *
		 * Note: the flag isn't the only condition drivers
		 * should use to kick off IO. If the driver is busy,
		 * the last request might not have the bit set.
		 */
		if (list_empty(&rq_list))
			rq->cmd_flags |= REQ_END;

		ret = q->mq_ops->queue_rq(hctx, rq);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/*
			 * FIXME: we should have a mechanism to stop the queue
			 * like blk_stop_queue, otherwise we will waste cpu
			 * time
			 */
			list_add(&rq->queuelist, &rq_list);
			blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
			rq->errors = -EIO;
			/* fall through to error completion */
		case BLK_MQ_RQ_QUEUE_ERROR:
			blk_mq_end_io(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async)
		__blk_mq_run_hw_queue(hctx);
	else {
		struct request_queue *q = hctx->queue;

		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
	}
}

void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->delayed_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, true);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
	__blk_mq_run_hw_queue(hctx);
}

static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	list_add_tail(&rq->queuelist, &ctx->rq_list);
	blk_mq_hctx_mark_pending(hctx, ctx);

	/*
	 * We do this early, to ensure we are on the right CPU.
	 */
	blk_mq_add_timer(rq);
}

void blk_mq_insert_request(struct request_queue *q, struct request *rq,
			   bool run_queue)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	ctx = rq->mq_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
		blk_insert_flush(rq);
	} else {
		current_ctx = blk_mq_get_ctx(q);

		if (!cpu_online(ctx->cpu)) {
			ctx = current_ctx;
			hctx = q->mq_ops->map_queue(q, ctx->cpu);
			rq->mq_ctx = ctx;
		}
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq);
		spin_unlock(&ctx->lock);

		blk_mq_put_ctx(current_ctx);
	}

	if (run_queue)
		__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_insert_request);

/*
 * This is a special version of blk_mq_insert_request to bypass FLUSH request
 * check. Should only be used internally.
 */
void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);

	ctx = rq->mq_ctx;
	if (!cpu_online(ctx->cpu)) {
		ctx = current_ctx;
		rq->mq_ctx = ctx;
	}
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/* ctx->cpu might be offline */
	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq);
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

static void blk_mq_insert_requests(struct request_queue *q,
				   struct blk_mq_ctx *ctx,
				   struct list_head *list,
				   int depth,
				   bool from_schedule)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
	 * offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq);
	}
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	blk_mq_run_hw_queue(hctx, from_schedule);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);
	blk_account_io_start(rq, 1);
}

static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	int rw = bio_data_dir(bio);
	struct request *rq;
	unsigned int use_plug, request_count = 0;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

	blk_queue_bounce(q, &bio);

	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
		return;

	if (blk_mq_queue_enter(q)) {
		bio_endio(bio, -EIO);
		return;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	trace_block_getrq(q, bio, rw);
	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
	if (likely(rq))
		blk_mq_rq_ctx_init(q, ctx, rq, rw);
	else {
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);
		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
							false);
		ctx = rq->mq_ctx;
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
	}

	hctx->queued++;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_mq_put_ctx(ctx);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(ctx);
			return;
		}
	}

	spin_lock(&ctx->lock);

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
	    blk_mq_attempt_merge(q, ctx, bio))
		__blk_mq_free_request(hctx, ctx, rq);
	else {
		blk_mq_bio_to_request(rq, bio);
		__blk_mq_insert_request(hctx, rq);
	}

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	/*
	 * For a SYNC request, send it to the hardware immediately. For an
	 * ASYNC request, just ensure that we run it later on. The latter
	 * allows for merging opportunities and more efficient dispatching.
	 */
run_queue:
	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
}

/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
						   unsigned int hctx_index)
{
	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				 unsigned int hctx_index)
{
	kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);

static void blk_mq_hctx_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return;

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(hctx->queue, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	ctx = blk_mq_get_ctx(hctx->queue);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);
}

static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				    void (*init)(void *, struct blk_mq_hw_ctx *,
						 struct request *, unsigned int),
				    void *data)
{
	unsigned int i;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		init(data, hctx, rq, i);
	}
}

void blk_mq_init_commands(struct request_queue *q,
			  void (*init)(void *, struct blk_mq_hw_ctx *,
				       struct request *, unsigned int),
			  void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_init_hw_commands(hctx, init, data);
}
EXPORT_SYMBOL(blk_mq_init_commands);

static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
{
	struct page *page;

	while (!list_empty(&hctx->page_list)) {
		page = list_first_entry(&hctx->page_list, struct page, list);
		list_del_init(&page->list);
		__free_pages(page, page->private);
	}

	kfree(hctx->rqs);

	if (hctx->tags)
		blk_mq_free_tags(hctx->tags);
}

static size_t order_to_size(unsigned int order)
{
	size_t ret = PAGE_SIZE;

	while (order--)
		ret *= 2;

	return ret;
}

static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
			      unsigned int reserved_tags, int node)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	INIT_LIST_HEAD(&hctx->page_list);

	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
					GFP_KERNEL, node);
	if (!hctx->rqs)
		return -ENOMEM;

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				cache_line_size());
	left = rq_size * hctx->queue_depth;

	for (i = 0; i < hctx->queue_depth;) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(node, GFP_KERNEL, this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			break;

		page->private = this_order;
		list_add_tail(&page->list, &hctx->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, hctx->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			hctx->rqs[i] = p;
			blk_mq_rq_init(hctx, hctx->rqs[i]);
			p += rq_size;
			i++;
		}
	}

	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
		goto err_rq_map;
	else if (i != hctx->queue_depth) {
		hctx->queue_depth = i;
		pr_warn("%s: queue depth set to %u because of low memory\n",
					__func__, i);
	}

	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
	if (!hctx->tags) {
err_rq_map:
		blk_mq_free_rq_map(hctx);
		return -ENOMEM;
	}

	return 0;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
				 struct blk_mq_reg *reg, void *driver_data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i, j;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
		if (node == NUMA_NO_NODE)
			node = hctx->numa_node = reg->numa_node;

		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
		spin_lock_init(&hctx->lock);
		INIT_LIST_HEAD(&hctx->dispatch);
		hctx->queue = q;
		hctx->queue_num = i;
		hctx->flags = reg->flags;
		hctx->queue_depth = reg->queue_depth;
		hctx->cmd_size = reg->cmd_size;

		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
						blk_mq_hctx_notify, hctx);
		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
			break;

		/*
		 * Allocate space for all possible cpus to avoid allocation
		 * at runtime
		 */
		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
						GFP_KERNEL, node);
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
						GFP_KERNEL, node);
		if (!hctx->ctx_map)
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (reg->ops->init_hctx &&
		    reg->ops->init_hctx(hctx, driver_data, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	queue_for_each_hw_ctx(q, hctx, j) {
		if (i == j)
			break;

		if (reg->ops->exit_hctx)
			reg->ops->exit_hctx(hctx, j);

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		blk_mq_free_rq_map(hctx);
		kfree(hctx->ctxs);
	}

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		hctx->nr_ctx++;

		if (!cpu_online(i))
			continue;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}
}

struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
					void *driver_data)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx *ctx;
	struct request_queue *q;
	int i;

	if (!reg->nr_hw_queues ||
	    !reg->ops->queue_rq || !reg->ops->map_queue ||
	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
		return ERR_PTR(-EINVAL);

	if (!reg->queue_depth)
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * Set aside a tag for flush requests. It will only be used while
	 * another flush request is in progress but outside the driver.
	 *
	 * TODO: only allocate if flushes are supported
	 */
	reg->queue_depth++;
	reg->reserved_tags++;

	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
		return ERR_PTR(-EINVAL);

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			reg->numa_node);

	if (!hctxs)
		goto err_percpu;

	for (i = 0; i < reg->nr_hw_queues; i++) {
		hctxs[i] = reg->ops->alloc_hctx(reg, i);
		if (!hctxs[i])
			goto err_hctxs;

		hctxs[i]->numa_node = NUMA_NO_NODE;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
	if (!q)
		goto err_hctxs;

	q->mq_map = blk_mq_make_queue_map(reg);
	if (!q->mq_map)
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = reg->nr_hw_queues;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = reg->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	blk_queue_make_request(q, blk_mq_make_request);
	blk_queue_rq_timed_out(q, reg->ops->timeout);
	if (reg->timeout)
		blk_queue_rq_timeout(q, reg->timeout);

	blk_mq_init_flush(q);
	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);

	if (blk_mq_init_hw_queues(q, reg, driver_data))
		goto err_hw;

	blk_mq_map_swqueue(q);

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return q;
err_hw:
	kfree(q->mq_map);
err_map:
	blk_cleanup_queue(q);
err_hctxs:
	for (i = 0; i < reg->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		reg->ops->free_hctx(hctxs[i], i);
	}
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		cancel_delayed_work_sync(&hctx->delayed_work);
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_free_rq_map(hctx);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
			q->mq_ops->exit_hctx(hctx, i);
		q->mq_ops->free_hctx(hctx, i);
	}

	free_percpu(q->queue_ctx);
	kfree(q->queue_hw_ctx);
	kfree(q->mq_map);

	q->queue_ctx = NULL;
	q->queue_hw_ctx = NULL;
	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}
EXPORT_SYMBOL(blk_mq_free_queue);

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	blk_mq_freeze_queue(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to new topology (this
	 * involves free and re-allocate memory, worth doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_unfreeze_queue(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before new mapping is established, hotadded cpu might already start
	 * handling requests. This doesn't break anything as we map offline
	 * CPUs to first hardware queue. We will re-init queue below to get
	 * optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);
	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __init blk_mq_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		init_llist_head(&per_cpu(ipi_lists, i));

	blk_mq_cpu_init();

	/* Must be called after percpu_counter_hotcpu_callback() */
	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

	return 0;
}
subsys_initcall(blk_mq_init);