1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/delay.h> 24 #include <linux/crash_dump.h> 25 #include <linux/prefetch.h> 26 27 #include <trace/events/block.h> 28 29 #include <linux/blk-mq.h> 30 #include "blk.h" 31 #include "blk-mq.h" 32 #include "blk-mq-tag.h" 33 34 static DEFINE_MUTEX(all_q_mutex); 35 static LIST_HEAD(all_q_list); 36 37 /* 38 * Check if any of the ctx's have pending work in this hardware queue 39 */ 40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 41 { 42 return sbitmap_any_bit_set(&hctx->ctx_map); 43 } 44 45 /* 46 * Mark this ctx as having pending work in this hardware queue 47 */ 48 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 49 struct blk_mq_ctx *ctx) 50 { 51 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 52 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 53 } 54 55 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 56 struct blk_mq_ctx *ctx) 57 { 58 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 59 } 60 61 void blk_mq_freeze_queue_start(struct request_queue *q) 62 { 63 int freeze_depth; 64 65 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 66 if (freeze_depth == 1) { 67 percpu_ref_kill(&q->q_usage_counter); 68 blk_mq_run_hw_queues(q, false); 69 } 70 } 71 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); 72 73 static void blk_mq_freeze_queue_wait(struct request_queue *q) 74 { 75 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 76 } 77 78 /* 79 * Guarantee no request is in use, so we can change any data structure of 80 * the queue afterward. 81 */ 82 void blk_freeze_queue(struct request_queue *q) 83 { 84 /* 85 * In the !blk_mq case we are only calling this to kill the 86 * q_usage_counter, otherwise this increases the freeze depth 87 * and waits for it to return to zero. For this reason there is 88 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 89 * exported to drivers as the only user for unfreeze is blk_mq. 
90 */ 91 blk_mq_freeze_queue_start(q); 92 blk_mq_freeze_queue_wait(q); 93 } 94 95 void blk_mq_freeze_queue(struct request_queue *q) 96 { 97 /* 98 * ...just an alias to keep freeze and unfreeze actions balanced 99 * in the blk_mq_* namespace 100 */ 101 blk_freeze_queue(q); 102 } 103 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 104 105 void blk_mq_unfreeze_queue(struct request_queue *q) 106 { 107 int freeze_depth; 108 109 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 110 WARN_ON_ONCE(freeze_depth < 0); 111 if (!freeze_depth) { 112 percpu_ref_reinit(&q->q_usage_counter); 113 wake_up_all(&q->mq_freeze_wq); 114 } 115 } 116 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 117 118 void blk_mq_wake_waiters(struct request_queue *q) 119 { 120 struct blk_mq_hw_ctx *hctx; 121 unsigned int i; 122 123 queue_for_each_hw_ctx(q, hctx, i) 124 if (blk_mq_hw_queue_mapped(hctx)) 125 blk_mq_tag_wakeup_all(hctx->tags, true); 126 127 /* 128 * If we are called because the queue has now been marked as 129 * dying, we need to ensure that processes currently waiting on 130 * the queue are notified as well. 131 */ 132 wake_up_all(&q->mq_freeze_wq); 133 } 134 135 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 136 { 137 return blk_mq_has_free_tags(hctx->tags); 138 } 139 EXPORT_SYMBOL(blk_mq_can_queue); 140 141 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 142 struct request *rq, int op, 143 unsigned int op_flags) 144 { 145 if (blk_queue_io_stat(q)) 146 op_flags |= REQ_IO_STAT; 147 148 INIT_LIST_HEAD(&rq->queuelist); 149 /* csd/requeue_work/fifo_time is initialized before use */ 150 rq->q = q; 151 rq->mq_ctx = ctx; 152 req_set_op_attrs(rq, op, op_flags); 153 /* do not touch atomic flags, it needs atomic ops against the timer */ 154 rq->cpu = -1; 155 INIT_HLIST_NODE(&rq->hash); 156 RB_CLEAR_NODE(&rq->rb_node); 157 rq->rq_disk = NULL; 158 rq->part = NULL; 159 rq->start_time = jiffies; 160 #ifdef CONFIG_BLK_CGROUP 161 rq->rl = NULL; 162 set_start_time_ns(rq); 163 rq->io_start_time_ns = 0; 164 #endif 165 rq->nr_phys_segments = 0; 166 #if defined(CONFIG_BLK_DEV_INTEGRITY) 167 rq->nr_integrity_segments = 0; 168 #endif 169 rq->special = NULL; 170 /* tag was already set */ 171 rq->errors = 0; 172 173 rq->cmd = rq->__cmd; 174 175 rq->extra_len = 0; 176 rq->sense_len = 0; 177 rq->resid_len = 0; 178 rq->sense = NULL; 179 180 INIT_LIST_HEAD(&rq->timeout_list); 181 rq->timeout = 0; 182 183 rq->end_io = NULL; 184 rq->end_io_data = NULL; 185 rq->next_rq = NULL; 186 187 ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; 188 } 189 190 static struct request * 191 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) 192 { 193 struct request *rq; 194 unsigned int tag; 195 196 tag = blk_mq_get_tag(data); 197 if (tag != BLK_MQ_TAG_FAIL) { 198 rq = data->hctx->tags->rqs[tag]; 199 200 if (blk_mq_tag_busy(data->hctx)) { 201 rq->cmd_flags = REQ_MQ_INFLIGHT; 202 atomic_inc(&data->hctx->nr_active); 203 } 204 205 rq->tag = tag; 206 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); 207 return rq; 208 } 209 210 return NULL; 211 } 212 213 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 214 unsigned int flags) 215 { 216 struct blk_mq_ctx *ctx; 217 struct blk_mq_hw_ctx *hctx; 218 struct request *rq; 219 struct blk_mq_alloc_data alloc_data; 220 int ret; 221 222 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); 223 if (ret) 224 return ERR_PTR(ret); 225 226 ctx = blk_mq_get_ctx(q); 227 hctx = blk_mq_map_queue(q, ctx->cpu); 228 blk_mq_set_alloc_data(&alloc_data, q, flags, 
ctx, hctx); 229 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 230 blk_mq_put_ctx(ctx); 231 232 if (!rq) { 233 blk_queue_exit(q); 234 return ERR_PTR(-EWOULDBLOCK); 235 } 236 237 rq->__data_len = 0; 238 rq->__sector = (sector_t) -1; 239 rq->bio = rq->biotail = NULL; 240 return rq; 241 } 242 EXPORT_SYMBOL(blk_mq_alloc_request); 243 244 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, 245 unsigned int flags, unsigned int hctx_idx) 246 { 247 struct blk_mq_hw_ctx *hctx; 248 struct blk_mq_ctx *ctx; 249 struct request *rq; 250 struct blk_mq_alloc_data alloc_data; 251 int ret; 252 253 /* 254 * If the tag allocator sleeps we could get an allocation for a 255 * different hardware context. No need to complicate the low level 256 * allocator for this for the rare use case of a command tied to 257 * a specific queue. 258 */ 259 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 260 return ERR_PTR(-EINVAL); 261 262 if (hctx_idx >= q->nr_hw_queues) 263 return ERR_PTR(-EIO); 264 265 ret = blk_queue_enter(q, true); 266 if (ret) 267 return ERR_PTR(ret); 268 269 /* 270 * Check if the hardware context is actually mapped to anything. 271 * If not tell the caller that it should skip this queue. 272 */ 273 hctx = q->queue_hw_ctx[hctx_idx]; 274 if (!blk_mq_hw_queue_mapped(hctx)) { 275 ret = -EXDEV; 276 goto out_queue_exit; 277 } 278 ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); 279 280 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 281 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 282 if (!rq) { 283 ret = -EWOULDBLOCK; 284 goto out_queue_exit; 285 } 286 287 return rq; 288 289 out_queue_exit: 290 blk_queue_exit(q); 291 return ERR_PTR(ret); 292 } 293 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 294 295 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 296 struct blk_mq_ctx *ctx, struct request *rq) 297 { 298 const int tag = rq->tag; 299 struct request_queue *q = rq->q; 300 301 if (rq->cmd_flags & REQ_MQ_INFLIGHT) 302 atomic_dec(&hctx->nr_active); 303 rq->cmd_flags = 0; 304 305 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 306 blk_mq_put_tag(hctx, ctx, tag); 307 blk_queue_exit(q); 308 } 309 310 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) 311 { 312 struct blk_mq_ctx *ctx = rq->mq_ctx; 313 314 ctx->rq_completed[rq_is_sync(rq)]++; 315 __blk_mq_free_request(hctx, ctx, rq); 316 317 } 318 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); 319 320 void blk_mq_free_request(struct request *rq) 321 { 322 blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); 323 } 324 EXPORT_SYMBOL_GPL(blk_mq_free_request); 325 326 inline void __blk_mq_end_request(struct request *rq, int error) 327 { 328 blk_account_io_done(rq); 329 330 if (rq->end_io) { 331 rq->end_io(rq, error); 332 } else { 333 if (unlikely(blk_bidi_rq(rq))) 334 blk_mq_free_request(rq->next_rq); 335 blk_mq_free_request(rq); 336 } 337 } 338 EXPORT_SYMBOL(__blk_mq_end_request); 339 340 void blk_mq_end_request(struct request *rq, int error) 341 { 342 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 343 BUG(); 344 __blk_mq_end_request(rq, error); 345 } 346 EXPORT_SYMBOL(blk_mq_end_request); 347 348 static void __blk_mq_complete_request_remote(void *data) 349 { 350 struct request *rq = data; 351 352 rq->q->softirq_done_fn(rq); 353 } 354 355 static void blk_mq_ipi_complete_request(struct request *rq) 356 { 357 struct blk_mq_ctx *ctx = rq->mq_ctx; 358 bool shared = false; 359 int cpu; 360 361 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 362 
rq->q->softirq_done_fn(rq); 363 return; 364 } 365 366 cpu = get_cpu(); 367 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 368 shared = cpus_share_cache(cpu, ctx->cpu); 369 370 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 371 rq->csd.func = __blk_mq_complete_request_remote; 372 rq->csd.info = rq; 373 rq->csd.flags = 0; 374 smp_call_function_single_async(ctx->cpu, &rq->csd); 375 } else { 376 rq->q->softirq_done_fn(rq); 377 } 378 put_cpu(); 379 } 380 381 static void __blk_mq_complete_request(struct request *rq) 382 { 383 struct request_queue *q = rq->q; 384 385 if (!q->softirq_done_fn) 386 blk_mq_end_request(rq, rq->errors); 387 else 388 blk_mq_ipi_complete_request(rq); 389 } 390 391 /** 392 * blk_mq_complete_request - end I/O on a request 393 * @rq: the request being processed 394 * 395 * Description: 396 * Ends all I/O on a request. It does not handle partial completions. 397 * The actual completion happens out-of-order, through a IPI handler. 398 **/ 399 void blk_mq_complete_request(struct request *rq, int error) 400 { 401 struct request_queue *q = rq->q; 402 403 if (unlikely(blk_should_fake_timeout(q))) 404 return; 405 if (!blk_mark_rq_complete(rq)) { 406 rq->errors = error; 407 __blk_mq_complete_request(rq); 408 } 409 } 410 EXPORT_SYMBOL(blk_mq_complete_request); 411 412 int blk_mq_request_started(struct request *rq) 413 { 414 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 415 } 416 EXPORT_SYMBOL_GPL(blk_mq_request_started); 417 418 void blk_mq_start_request(struct request *rq) 419 { 420 struct request_queue *q = rq->q; 421 422 trace_block_rq_issue(q, rq); 423 424 rq->resid_len = blk_rq_bytes(rq); 425 if (unlikely(blk_bidi_rq(rq))) 426 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); 427 428 blk_add_timer(rq); 429 430 /* 431 * Ensure that ->deadline is visible before set the started 432 * flag and clear the completed flag. 433 */ 434 smp_mb__before_atomic(); 435 436 /* 437 * Mark us as started and clear complete. Complete might have been 438 * set if requeue raced with timeout, which then marked it as 439 * complete. So be sure to clear complete again when we start 440 * the request, otherwise we'll ignore the completion event. 441 */ 442 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 443 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 444 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 445 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 446 447 if (q->dma_drain_size && blk_rq_bytes(rq)) { 448 /* 449 * Make sure space for the drain appears. We know we can do 450 * this because max_hw_segments has been adjusted to be one 451 * fewer than the device can handle. 
452 */ 453 rq->nr_phys_segments++; 454 } 455 } 456 EXPORT_SYMBOL(blk_mq_start_request); 457 458 static void __blk_mq_requeue_request(struct request *rq) 459 { 460 struct request_queue *q = rq->q; 461 462 trace_block_rq_requeue(q, rq); 463 464 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 465 if (q->dma_drain_size && blk_rq_bytes(rq)) 466 rq->nr_phys_segments--; 467 } 468 } 469 470 void blk_mq_requeue_request(struct request *rq) 471 { 472 __blk_mq_requeue_request(rq); 473 474 BUG_ON(blk_queued_rq(rq)); 475 blk_mq_add_to_requeue_list(rq, true); 476 } 477 EXPORT_SYMBOL(blk_mq_requeue_request); 478 479 static void blk_mq_requeue_work(struct work_struct *work) 480 { 481 struct request_queue *q = 482 container_of(work, struct request_queue, requeue_work.work); 483 LIST_HEAD(rq_list); 484 struct request *rq, *next; 485 unsigned long flags; 486 487 spin_lock_irqsave(&q->requeue_lock, flags); 488 list_splice_init(&q->requeue_list, &rq_list); 489 spin_unlock_irqrestore(&q->requeue_lock, flags); 490 491 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 492 if (!(rq->cmd_flags & REQ_SOFTBARRIER)) 493 continue; 494 495 rq->cmd_flags &= ~REQ_SOFTBARRIER; 496 list_del_init(&rq->queuelist); 497 blk_mq_insert_request(rq, true, false, false); 498 } 499 500 while (!list_empty(&rq_list)) { 501 rq = list_entry(rq_list.next, struct request, queuelist); 502 list_del_init(&rq->queuelist); 503 blk_mq_insert_request(rq, false, false, false); 504 } 505 506 /* 507 * Use the start variant of queue running here, so that running 508 * the requeue work will kick stopped queues. 509 */ 510 blk_mq_start_hw_queues(q); 511 } 512 513 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) 514 { 515 struct request_queue *q = rq->q; 516 unsigned long flags; 517 518 /* 519 * We abuse this flag that is otherwise used by the I/O scheduler to 520 * request head insertation from the workqueue. 
521 */ 522 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); 523 524 spin_lock_irqsave(&q->requeue_lock, flags); 525 if (at_head) { 526 rq->cmd_flags |= REQ_SOFTBARRIER; 527 list_add(&rq->queuelist, &q->requeue_list); 528 } else { 529 list_add_tail(&rq->queuelist, &q->requeue_list); 530 } 531 spin_unlock_irqrestore(&q->requeue_lock, flags); 532 } 533 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 534 535 void blk_mq_cancel_requeue_work(struct request_queue *q) 536 { 537 cancel_delayed_work_sync(&q->requeue_work); 538 } 539 EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); 540 541 void blk_mq_kick_requeue_list(struct request_queue *q) 542 { 543 kblockd_schedule_delayed_work(&q->requeue_work, 0); 544 } 545 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 546 547 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 548 unsigned long msecs) 549 { 550 kblockd_schedule_delayed_work(&q->requeue_work, 551 msecs_to_jiffies(msecs)); 552 } 553 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 554 555 void blk_mq_abort_requeue_list(struct request_queue *q) 556 { 557 unsigned long flags; 558 LIST_HEAD(rq_list); 559 560 spin_lock_irqsave(&q->requeue_lock, flags); 561 list_splice_init(&q->requeue_list, &rq_list); 562 spin_unlock_irqrestore(&q->requeue_lock, flags); 563 564 while (!list_empty(&rq_list)) { 565 struct request *rq; 566 567 rq = list_first_entry(&rq_list, struct request, queuelist); 568 list_del_init(&rq->queuelist); 569 rq->errors = -EIO; 570 blk_mq_end_request(rq, rq->errors); 571 } 572 } 573 EXPORT_SYMBOL(blk_mq_abort_requeue_list); 574 575 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 576 { 577 if (tag < tags->nr_tags) { 578 prefetch(tags->rqs[tag]); 579 return tags->rqs[tag]; 580 } 581 582 return NULL; 583 } 584 EXPORT_SYMBOL(blk_mq_tag_to_rq); 585 586 struct blk_mq_timeout_data { 587 unsigned long next; 588 unsigned int next_set; 589 }; 590 591 void blk_mq_rq_timed_out(struct request *req, bool reserved) 592 { 593 struct blk_mq_ops *ops = req->q->mq_ops; 594 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 595 596 /* 597 * We know that complete is set at this point. If STARTED isn't set 598 * anymore, then the request isn't active and the "timeout" should 599 * just be ignored. This can happen due to the bitflag ordering. 600 * Timeout first checks if STARTED is set, and if it is, assumes 601 * the request is active. But if we race with completion, then 602 * we both flags will get cleared. So check here again, and ignore 603 * a timeout event with a request that isn't active. 604 */ 605 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 606 return; 607 608 if (ops->timeout) 609 ret = ops->timeout(req, reserved); 610 611 switch (ret) { 612 case BLK_EH_HANDLED: 613 __blk_mq_complete_request(req); 614 break; 615 case BLK_EH_RESET_TIMER: 616 blk_add_timer(req); 617 blk_clear_rq_complete(req); 618 break; 619 case BLK_EH_NOT_HANDLED: 620 break; 621 default: 622 printk(KERN_ERR "block: bad eh return: %d\n", ret); 623 break; 624 } 625 } 626 627 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 628 struct request *rq, void *priv, bool reserved) 629 { 630 struct blk_mq_timeout_data *data = priv; 631 632 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 633 /* 634 * If a request wasn't started before the queue was 635 * marked dying, kill it here or it'll go unnoticed. 
636 */ 637 if (unlikely(blk_queue_dying(rq->q))) { 638 rq->errors = -EIO; 639 blk_mq_end_request(rq, rq->errors); 640 } 641 return; 642 } 643 644 if (time_after_eq(jiffies, rq->deadline)) { 645 if (!blk_mark_rq_complete(rq)) 646 blk_mq_rq_timed_out(rq, reserved); 647 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 648 data->next = rq->deadline; 649 data->next_set = 1; 650 } 651 } 652 653 static void blk_mq_timeout_work(struct work_struct *work) 654 { 655 struct request_queue *q = 656 container_of(work, struct request_queue, timeout_work); 657 struct blk_mq_timeout_data data = { 658 .next = 0, 659 .next_set = 0, 660 }; 661 int i; 662 663 /* A deadlock might occur if a request is stuck requiring a 664 * timeout at the same time a queue freeze is waiting 665 * completion, since the timeout code would not be able to 666 * acquire the queue reference here. 667 * 668 * That's why we don't use blk_queue_enter here; instead, we use 669 * percpu_ref_tryget directly, because we need to be able to 670 * obtain a reference even in the short window between the queue 671 * starting to freeze, by dropping the first reference in 672 * blk_mq_freeze_queue_start, and the moment the last request is 673 * consumed, marked by the instant q_usage_counter reaches 674 * zero. 675 */ 676 if (!percpu_ref_tryget(&q->q_usage_counter)) 677 return; 678 679 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 680 681 if (data.next_set) { 682 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 683 mod_timer(&q->timeout, data.next); 684 } else { 685 struct blk_mq_hw_ctx *hctx; 686 687 queue_for_each_hw_ctx(q, hctx, i) { 688 /* the hctx may be unmapped, so check it here */ 689 if (blk_mq_hw_queue_mapped(hctx)) 690 blk_mq_tag_idle(hctx); 691 } 692 } 693 blk_queue_exit(q); 694 } 695 696 /* 697 * Reverse check our software queue for entries that we could potentially 698 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 699 * too much time checking for merges. 
700 */ 701 static bool blk_mq_attempt_merge(struct request_queue *q, 702 struct blk_mq_ctx *ctx, struct bio *bio) 703 { 704 struct request *rq; 705 int checked = 8; 706 707 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 708 int el_ret; 709 710 if (!checked--) 711 break; 712 713 if (!blk_rq_merge_ok(rq, bio)) 714 continue; 715 716 el_ret = blk_try_merge(rq, bio); 717 if (el_ret == ELEVATOR_BACK_MERGE) { 718 if (bio_attempt_back_merge(q, rq, bio)) { 719 ctx->rq_merged++; 720 return true; 721 } 722 break; 723 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 724 if (bio_attempt_front_merge(q, rq, bio)) { 725 ctx->rq_merged++; 726 return true; 727 } 728 break; 729 } 730 } 731 732 return false; 733 } 734 735 struct flush_busy_ctx_data { 736 struct blk_mq_hw_ctx *hctx; 737 struct list_head *list; 738 }; 739 740 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 741 { 742 struct flush_busy_ctx_data *flush_data = data; 743 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 744 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 745 746 sbitmap_clear_bit(sb, bitnr); 747 spin_lock(&ctx->lock); 748 list_splice_tail_init(&ctx->rq_list, flush_data->list); 749 spin_unlock(&ctx->lock); 750 return true; 751 } 752 753 /* 754 * Process software queues that have been marked busy, splicing them 755 * to the for-dispatch 756 */ 757 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 758 { 759 struct flush_busy_ctx_data data = { 760 .hctx = hctx, 761 .list = list, 762 }; 763 764 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 765 } 766 767 static inline unsigned int queued_to_index(unsigned int queued) 768 { 769 if (!queued) 770 return 0; 771 772 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 773 } 774 775 /* 776 * Run this hardware queue, pulling any software queues mapped to it in. 777 * Note that this function currently has various problems around ordering 778 * of IO. In particular, we'd like FIFO behaviour on handling existing 779 * items on the hctx->dispatch list. Ignore that for now. 780 */ 781 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 782 { 783 struct request_queue *q = hctx->queue; 784 struct request *rq; 785 LIST_HEAD(rq_list); 786 LIST_HEAD(driver_list); 787 struct list_head *dptr; 788 int queued; 789 790 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 791 return; 792 793 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 794 cpu_online(hctx->next_cpu)); 795 796 hctx->run++; 797 798 /* 799 * Touch any software queue that has pending entries. 800 */ 801 flush_busy_ctxs(hctx, &rq_list); 802 803 /* 804 * If we have previous entries on our dispatch list, grab them 805 * and stuff them at the front for more fair dispatch. 806 */ 807 if (!list_empty_careful(&hctx->dispatch)) { 808 spin_lock(&hctx->lock); 809 if (!list_empty(&hctx->dispatch)) 810 list_splice_init(&hctx->dispatch, &rq_list); 811 spin_unlock(&hctx->lock); 812 } 813 814 /* 815 * Start off with dptr being NULL, so we start the first request 816 * immediately, even if we have more pending. 817 */ 818 dptr = NULL; 819 820 /* 821 * Now process all the entries, sending them to the driver. 
822 */ 823 queued = 0; 824 while (!list_empty(&rq_list)) { 825 struct blk_mq_queue_data bd; 826 int ret; 827 828 rq = list_first_entry(&rq_list, struct request, queuelist); 829 list_del_init(&rq->queuelist); 830 831 bd.rq = rq; 832 bd.list = dptr; 833 bd.last = list_empty(&rq_list); 834 835 ret = q->mq_ops->queue_rq(hctx, &bd); 836 switch (ret) { 837 case BLK_MQ_RQ_QUEUE_OK: 838 queued++; 839 break; 840 case BLK_MQ_RQ_QUEUE_BUSY: 841 list_add(&rq->queuelist, &rq_list); 842 __blk_mq_requeue_request(rq); 843 break; 844 default: 845 pr_err("blk-mq: bad return on queue: %d\n", ret); 846 case BLK_MQ_RQ_QUEUE_ERROR: 847 rq->errors = -EIO; 848 blk_mq_end_request(rq, rq->errors); 849 break; 850 } 851 852 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 853 break; 854 855 /* 856 * We've done the first request. If we have more than 1 857 * left in the list, set dptr to defer issue. 858 */ 859 if (!dptr && rq_list.next != rq_list.prev) 860 dptr = &driver_list; 861 } 862 863 hctx->dispatched[queued_to_index(queued)]++; 864 865 /* 866 * Any items that need requeuing? Stuff them into hctx->dispatch, 867 * that is where we will continue on next queue run. 868 */ 869 if (!list_empty(&rq_list)) { 870 spin_lock(&hctx->lock); 871 list_splice(&rq_list, &hctx->dispatch); 872 spin_unlock(&hctx->lock); 873 /* 874 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but 875 * it's possible the queue is stopped and restarted again 876 * before this. Queue restart will dispatch requests. And since 877 * requests in rq_list aren't added into hctx->dispatch yet, 878 * the requests in rq_list might get lost. 879 * 880 * blk_mq_run_hw_queue() already checks the STOPPED bit 881 **/ 882 blk_mq_run_hw_queue(hctx, true); 883 } 884 } 885 886 /* 887 * It'd be great if the workqueue API had a way to pass 888 * in a mask and had some smarts for more clever placement. 889 * For now we just round-robin here, switching for every 890 * BLK_MQ_CPU_WORK_BATCH queued items. 
891 */ 892 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 893 { 894 if (hctx->queue->nr_hw_queues == 1) 895 return WORK_CPU_UNBOUND; 896 897 if (--hctx->next_cpu_batch <= 0) { 898 int cpu = hctx->next_cpu, next_cpu; 899 900 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 901 if (next_cpu >= nr_cpu_ids) 902 next_cpu = cpumask_first(hctx->cpumask); 903 904 hctx->next_cpu = next_cpu; 905 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 906 907 return cpu; 908 } 909 910 return hctx->next_cpu; 911 } 912 913 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 914 { 915 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || 916 !blk_mq_hw_queue_mapped(hctx))) 917 return; 918 919 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 920 int cpu = get_cpu(); 921 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 922 __blk_mq_run_hw_queue(hctx); 923 put_cpu(); 924 return; 925 } 926 927 put_cpu(); 928 } 929 930 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); 931 } 932 933 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 934 { 935 struct blk_mq_hw_ctx *hctx; 936 int i; 937 938 queue_for_each_hw_ctx(q, hctx, i) { 939 if ((!blk_mq_hctx_has_pending(hctx) && 940 list_empty_careful(&hctx->dispatch)) || 941 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 942 continue; 943 944 blk_mq_run_hw_queue(hctx, async); 945 } 946 } 947 EXPORT_SYMBOL(blk_mq_run_hw_queues); 948 949 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 950 { 951 cancel_work(&hctx->run_work); 952 cancel_delayed_work(&hctx->delay_work); 953 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 954 } 955 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 956 957 void blk_mq_stop_hw_queues(struct request_queue *q) 958 { 959 struct blk_mq_hw_ctx *hctx; 960 int i; 961 962 queue_for_each_hw_ctx(q, hctx, i) 963 blk_mq_stop_hw_queue(hctx); 964 } 965 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 966 967 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 968 { 969 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 970 971 blk_mq_run_hw_queue(hctx, false); 972 } 973 EXPORT_SYMBOL(blk_mq_start_hw_queue); 974 975 void blk_mq_start_hw_queues(struct request_queue *q) 976 { 977 struct blk_mq_hw_ctx *hctx; 978 int i; 979 980 queue_for_each_hw_ctx(q, hctx, i) 981 blk_mq_start_hw_queue(hctx); 982 } 983 EXPORT_SYMBOL(blk_mq_start_hw_queues); 984 985 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 986 { 987 struct blk_mq_hw_ctx *hctx; 988 int i; 989 990 queue_for_each_hw_ctx(q, hctx, i) { 991 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 992 continue; 993 994 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 995 blk_mq_run_hw_queue(hctx, async); 996 } 997 } 998 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 999 1000 static void blk_mq_run_work_fn(struct work_struct *work) 1001 { 1002 struct blk_mq_hw_ctx *hctx; 1003 1004 hctx = container_of(work, struct blk_mq_hw_ctx, run_work); 1005 1006 __blk_mq_run_hw_queue(hctx); 1007 } 1008 1009 static void blk_mq_delay_work_fn(struct work_struct *work) 1010 { 1011 struct blk_mq_hw_ctx *hctx; 1012 1013 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 1014 1015 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1016 __blk_mq_run_hw_queue(hctx); 1017 } 1018 1019 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1020 { 1021 if (unlikely(!blk_mq_hw_queue_mapped(hctx))) 1022 return; 1023 1024 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1025 &hctx->delay_work, msecs_to_jiffies(msecs)); 1026 } 1027 EXPORT_SYMBOL(blk_mq_delay_queue); 
1028 1029 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1030 struct request *rq, 1031 bool at_head) 1032 { 1033 struct blk_mq_ctx *ctx = rq->mq_ctx; 1034 1035 trace_block_rq_insert(hctx->queue, rq); 1036 1037 if (at_head) 1038 list_add(&rq->queuelist, &ctx->rq_list); 1039 else 1040 list_add_tail(&rq->queuelist, &ctx->rq_list); 1041 } 1042 1043 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 1044 struct request *rq, bool at_head) 1045 { 1046 struct blk_mq_ctx *ctx = rq->mq_ctx; 1047 1048 __blk_mq_insert_req_list(hctx, rq, at_head); 1049 blk_mq_hctx_mark_pending(hctx, ctx); 1050 } 1051 1052 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 1053 bool async) 1054 { 1055 struct blk_mq_ctx *ctx = rq->mq_ctx; 1056 struct request_queue *q = rq->q; 1057 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 1058 1059 spin_lock(&ctx->lock); 1060 __blk_mq_insert_request(hctx, rq, at_head); 1061 spin_unlock(&ctx->lock); 1062 1063 if (run_queue) 1064 blk_mq_run_hw_queue(hctx, async); 1065 } 1066 1067 static void blk_mq_insert_requests(struct request_queue *q, 1068 struct blk_mq_ctx *ctx, 1069 struct list_head *list, 1070 int depth, 1071 bool from_schedule) 1072 1073 { 1074 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 1075 1076 trace_block_unplug(q, depth, !from_schedule); 1077 1078 /* 1079 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1080 * offline now 1081 */ 1082 spin_lock(&ctx->lock); 1083 while (!list_empty(list)) { 1084 struct request *rq; 1085 1086 rq = list_first_entry(list, struct request, queuelist); 1087 BUG_ON(rq->mq_ctx != ctx); 1088 list_del_init(&rq->queuelist); 1089 __blk_mq_insert_req_list(hctx, rq, false); 1090 } 1091 blk_mq_hctx_mark_pending(hctx, ctx); 1092 spin_unlock(&ctx->lock); 1093 1094 blk_mq_run_hw_queue(hctx, from_schedule); 1095 } 1096 1097 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1098 { 1099 struct request *rqa = container_of(a, struct request, queuelist); 1100 struct request *rqb = container_of(b, struct request, queuelist); 1101 1102 return !(rqa->mq_ctx < rqb->mq_ctx || 1103 (rqa->mq_ctx == rqb->mq_ctx && 1104 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1105 } 1106 1107 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1108 { 1109 struct blk_mq_ctx *this_ctx; 1110 struct request_queue *this_q; 1111 struct request *rq; 1112 LIST_HEAD(list); 1113 LIST_HEAD(ctx_list); 1114 unsigned int depth; 1115 1116 list_splice_init(&plug->mq_list, &list); 1117 1118 list_sort(NULL, &list, plug_ctx_cmp); 1119 1120 this_q = NULL; 1121 this_ctx = NULL; 1122 depth = 0; 1123 1124 while (!list_empty(&list)) { 1125 rq = list_entry_rq(list.next); 1126 list_del_init(&rq->queuelist); 1127 BUG_ON(!rq->q); 1128 if (rq->mq_ctx != this_ctx) { 1129 if (this_ctx) { 1130 blk_mq_insert_requests(this_q, this_ctx, 1131 &ctx_list, depth, 1132 from_schedule); 1133 } 1134 1135 this_ctx = rq->mq_ctx; 1136 this_q = rq->q; 1137 depth = 0; 1138 } 1139 1140 depth++; 1141 list_add_tail(&rq->queuelist, &ctx_list); 1142 } 1143 1144 /* 1145 * If 'this_ctx' is set, we know we have entries to complete 1146 * on 'ctx_list'. Do those. 
1147 */ 1148 if (this_ctx) { 1149 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1150 from_schedule); 1151 } 1152 } 1153 1154 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1155 { 1156 init_request_from_bio(rq, bio); 1157 1158 blk_account_io_start(rq, 1); 1159 } 1160 1161 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1162 { 1163 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1164 !blk_queue_nomerges(hctx->queue); 1165 } 1166 1167 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1168 struct blk_mq_ctx *ctx, 1169 struct request *rq, struct bio *bio) 1170 { 1171 if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { 1172 blk_mq_bio_to_request(rq, bio); 1173 spin_lock(&ctx->lock); 1174 insert_rq: 1175 __blk_mq_insert_request(hctx, rq, false); 1176 spin_unlock(&ctx->lock); 1177 return false; 1178 } else { 1179 struct request_queue *q = hctx->queue; 1180 1181 spin_lock(&ctx->lock); 1182 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1183 blk_mq_bio_to_request(rq, bio); 1184 goto insert_rq; 1185 } 1186 1187 spin_unlock(&ctx->lock); 1188 __blk_mq_free_request(hctx, ctx, rq); 1189 return true; 1190 } 1191 } 1192 1193 struct blk_map_ctx { 1194 struct blk_mq_hw_ctx *hctx; 1195 struct blk_mq_ctx *ctx; 1196 }; 1197 1198 static struct request *blk_mq_map_request(struct request_queue *q, 1199 struct bio *bio, 1200 struct blk_map_ctx *data) 1201 { 1202 struct blk_mq_hw_ctx *hctx; 1203 struct blk_mq_ctx *ctx; 1204 struct request *rq; 1205 int op = bio_data_dir(bio); 1206 int op_flags = 0; 1207 struct blk_mq_alloc_data alloc_data; 1208 1209 blk_queue_enter_live(q); 1210 ctx = blk_mq_get_ctx(q); 1211 hctx = blk_mq_map_queue(q, ctx->cpu); 1212 1213 if (rw_is_sync(bio_op(bio), bio->bi_opf)) 1214 op_flags |= REQ_SYNC; 1215 1216 trace_block_getrq(q, bio, op); 1217 blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); 1218 rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); 1219 1220 data->hctx = alloc_data.hctx; 1221 data->ctx = alloc_data.ctx; 1222 data->hctx->queued++; 1223 return rq; 1224 } 1225 1226 static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) 1227 { 1228 int ret; 1229 struct request_queue *q = rq->q; 1230 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); 1231 struct blk_mq_queue_data bd = { 1232 .rq = rq, 1233 .list = NULL, 1234 .last = 1 1235 }; 1236 blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); 1237 1238 /* 1239 * For OK queue, we are done. For error, kill it. Any other 1240 * error (busy), just add it to our list as we previously 1241 * would have done 1242 */ 1243 ret = q->mq_ops->queue_rq(hctx, &bd); 1244 if (ret == BLK_MQ_RQ_QUEUE_OK) { 1245 *cookie = new_cookie; 1246 return 0; 1247 } 1248 1249 __blk_mq_requeue_request(rq); 1250 1251 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1252 *cookie = BLK_QC_T_NONE; 1253 rq->errors = -EIO; 1254 blk_mq_end_request(rq, rq->errors); 1255 return 0; 1256 } 1257 1258 return -1; 1259 } 1260 1261 /* 1262 * Multiple hardware queue variant. This will not use per-process plugs, 1263 * but will attempt to bypass the hctx queueing if we can go straight to 1264 * hardware for SYNC IO. 
1265 */ 1266 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1267 { 1268 const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); 1269 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 1270 struct blk_map_ctx data; 1271 struct request *rq; 1272 unsigned int request_count = 0; 1273 struct blk_plug *plug; 1274 struct request *same_queue_rq = NULL; 1275 blk_qc_t cookie; 1276 1277 blk_queue_bounce(q, &bio); 1278 1279 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1280 bio_io_error(bio); 1281 return BLK_QC_T_NONE; 1282 } 1283 1284 blk_queue_split(q, &bio, q->bio_split); 1285 1286 if (!is_flush_fua && !blk_queue_nomerges(q) && 1287 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1288 return BLK_QC_T_NONE; 1289 1290 rq = blk_mq_map_request(q, bio, &data); 1291 if (unlikely(!rq)) 1292 return BLK_QC_T_NONE; 1293 1294 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); 1295 1296 if (unlikely(is_flush_fua)) { 1297 blk_mq_bio_to_request(rq, bio); 1298 blk_insert_flush(rq); 1299 goto run_queue; 1300 } 1301 1302 plug = current->plug; 1303 /* 1304 * If the driver supports defer issued based on 'last', then 1305 * queue it up like normal since we can potentially save some 1306 * CPU this way. 1307 */ 1308 if (((plug && !blk_queue_nomerges(q)) || is_sync) && 1309 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { 1310 struct request *old_rq = NULL; 1311 1312 blk_mq_bio_to_request(rq, bio); 1313 1314 /* 1315 * We do limited pluging. If the bio can be merged, do that. 1316 * Otherwise the existing request in the plug list will be 1317 * issued. So the plug list will have one request at most 1318 */ 1319 if (plug) { 1320 /* 1321 * The plug list might get flushed before this. If that 1322 * happens, same_queue_rq is invalid and plug list is 1323 * empty 1324 */ 1325 if (same_queue_rq && !list_empty(&plug->mq_list)) { 1326 old_rq = same_queue_rq; 1327 list_del_init(&old_rq->queuelist); 1328 } 1329 list_add_tail(&rq->queuelist, &plug->mq_list); 1330 } else /* is_sync */ 1331 old_rq = rq; 1332 blk_mq_put_ctx(data.ctx); 1333 if (!old_rq) 1334 goto done; 1335 if (!blk_mq_direct_issue_request(old_rq, &cookie)) 1336 goto done; 1337 blk_mq_insert_request(old_rq, false, true, true); 1338 goto done; 1339 } 1340 1341 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1342 /* 1343 * For a SYNC request, send it to the hardware immediately. For 1344 * an ASYNC request, just ensure that we run it later on. The 1345 * latter allows for merging opportunities and more efficient 1346 * dispatching. 1347 */ 1348 run_queue: 1349 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1350 } 1351 blk_mq_put_ctx(data.ctx); 1352 done: 1353 return cookie; 1354 } 1355 1356 /* 1357 * Single hardware queue variant. This will attempt to use any per-process 1358 * plug for merging and IO deferral. 
1359 */ 1360 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) 1361 { 1362 const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); 1363 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 1364 struct blk_plug *plug; 1365 unsigned int request_count = 0; 1366 struct blk_map_ctx data; 1367 struct request *rq; 1368 blk_qc_t cookie; 1369 1370 blk_queue_bounce(q, &bio); 1371 1372 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1373 bio_io_error(bio); 1374 return BLK_QC_T_NONE; 1375 } 1376 1377 blk_queue_split(q, &bio, q->bio_split); 1378 1379 if (!is_flush_fua && !blk_queue_nomerges(q)) { 1380 if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) 1381 return BLK_QC_T_NONE; 1382 } else 1383 request_count = blk_plug_queued_count(q); 1384 1385 rq = blk_mq_map_request(q, bio, &data); 1386 if (unlikely(!rq)) 1387 return BLK_QC_T_NONE; 1388 1389 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); 1390 1391 if (unlikely(is_flush_fua)) { 1392 blk_mq_bio_to_request(rq, bio); 1393 blk_insert_flush(rq); 1394 goto run_queue; 1395 } 1396 1397 /* 1398 * A task plug currently exists. Since this is completely lockless, 1399 * utilize that to temporarily store requests until the task is 1400 * either done or scheduled away. 1401 */ 1402 plug = current->plug; 1403 if (plug) { 1404 blk_mq_bio_to_request(rq, bio); 1405 if (!request_count) 1406 trace_block_plug(q); 1407 1408 blk_mq_put_ctx(data.ctx); 1409 1410 if (request_count >= BLK_MAX_REQUEST_COUNT) { 1411 blk_flush_plug_list(plug, false); 1412 trace_block_plug(q); 1413 } 1414 1415 list_add_tail(&rq->queuelist, &plug->mq_list); 1416 return cookie; 1417 } 1418 1419 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1420 /* 1421 * For a SYNC request, send it to the hardware immediately. For 1422 * an ASYNC request, just ensure that we run it later on. The 1423 * latter allows for merging opportunities and more efficient 1424 * dispatching. 1425 */ 1426 run_queue: 1427 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1428 } 1429 1430 blk_mq_put_ctx(data.ctx); 1431 return cookie; 1432 } 1433 1434 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, 1435 struct blk_mq_tags *tags, unsigned int hctx_idx) 1436 { 1437 struct page *page; 1438 1439 if (tags->rqs && set->ops->exit_request) { 1440 int i; 1441 1442 for (i = 0; i < tags->nr_tags; i++) { 1443 if (!tags->rqs[i]) 1444 continue; 1445 set->ops->exit_request(set->driver_data, tags->rqs[i], 1446 hctx_idx, i); 1447 tags->rqs[i] = NULL; 1448 } 1449 } 1450 1451 while (!list_empty(&tags->page_list)) { 1452 page = list_first_entry(&tags->page_list, struct page, lru); 1453 list_del_init(&page->lru); 1454 /* 1455 * Remove kmemleak object previously allocated in 1456 * blk_mq_init_rq_map(). 
1457 */ 1458 kmemleak_free(page_address(page)); 1459 __free_pages(page, page->private); 1460 } 1461 1462 kfree(tags->rqs); 1463 1464 blk_mq_free_tags(tags); 1465 } 1466 1467 static size_t order_to_size(unsigned int order) 1468 { 1469 return (size_t)PAGE_SIZE << order; 1470 } 1471 1472 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, 1473 unsigned int hctx_idx) 1474 { 1475 struct blk_mq_tags *tags; 1476 unsigned int i, j, entries_per_page, max_order = 4; 1477 size_t rq_size, left; 1478 1479 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1480 set->numa_node, 1481 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1482 if (!tags) 1483 return NULL; 1484 1485 INIT_LIST_HEAD(&tags->page_list); 1486 1487 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1488 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1489 set->numa_node); 1490 if (!tags->rqs) { 1491 blk_mq_free_tags(tags); 1492 return NULL; 1493 } 1494 1495 /* 1496 * rq_size is the size of the request plus driver payload, rounded 1497 * to the cacheline size 1498 */ 1499 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1500 cache_line_size()); 1501 left = rq_size * set->queue_depth; 1502 1503 for (i = 0; i < set->queue_depth; ) { 1504 int this_order = max_order; 1505 struct page *page; 1506 int to_do; 1507 void *p; 1508 1509 while (this_order && left < order_to_size(this_order - 1)) 1510 this_order--; 1511 1512 do { 1513 page = alloc_pages_node(set->numa_node, 1514 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1515 this_order); 1516 if (page) 1517 break; 1518 if (!this_order--) 1519 break; 1520 if (order_to_size(this_order) < rq_size) 1521 break; 1522 } while (1); 1523 1524 if (!page) 1525 goto fail; 1526 1527 page->private = this_order; 1528 list_add_tail(&page->lru, &tags->page_list); 1529 1530 p = page_address(page); 1531 /* 1532 * Allow kmemleak to scan these pages as they contain pointers 1533 * to additional allocations like via ops->init_request(). 1534 */ 1535 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); 1536 entries_per_page = order_to_size(this_order) / rq_size; 1537 to_do = min(entries_per_page, set->queue_depth - i); 1538 left -= to_do * rq_size; 1539 for (j = 0; j < to_do; j++) { 1540 tags->rqs[i] = p; 1541 if (set->ops->init_request) { 1542 if (set->ops->init_request(set->driver_data, 1543 tags->rqs[i], hctx_idx, i, 1544 set->numa_node)) { 1545 tags->rqs[i] = NULL; 1546 goto fail; 1547 } 1548 } 1549 1550 p += rq_size; 1551 i++; 1552 } 1553 } 1554 return tags; 1555 1556 fail: 1557 blk_mq_free_rq_map(set, tags, hctx_idx); 1558 return NULL; 1559 } 1560 1561 /* 1562 * 'cpu' is going away. splice any existing rq_list entries from this 1563 * software queue to the hw queue dispatch list, and ensure that it 1564 * gets run. 
1565 */ 1566 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 1567 { 1568 struct blk_mq_hw_ctx *hctx; 1569 struct blk_mq_ctx *ctx; 1570 LIST_HEAD(tmp); 1571 1572 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 1573 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1574 1575 spin_lock(&ctx->lock); 1576 if (!list_empty(&ctx->rq_list)) { 1577 list_splice_init(&ctx->rq_list, &tmp); 1578 blk_mq_hctx_clear_pending(hctx, ctx); 1579 } 1580 spin_unlock(&ctx->lock); 1581 1582 if (list_empty(&tmp)) 1583 return 0; 1584 1585 spin_lock(&hctx->lock); 1586 list_splice_tail_init(&tmp, &hctx->dispatch); 1587 spin_unlock(&hctx->lock); 1588 1589 blk_mq_run_hw_queue(hctx, true); 1590 return 0; 1591 } 1592 1593 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 1594 { 1595 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 1596 &hctx->cpuhp_dead); 1597 } 1598 1599 /* hctx->ctxs will be freed in queue's release handler */ 1600 static void blk_mq_exit_hctx(struct request_queue *q, 1601 struct blk_mq_tag_set *set, 1602 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1603 { 1604 unsigned flush_start_tag = set->queue_depth; 1605 1606 blk_mq_tag_idle(hctx); 1607 1608 if (set->ops->exit_request) 1609 set->ops->exit_request(set->driver_data, 1610 hctx->fq->flush_rq, hctx_idx, 1611 flush_start_tag + hctx_idx); 1612 1613 if (set->ops->exit_hctx) 1614 set->ops->exit_hctx(hctx, hctx_idx); 1615 1616 blk_mq_remove_cpuhp(hctx); 1617 blk_free_flush_queue(hctx->fq); 1618 sbitmap_free(&hctx->ctx_map); 1619 } 1620 1621 static void blk_mq_exit_hw_queues(struct request_queue *q, 1622 struct blk_mq_tag_set *set, int nr_queue) 1623 { 1624 struct blk_mq_hw_ctx *hctx; 1625 unsigned int i; 1626 1627 queue_for_each_hw_ctx(q, hctx, i) { 1628 if (i == nr_queue) 1629 break; 1630 blk_mq_exit_hctx(q, set, hctx, i); 1631 } 1632 } 1633 1634 static void blk_mq_free_hw_queues(struct request_queue *q, 1635 struct blk_mq_tag_set *set) 1636 { 1637 struct blk_mq_hw_ctx *hctx; 1638 unsigned int i; 1639 1640 queue_for_each_hw_ctx(q, hctx, i) 1641 free_cpumask_var(hctx->cpumask); 1642 } 1643 1644 static int blk_mq_init_hctx(struct request_queue *q, 1645 struct blk_mq_tag_set *set, 1646 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1647 { 1648 int node; 1649 unsigned flush_start_tag = set->queue_depth; 1650 1651 node = hctx->numa_node; 1652 if (node == NUMA_NO_NODE) 1653 node = hctx->numa_node = set->numa_node; 1654 1655 INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); 1656 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1657 spin_lock_init(&hctx->lock); 1658 INIT_LIST_HEAD(&hctx->dispatch); 1659 hctx->queue = q; 1660 hctx->queue_num = hctx_idx; 1661 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 1662 1663 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 1664 1665 hctx->tags = set->tags[hctx_idx]; 1666 1667 /* 1668 * Allocate space for all possible cpus to avoid allocation at 1669 * runtime 1670 */ 1671 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1672 GFP_KERNEL, node); 1673 if (!hctx->ctxs) 1674 goto unregister_cpu_notifier; 1675 1676 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, 1677 node)) 1678 goto free_ctxs; 1679 1680 hctx->nr_ctx = 0; 1681 1682 if (set->ops->init_hctx && 1683 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1684 goto free_bitmap; 1685 1686 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1687 if (!hctx->fq) 1688 goto exit_hctx; 1689 1690 if (set->ops->init_request && 1691 
set->ops->init_request(set->driver_data, 1692 hctx->fq->flush_rq, hctx_idx, 1693 flush_start_tag + hctx_idx, node)) 1694 goto free_fq; 1695 1696 return 0; 1697 1698 free_fq: 1699 kfree(hctx->fq); 1700 exit_hctx: 1701 if (set->ops->exit_hctx) 1702 set->ops->exit_hctx(hctx, hctx_idx); 1703 free_bitmap: 1704 sbitmap_free(&hctx->ctx_map); 1705 free_ctxs: 1706 kfree(hctx->ctxs); 1707 unregister_cpu_notifier: 1708 blk_mq_remove_cpuhp(hctx); 1709 return -1; 1710 } 1711 1712 static void blk_mq_init_cpu_queues(struct request_queue *q, 1713 unsigned int nr_hw_queues) 1714 { 1715 unsigned int i; 1716 1717 for_each_possible_cpu(i) { 1718 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1719 struct blk_mq_hw_ctx *hctx; 1720 1721 memset(__ctx, 0, sizeof(*__ctx)); 1722 __ctx->cpu = i; 1723 spin_lock_init(&__ctx->lock); 1724 INIT_LIST_HEAD(&__ctx->rq_list); 1725 __ctx->queue = q; 1726 1727 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1728 if (!cpu_online(i)) 1729 continue; 1730 1731 hctx = blk_mq_map_queue(q, i); 1732 1733 /* 1734 * Set local node, IFF we have more than one hw queue. If 1735 * not, we remain on the home node of the device 1736 */ 1737 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1738 hctx->numa_node = local_memory_node(cpu_to_node(i)); 1739 } 1740 } 1741 1742 static void blk_mq_map_swqueue(struct request_queue *q, 1743 const struct cpumask *online_mask) 1744 { 1745 unsigned int i; 1746 struct blk_mq_hw_ctx *hctx; 1747 struct blk_mq_ctx *ctx; 1748 struct blk_mq_tag_set *set = q->tag_set; 1749 1750 /* 1751 * Avoid others reading imcomplete hctx->cpumask through sysfs 1752 */ 1753 mutex_lock(&q->sysfs_lock); 1754 1755 queue_for_each_hw_ctx(q, hctx, i) { 1756 cpumask_clear(hctx->cpumask); 1757 hctx->nr_ctx = 0; 1758 } 1759 1760 /* 1761 * Map software to hardware queues 1762 */ 1763 for_each_possible_cpu(i) { 1764 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1765 if (!cpumask_test_cpu(i, online_mask)) 1766 continue; 1767 1768 ctx = per_cpu_ptr(q->queue_ctx, i); 1769 hctx = blk_mq_map_queue(q, i); 1770 1771 cpumask_set_cpu(i, hctx->cpumask); 1772 ctx->index_hw = hctx->nr_ctx; 1773 hctx->ctxs[hctx->nr_ctx++] = ctx; 1774 } 1775 1776 mutex_unlock(&q->sysfs_lock); 1777 1778 queue_for_each_hw_ctx(q, hctx, i) { 1779 /* 1780 * If no software queues are mapped to this hardware queue, 1781 * disable it and free the request entries. 1782 */ 1783 if (!hctx->nr_ctx) { 1784 if (set->tags[i]) { 1785 blk_mq_free_rq_map(set, set->tags[i], i); 1786 set->tags[i] = NULL; 1787 } 1788 hctx->tags = NULL; 1789 continue; 1790 } 1791 1792 /* unmapped hw queue can be remapped after CPU topo changed */ 1793 if (!set->tags[i]) 1794 set->tags[i] = blk_mq_init_rq_map(set, i); 1795 hctx->tags = set->tags[i]; 1796 WARN_ON(!hctx->tags); 1797 1798 /* 1799 * Set the map size to the number of mapped software queues. 1800 * This is more accurate and more efficient than looping 1801 * over all possibly mapped software queues. 
1802 */ 1803 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 1804 1805 /* 1806 * Initialize batch roundrobin counts 1807 */ 1808 hctx->next_cpu = cpumask_first(hctx->cpumask); 1809 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1810 } 1811 } 1812 1813 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 1814 { 1815 struct blk_mq_hw_ctx *hctx; 1816 int i; 1817 1818 queue_for_each_hw_ctx(q, hctx, i) { 1819 if (shared) 1820 hctx->flags |= BLK_MQ_F_TAG_SHARED; 1821 else 1822 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1823 } 1824 } 1825 1826 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) 1827 { 1828 struct request_queue *q; 1829 1830 list_for_each_entry(q, &set->tag_list, tag_set_list) { 1831 blk_mq_freeze_queue(q); 1832 queue_set_hctx_shared(q, shared); 1833 blk_mq_unfreeze_queue(q); 1834 } 1835 } 1836 1837 static void blk_mq_del_queue_tag_set(struct request_queue *q) 1838 { 1839 struct blk_mq_tag_set *set = q->tag_set; 1840 1841 mutex_lock(&set->tag_list_lock); 1842 list_del_init(&q->tag_set_list); 1843 if (list_is_singular(&set->tag_list)) { 1844 /* just transitioned to unshared */ 1845 set->flags &= ~BLK_MQ_F_TAG_SHARED; 1846 /* update existing queue */ 1847 blk_mq_update_tag_set_depth(set, false); 1848 } 1849 mutex_unlock(&set->tag_list_lock); 1850 } 1851 1852 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1853 struct request_queue *q) 1854 { 1855 q->tag_set = set; 1856 1857 mutex_lock(&set->tag_list_lock); 1858 1859 /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ 1860 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { 1861 set->flags |= BLK_MQ_F_TAG_SHARED; 1862 /* update existing queue */ 1863 blk_mq_update_tag_set_depth(set, true); 1864 } 1865 if (set->flags & BLK_MQ_F_TAG_SHARED) 1866 queue_set_hctx_shared(q, true); 1867 list_add_tail(&q->tag_set_list, &set->tag_list); 1868 1869 mutex_unlock(&set->tag_list_lock); 1870 } 1871 1872 /* 1873 * It is the actual release handler for mq, but we do it from 1874 * request queue's release handler for avoiding use-after-free 1875 * and headache because q->mq_kobj shouldn't have been introduced, 1876 * but we can't group ctx/kctx kobj without it. 
1877 */ 1878 void blk_mq_release(struct request_queue *q) 1879 { 1880 struct blk_mq_hw_ctx *hctx; 1881 unsigned int i; 1882 1883 /* hctx kobj stays in hctx */ 1884 queue_for_each_hw_ctx(q, hctx, i) { 1885 if (!hctx) 1886 continue; 1887 kfree(hctx->ctxs); 1888 kfree(hctx); 1889 } 1890 1891 q->mq_map = NULL; 1892 1893 kfree(q->queue_hw_ctx); 1894 1895 /* ctx kobj stays in queue_ctx */ 1896 free_percpu(q->queue_ctx); 1897 } 1898 1899 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1900 { 1901 struct request_queue *uninit_q, *q; 1902 1903 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 1904 if (!uninit_q) 1905 return ERR_PTR(-ENOMEM); 1906 1907 q = blk_mq_init_allocated_queue(set, uninit_q); 1908 if (IS_ERR(q)) 1909 blk_cleanup_queue(uninit_q); 1910 1911 return q; 1912 } 1913 EXPORT_SYMBOL(blk_mq_init_queue); 1914 1915 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 1916 struct request_queue *q) 1917 { 1918 int i, j; 1919 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 1920 1921 blk_mq_sysfs_unregister(q); 1922 for (i = 0; i < set->nr_hw_queues; i++) { 1923 int node; 1924 1925 if (hctxs[i]) 1926 continue; 1927 1928 node = blk_mq_hw_queue_to_node(q->mq_map, i); 1929 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), 1930 GFP_KERNEL, node); 1931 if (!hctxs[i]) 1932 break; 1933 1934 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 1935 node)) { 1936 kfree(hctxs[i]); 1937 hctxs[i] = NULL; 1938 break; 1939 } 1940 1941 atomic_set(&hctxs[i]->nr_active, 0); 1942 hctxs[i]->numa_node = node; 1943 hctxs[i]->queue_num = i; 1944 1945 if (blk_mq_init_hctx(q, set, hctxs[i], i)) { 1946 free_cpumask_var(hctxs[i]->cpumask); 1947 kfree(hctxs[i]); 1948 hctxs[i] = NULL; 1949 break; 1950 } 1951 blk_mq_hctx_kobj_init(hctxs[i]); 1952 } 1953 for (j = i; j < q->nr_hw_queues; j++) { 1954 struct blk_mq_hw_ctx *hctx = hctxs[j]; 1955 1956 if (hctx) { 1957 if (hctx->tags) { 1958 blk_mq_free_rq_map(set, hctx->tags, j); 1959 set->tags[j] = NULL; 1960 } 1961 blk_mq_exit_hctx(q, set, hctx, j); 1962 free_cpumask_var(hctx->cpumask); 1963 kobject_put(&hctx->kobj); 1964 kfree(hctx->ctxs); 1965 kfree(hctx); 1966 hctxs[j] = NULL; 1967 1968 } 1969 } 1970 q->nr_hw_queues = i; 1971 blk_mq_sysfs_register(q); 1972 } 1973 1974 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 1975 struct request_queue *q) 1976 { 1977 /* mark the queue as mq asap */ 1978 q->mq_ops = set->ops; 1979 1980 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 1981 if (!q->queue_ctx) 1982 goto err_exit; 1983 1984 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), 1985 GFP_KERNEL, set->numa_node); 1986 if (!q->queue_hw_ctx) 1987 goto err_percpu; 1988 1989 q->mq_map = set->mq_map; 1990 1991 blk_mq_realloc_hw_ctxs(set, q); 1992 if (!q->nr_hw_queues) 1993 goto err_hctxs; 1994 1995 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 1996 blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); 1997 1998 q->nr_queues = nr_cpu_ids; 1999 2000 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2001 2002 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 2003 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 2004 2005 q->sg_reserved_size = INT_MAX; 2006 2007 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 2008 INIT_LIST_HEAD(&q->requeue_list); 2009 spin_lock_init(&q->requeue_lock); 2010 2011 if (q->nr_hw_queues > 1) 2012 blk_queue_make_request(q, blk_mq_make_request); 2013 else 2014 blk_queue_make_request(q, blk_sq_make_request); 2015 2016 /* 2017 * Do this after blk_queue_make_request() overrides it... 2018 */ 2019 q->nr_requests = set->queue_depth; 2020 2021 if (set->ops->complete) 2022 blk_queue_softirq_done(q, set->ops->complete); 2023 2024 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2025 2026 get_online_cpus(); 2027 mutex_lock(&all_q_mutex); 2028 2029 list_add_tail(&q->all_q_node, &all_q_list); 2030 blk_mq_add_queue_tag_set(set, q); 2031 blk_mq_map_swqueue(q, cpu_online_mask); 2032 2033 mutex_unlock(&all_q_mutex); 2034 put_online_cpus(); 2035 2036 return q; 2037 2038 err_hctxs: 2039 kfree(q->queue_hw_ctx); 2040 err_percpu: 2041 free_percpu(q->queue_ctx); 2042 err_exit: 2043 q->mq_ops = NULL; 2044 return ERR_PTR(-ENOMEM); 2045 } 2046 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2047 2048 void blk_mq_free_queue(struct request_queue *q) 2049 { 2050 struct blk_mq_tag_set *set = q->tag_set; 2051 2052 mutex_lock(&all_q_mutex); 2053 list_del_init(&q->all_q_node); 2054 mutex_unlock(&all_q_mutex); 2055 2056 blk_mq_del_queue_tag_set(q); 2057 2058 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2059 blk_mq_free_hw_queues(q, set); 2060 } 2061 2062 /* Basically redo blk_mq_init_queue with queue frozen */ 2063 static void blk_mq_queue_reinit(struct request_queue *q, 2064 const struct cpumask *online_mask) 2065 { 2066 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); 2067 2068 blk_mq_sysfs_unregister(q); 2069 2070 /* 2071 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 2072 * we should change hctx numa_node according to new topology (this 2073 * involves free and re-allocate memory, worthy doing?) 2074 */ 2075 2076 blk_mq_map_swqueue(q, online_mask); 2077 2078 blk_mq_sysfs_register(q); 2079 } 2080 2081 /* 2082 * New online cpumask which is going to be set in this hotplug event. 2083 * Declare this cpumasks as global as cpu-hotplug operation is invoked 2084 * one-by-one and dynamically allocating this could result in a failure. 2085 */ 2086 static struct cpumask cpuhp_online_new; 2087 2088 static void blk_mq_queue_reinit_work(void) 2089 { 2090 struct request_queue *q; 2091 2092 mutex_lock(&all_q_mutex); 2093 /* 2094 * We need to freeze and reinit all existing queues. Freezing 2095 * involves synchronous wait for an RCU grace period and doing it 2096 * one by one may take a long time. Start freezing all queues in 2097 * one swoop and then wait for the completions so that freezing can 2098 * take place in parallel. 

void blk_mq_free_queue(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        mutex_lock(&all_q_mutex);
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);

        blk_mq_del_queue_tag_set(q);

        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q,
                                const struct cpumask *online_mask)
{
        WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));

        blk_mq_sysfs_unregister(q);

        /*
         * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
         * we should change hctx numa_node according to the new topology (this
         * involves freeing and re-allocating memory; is it worth doing?)
         */

        blk_mq_map_swqueue(q, online_mask);

        blk_mq_sysfs_register(q);
}

/*
 * New online cpumask which is going to be set in this hotplug event.
 * Declare this cpumask as global because cpu-hotplug operations are invoked
 * one by one and dynamically allocating it could fail at that point.
 */
static struct cpumask cpuhp_online_new;

static void blk_mq_queue_reinit_work(void)
{
        struct request_queue *q;

        mutex_lock(&all_q_mutex);
        /*
         * We need to freeze and reinit all existing queues. Freezing
         * involves a synchronous wait for an RCU grace period, and doing it
         * one by one may take a long time. Start freezing all queues in
         * one swoop and then wait for the completions so that freezing can
         * take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_mq_freeze_queue_wait(q);

                /*
                 * timeout handler can't touch hw queue during the
                 * reinitialization
                 */
                del_timer_sync(&q->timeout);
        }

        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_queue_reinit(q, &cpuhp_online_new);

        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_unfreeze_queue(q);

        mutex_unlock(&all_q_mutex);
}

static int blk_mq_queue_reinit_dead(unsigned int cpu)
{
        cpumask_copy(&cpuhp_online_new, cpu_online_mask);
        blk_mq_queue_reinit_work();
        return 0;
}

/*
 * Before a hotadded cpu starts handling requests, new mappings must be
 * established. Otherwise, requests in the hw queue might never be
 * dispatched.
 *
 * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
 * for CPU0, and ctx1 for CPU1).
 *
 * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
 * and sets bit0 in the pending bitmap, as ctx1->index_hw is still zero.
 *
 * And then while running the hw queue, flush_busy_ctxs() finds bit0 is set
 * in the pending bitmap and tries to retrieve requests in
 * hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0, so the
 * request in ctx1->rq_list is ignored.
 */
static int blk_mq_queue_reinit_prepare(unsigned int cpu)
{
        cpumask_copy(&cpuhp_online_new, cpu_online_mask);
        cpumask_set_cpu(cpu, &cpuhp_online_new);
        blk_mq_queue_reinit_work();
        return 0;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        int i;

        for (i = 0; i < set->nr_hw_queues; i++) {
                set->tags[i] = blk_mq_init_rq_map(set, i);
                if (!set->tags[i])
                        goto out_unwind;
        }

        return 0;

out_unwind:
        while (--i >= 0)
                blk_mq_free_rq_map(set, set->tags[i], i);

        return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        unsigned int depth;
        int err;

        depth = set->queue_depth;
        do {
                err = __blk_mq_alloc_rq_maps(set);
                if (!err)
                        break;

                set->queue_depth >>= 1;
                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
                        err = -ENOMEM;
                        break;
                }
        } while (set->queue_depth);

        if (!set->queue_depth || err) {
                pr_err("blk-mq: failed to allocate request map\n");
                return -ENOMEM;
        }

        if (depth != set->queue_depth)
                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                        depth, set->queue_depth);

        return 0;
}
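
/*
 * Worked example of the back-off above (numbers are illustrative only):
 * with set->queue_depth = 1024, set->reserved_tags = 0 and the per-queue
 * allocations failing, the loop retries at 512, 256, 128, ... and gives up
 * with -ENOMEM once the depth would drop below
 * set->reserved_tags + BLK_MQ_TAG_MIN. If the retry at, say, 256 succeeds,
 * set->queue_depth is left at 256 and "reduced tag depth (1024 -> 256)"
 * is logged.
 */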

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the adjusted
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
        int ret;

        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;

        if (!set->ops->queue_rq)
                return -EINVAL;

        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
                pr_info("blk-mq: reduced tag depth to %u\n",
                        BLK_MQ_MAX_DEPTH);
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }

        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to 1 queue and
         * 64 tags to prevent using too much memory.
         */
        if (is_kdump_kernel()) {
                set->nr_hw_queues = 1;
                set->queue_depth = min(64U, set->queue_depth);
        }
        /*
         * There is no use for more h/w queues than cpus.
         */
        if (set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;

        set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
        if (!set->tags)
                return -ENOMEM;

        ret = -ENOMEM;
        set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
                                   GFP_KERNEL, set->numa_node);
        if (!set->mq_map)
                goto out_free_tags;

        if (set->ops->map_queues)
                ret = set->ops->map_queues(set);
        else
                ret = blk_mq_map_queues(set);
        if (ret)
                goto out_free_mq_map;

        ret = blk_mq_alloc_rq_maps(set);
        if (ret)
                goto out_free_mq_map;

        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);

        return 0;

out_free_mq_map:
        kfree(set->mq_map);
        set->mq_map = NULL;
out_free_tags:
        kfree(set->tags);
        set->tags = NULL;
        return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
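
/*
 * Illustrative sketch of the driver-side setup that this export enables
 * (all "my_*" names are hypothetical; .ops must point at the driver's
 * struct blk_mq_ops with at least .queue_rq set, or -EINVAL is returned
 * above):
 *
 *      static struct blk_mq_tag_set my_tag_set;
 *
 *      my_tag_set.ops          = &my_mq_ops;
 *      my_tag_set.nr_hw_queues = 1;
 *      my_tag_set.queue_depth  = 64;
 *      my_tag_set.numa_node    = NUMA_NO_NODE;
 *      my_tag_set.cmd_size     = sizeof(struct my_cmd);
 *      my_tag_set.flags        = BLK_MQ_F_SHOULD_MERGE;
 *
 *      ret = blk_mq_alloc_tag_set(&my_tag_set);
 *      if (ret)
 *              return ret;
 *      q = blk_mq_init_queue(&my_tag_set);
 *      if (IS_ERR(q)) {
 *              blk_mq_free_tag_set(&my_tag_set);
 *              return PTR_ERR(q);
 *      }
 *
 * Teardown runs in the reverse order: blk_cleanup_queue(q) before
 * blk_mq_free_tag_set(&my_tag_set).
 */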

void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
        int i;

        for (i = 0; i < nr_cpu_ids; i++) {
                if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
        }

        kfree(set->mq_map);
        set->mq_map = NULL;

        kfree(set->tags);
        set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i, ret;

        if (!set || nr > set->queue_depth)
                return -EINVAL;

        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
                ret = blk_mq_tag_update_depth(hctx->tags, nr);
                if (ret)
                        break;
        }

        if (!ret)
                q->nr_requests = nr;

        return ret;
}

void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
        struct request_queue *q;

        if (nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
                return;

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_freeze_queue(q);

        set->nr_hw_queues = nr_hw_queues;
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);

                if (q->nr_hw_queues > 1)
                        blk_queue_make_request(q, blk_mq_make_request);
                else
                        blk_queue_make_request(q, blk_sq_make_request);

                blk_mq_queue_reinit(q, cpu_online_mask);
        }

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

void blk_mq_disable_hotplug(void)
{
        mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
        mutex_unlock(&all_q_mutex);
}

static int __init blk_mq_init(void)
{
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);

        cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
                                  blk_mq_queue_reinit_prepare,
                                  blk_mq_queue_reinit_dead);
        return 0;
}
subsys_initcall(blk_mq_init);