1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/sched/topology.h> 24 #include <linux/sched/signal.h> 25 #include <linux/delay.h> 26 #include <linux/crash_dump.h> 27 #include <linux/prefetch.h> 28 29 #include <trace/events/block.h> 30 31 #include <linux/blk-mq.h> 32 #include "blk.h" 33 #include "blk-mq.h" 34 #include "blk-mq-debugfs.h" 35 #include "blk-mq-tag.h" 36 #include "blk-stat.h" 37 #include "blk-wbt.h" 38 #include "blk-mq-sched.h" 39 40 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); 41 static void blk_mq_poll_stats_start(struct request_queue *q); 42 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 43 44 static int blk_mq_poll_stats_bkt(const struct request *rq) 45 { 46 int ddir, bytes, bucket; 47 48 ddir = rq_data_dir(rq); 49 bytes = blk_rq_bytes(rq); 50 51 bucket = ddir + 2*(ilog2(bytes) - 9); 52 53 if (bucket < 0) 54 return -1; 55 else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 56 return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 57 58 return bucket; 59 } 60 61 /* 62 * Check if any of the ctx's have pending work in this hardware queue 63 */ 64 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 65 { 66 return !list_empty_careful(&hctx->dispatch) || 67 sbitmap_any_bit_set(&hctx->ctx_map) || 68 blk_mq_sched_has_work(hctx); 69 } 70 71 /* 72 * Mark this ctx as having pending work in this hardware queue 73 */ 74 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 75 struct blk_mq_ctx *ctx) 76 { 77 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 78 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 79 } 80 81 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 82 struct blk_mq_ctx *ctx) 83 { 84 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 85 } 86 87 struct mq_inflight { 88 struct hd_struct *part; 89 unsigned int *inflight; 90 }; 91 92 static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 93 struct request *rq, void *priv, 94 bool reserved) 95 { 96 struct mq_inflight *mi = priv; 97 98 if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && 99 !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { 100 /* 101 * index[0] counts the specific partition that was asked 102 * for. index[1] counts the ones that are active on the 103 * whole device, so increment that if mi->part is indeed 104 * a partition, and not a whole device. 
105 */ 106 if (rq->part == mi->part) 107 mi->inflight[0]++; 108 if (mi->part->partno) 109 mi->inflight[1]++; 110 } 111 } 112 113 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 114 unsigned int inflight[2]) 115 { 116 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 117 118 inflight[0] = inflight[1] = 0; 119 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 120 } 121 122 void blk_freeze_queue_start(struct request_queue *q) 123 { 124 int freeze_depth; 125 126 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 127 if (freeze_depth == 1) { 128 percpu_ref_kill(&q->q_usage_counter); 129 if (q->mq_ops) 130 blk_mq_run_hw_queues(q, false); 131 } 132 } 133 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 134 135 void blk_mq_freeze_queue_wait(struct request_queue *q) 136 { 137 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 138 } 139 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 140 141 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 142 unsigned long timeout) 143 { 144 return wait_event_timeout(q->mq_freeze_wq, 145 percpu_ref_is_zero(&q->q_usage_counter), 146 timeout); 147 } 148 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 149 150 /* 151 * Guarantee no request is in use, so we can change any data structure of 152 * the queue afterward. 153 */ 154 void blk_freeze_queue(struct request_queue *q) 155 { 156 /* 157 * In the !blk_mq case we are only calling this to kill the 158 * q_usage_counter, otherwise this increases the freeze depth 159 * and waits for it to return to zero. For this reason there is 160 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 161 * exported to drivers as the only user for unfreeze is blk_mq. 162 */ 163 blk_freeze_queue_start(q); 164 if (!q->mq_ops) 165 blk_drain_queue(q); 166 blk_mq_freeze_queue_wait(q); 167 } 168 169 void blk_mq_freeze_queue(struct request_queue *q) 170 { 171 /* 172 * ...just an alias to keep freeze and unfreeze actions balanced 173 * in the blk_mq_* namespace 174 */ 175 blk_freeze_queue(q); 176 } 177 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 178 179 void blk_mq_unfreeze_queue(struct request_queue *q) 180 { 181 int freeze_depth; 182 183 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 184 WARN_ON_ONCE(freeze_depth < 0); 185 if (!freeze_depth) { 186 percpu_ref_reinit(&q->q_usage_counter); 187 wake_up_all(&q->mq_freeze_wq); 188 } 189 } 190 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 191 192 /* 193 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the 194 * mpt3sas driver such that this function can be removed. 195 */ 196 void blk_mq_quiesce_queue_nowait(struct request_queue *q) 197 { 198 unsigned long flags; 199 200 spin_lock_irqsave(q->queue_lock, flags); 201 queue_flag_set(QUEUE_FLAG_QUIESCED, q); 202 spin_unlock_irqrestore(q->queue_lock, flags); 203 } 204 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); 205 206 /** 207 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished 208 * @q: request queue. 209 * 210 * Note: this function does not prevent the struct request end_io() 211 * callback from being invoked. Once this function returns, no dispatch 212 * can happen until the queue is unquiesced via 213 * blk_mq_unquiesce_queue().
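 *
 * (Implementation note, summarizing the function body below: the QUIESCED
 * flag is set via blk_mq_quiesce_queue_nowait(), then any dispatch already
 * inside ->queue_rq() is waited out, using synchronize_srcu() for
 * BLK_MQ_F_BLOCKING hardware queues and synchronize_rcu() otherwise.)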
214 */ 215 void blk_mq_quiesce_queue(struct request_queue *q) 216 { 217 struct blk_mq_hw_ctx *hctx; 218 unsigned int i; 219 bool rcu = false; 220 221 blk_mq_quiesce_queue_nowait(q); 222 223 queue_for_each_hw_ctx(q, hctx, i) { 224 if (hctx->flags & BLK_MQ_F_BLOCKING) 225 synchronize_srcu(hctx->queue_rq_srcu); 226 else 227 rcu = true; 228 } 229 if (rcu) 230 synchronize_rcu(); 231 } 232 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 233 234 /* 235 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 236 * @q: request queue. 237 * 238 * This function recovers queue into the state before quiescing 239 * which is done by blk_mq_quiesce_queue. 240 */ 241 void blk_mq_unquiesce_queue(struct request_queue *q) 242 { 243 unsigned long flags; 244 245 spin_lock_irqsave(q->queue_lock, flags); 246 queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 247 spin_unlock_irqrestore(q->queue_lock, flags); 248 249 /* dispatch requests which are inserted during quiescing */ 250 blk_mq_run_hw_queues(q, true); 251 } 252 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 253 254 void blk_mq_wake_waiters(struct request_queue *q) 255 { 256 struct blk_mq_hw_ctx *hctx; 257 unsigned int i; 258 259 queue_for_each_hw_ctx(q, hctx, i) 260 if (blk_mq_hw_queue_mapped(hctx)) 261 blk_mq_tag_wakeup_all(hctx->tags, true); 262 } 263 264 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 265 { 266 return blk_mq_has_free_tags(hctx->tags); 267 } 268 EXPORT_SYMBOL(blk_mq_can_queue); 269 270 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 271 unsigned int tag, unsigned int op) 272 { 273 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 274 struct request *rq = tags->static_rqs[tag]; 275 276 rq->rq_flags = 0; 277 278 if (data->flags & BLK_MQ_REQ_INTERNAL) { 279 rq->tag = -1; 280 rq->internal_tag = tag; 281 } else { 282 if (blk_mq_tag_busy(data->hctx)) { 283 rq->rq_flags = RQF_MQ_INFLIGHT; 284 atomic_inc(&data->hctx->nr_active); 285 } 286 rq->tag = tag; 287 rq->internal_tag = -1; 288 data->hctx->tags->rqs[rq->tag] = rq; 289 } 290 291 INIT_LIST_HEAD(&rq->queuelist); 292 /* csd/requeue_work/fifo_time is initialized before use */ 293 rq->q = data->q; 294 rq->mq_ctx = data->ctx; 295 rq->cmd_flags = op; 296 if (data->flags & BLK_MQ_REQ_PREEMPT) 297 rq->rq_flags |= RQF_PREEMPT; 298 if (blk_queue_io_stat(data->q)) 299 rq->rq_flags |= RQF_IO_STAT; 300 /* do not touch atomic flags, it needs atomic ops against the timer */ 301 rq->cpu = -1; 302 INIT_HLIST_NODE(&rq->hash); 303 RB_CLEAR_NODE(&rq->rb_node); 304 rq->rq_disk = NULL; 305 rq->part = NULL; 306 rq->start_time = jiffies; 307 #ifdef CONFIG_BLK_CGROUP 308 rq->rl = NULL; 309 set_start_time_ns(rq); 310 rq->io_start_time_ns = 0; 311 #endif 312 rq->nr_phys_segments = 0; 313 #if defined(CONFIG_BLK_DEV_INTEGRITY) 314 rq->nr_integrity_segments = 0; 315 #endif 316 rq->special = NULL; 317 /* tag was already set */ 318 rq->extra_len = 0; 319 320 INIT_LIST_HEAD(&rq->timeout_list); 321 rq->timeout = 0; 322 323 rq->end_io = NULL; 324 rq->end_io_data = NULL; 325 rq->next_rq = NULL; 326 327 data->ctx->rq_dispatched[op_is_sync(op)]++; 328 return rq; 329 } 330 331 static struct request *blk_mq_get_request(struct request_queue *q, 332 struct bio *bio, unsigned int op, 333 struct blk_mq_alloc_data *data) 334 { 335 struct elevator_queue *e = q->elevator; 336 struct request *rq; 337 unsigned int tag; 338 bool put_ctx_on_error = false; 339 340 blk_queue_enter_live(q); 341 data->q = q; 342 if (likely(!data->ctx)) { 343 data->ctx = blk_mq_get_ctx(q); 344 put_ctx_on_error = true; 345 } 346 if 
(likely(!data->hctx)) 347 data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 348 if (op & REQ_NOWAIT) 349 data->flags |= BLK_MQ_REQ_NOWAIT; 350 351 if (e) { 352 data->flags |= BLK_MQ_REQ_INTERNAL; 353 354 /* 355 * Flush requests are special and go directly to the 356 * dispatch list. 357 */ 358 if (!op_is_flush(op) && e->type->ops.mq.limit_depth) 359 e->type->ops.mq.limit_depth(op, data); 360 } 361 362 tag = blk_mq_get_tag(data); 363 if (tag == BLK_MQ_TAG_FAIL) { 364 if (put_ctx_on_error) { 365 blk_mq_put_ctx(data->ctx); 366 data->ctx = NULL; 367 } 368 blk_queue_exit(q); 369 return NULL; 370 } 371 372 rq = blk_mq_rq_ctx_init(data, tag, op); 373 if (!op_is_flush(op)) { 374 rq->elv.icq = NULL; 375 if (e && e->type->ops.mq.prepare_request) { 376 if (e->type->icq_cache && rq_ioc(bio)) 377 blk_mq_sched_assign_ioc(rq, bio); 378 379 e->type->ops.mq.prepare_request(rq, bio); 380 rq->rq_flags |= RQF_ELVPRIV; 381 } 382 } 383 data->hctx->queued++; 384 return rq; 385 } 386 387 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 388 blk_mq_req_flags_t flags) 389 { 390 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 391 struct request *rq; 392 int ret; 393 394 ret = blk_queue_enter(q, flags); 395 if (ret) 396 return ERR_PTR(ret); 397 398 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 399 blk_queue_exit(q); 400 401 if (!rq) 402 return ERR_PTR(-EWOULDBLOCK); 403 404 blk_mq_put_ctx(alloc_data.ctx); 405 406 rq->__data_len = 0; 407 rq->__sector = (sector_t) -1; 408 rq->bio = rq->biotail = NULL; 409 return rq; 410 } 411 EXPORT_SYMBOL(blk_mq_alloc_request); 412 413 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 414 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 415 { 416 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 417 struct request *rq; 418 unsigned int cpu; 419 int ret; 420 421 /* 422 * If the tag allocator sleeps we could get an allocation for a 423 * different hardware context. No need to complicate the low level 424 * allocator for this for the rare use case of a command tied to 425 * a specific queue. 426 */ 427 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 428 return ERR_PTR(-EINVAL); 429 430 if (hctx_idx >= q->nr_hw_queues) 431 return ERR_PTR(-EIO); 432 433 ret = blk_queue_enter(q, flags); 434 if (ret) 435 return ERR_PTR(ret); 436 437 /* 438 * Check if the hardware context is actually mapped to anything. 439 * If not tell the caller that it should skip this queue. 
440 */ 441 alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; 442 if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { 443 blk_queue_exit(q); 444 return ERR_PTR(-EXDEV); 445 } 446 cpu = cpumask_first(alloc_data.hctx->cpumask); 447 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 448 449 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 450 blk_queue_exit(q); 451 452 if (!rq) 453 return ERR_PTR(-EWOULDBLOCK); 454 455 return rq; 456 } 457 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 458 459 void blk_mq_free_request(struct request *rq) 460 { 461 struct request_queue *q = rq->q; 462 struct elevator_queue *e = q->elevator; 463 struct blk_mq_ctx *ctx = rq->mq_ctx; 464 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 465 const int sched_tag = rq->internal_tag; 466 467 if (rq->rq_flags & RQF_ELVPRIV) { 468 if (e && e->type->ops.mq.finish_request) 469 e->type->ops.mq.finish_request(rq); 470 if (rq->elv.icq) { 471 put_io_context(rq->elv.icq->ioc); 472 rq->elv.icq = NULL; 473 } 474 } 475 476 ctx->rq_completed[rq_is_sync(rq)]++; 477 if (rq->rq_flags & RQF_MQ_INFLIGHT) 478 atomic_dec(&hctx->nr_active); 479 480 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 481 laptop_io_completion(q->backing_dev_info); 482 483 wbt_done(q->rq_wb, &rq->issue_stat); 484 485 if (blk_rq_rl(rq)) 486 blk_put_rl(blk_rq_rl(rq)); 487 488 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 489 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 490 if (rq->tag != -1) 491 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 492 if (sched_tag != -1) 493 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); 494 blk_mq_sched_restart(hctx); 495 blk_queue_exit(q); 496 } 497 EXPORT_SYMBOL_GPL(blk_mq_free_request); 498 499 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 500 { 501 blk_account_io_done(rq); 502 503 if (rq->end_io) { 504 wbt_done(rq->q->rq_wb, &rq->issue_stat); 505 rq->end_io(rq, error); 506 } else { 507 if (unlikely(blk_bidi_rq(rq))) 508 blk_mq_free_request(rq->next_rq); 509 blk_mq_free_request(rq); 510 } 511 } 512 EXPORT_SYMBOL(__blk_mq_end_request); 513 514 void blk_mq_end_request(struct request *rq, blk_status_t error) 515 { 516 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 517 BUG(); 518 __blk_mq_end_request(rq, error); 519 } 520 EXPORT_SYMBOL(blk_mq_end_request); 521 522 static void __blk_mq_complete_request_remote(void *data) 523 { 524 struct request *rq = data; 525 526 rq->q->softirq_done_fn(rq); 527 } 528 529 static void __blk_mq_complete_request(struct request *rq) 530 { 531 struct blk_mq_ctx *ctx = rq->mq_ctx; 532 bool shared = false; 533 int cpu; 534 535 if (rq->internal_tag != -1) 536 blk_mq_sched_completed_request(rq); 537 if (rq->rq_flags & RQF_STATS) { 538 blk_mq_poll_stats_start(rq->q); 539 blk_stat_add(rq); 540 } 541 542 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 543 rq->q->softirq_done_fn(rq); 544 return; 545 } 546 547 cpu = get_cpu(); 548 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 549 shared = cpus_share_cache(cpu, ctx->cpu); 550 551 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 552 rq->csd.func = __blk_mq_complete_request_remote; 553 rq->csd.info = rq; 554 rq->csd.flags = 0; 555 smp_call_function_single_async(ctx->cpu, &rq->csd); 556 } else { 557 rq->q->softirq_done_fn(rq); 558 } 559 put_cpu(); 560 } 561 562 /** 563 * blk_mq_complete_request - end I/O on a request 564 * @rq: the request being processed 565 * 566 * Description: 567 * Ends all I/O on a request. It does not handle partial completions. 
568 * The actual completion happens out-of-order, through an IPI handler. 569 **/ 570 void blk_mq_complete_request(struct request *rq) 571 { 572 struct request_queue *q = rq->q; 573 574 if (unlikely(blk_should_fake_timeout(q))) 575 return; 576 if (!blk_mark_rq_complete(rq)) 577 __blk_mq_complete_request(rq); 578 } 579 EXPORT_SYMBOL(blk_mq_complete_request); 580 581 int blk_mq_request_started(struct request *rq) 582 { 583 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 584 } 585 EXPORT_SYMBOL_GPL(blk_mq_request_started); 586 587 void blk_mq_start_request(struct request *rq) 588 { 589 struct request_queue *q = rq->q; 590 591 blk_mq_sched_started_request(rq); 592 593 trace_block_rq_issue(q, rq); 594 595 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 596 blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); 597 rq->rq_flags |= RQF_STATS; 598 wbt_issue(q->rq_wb, &rq->issue_stat); 599 } 600 601 blk_add_timer(rq); 602 603 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); 604 605 /* 606 * Mark us as started and clear complete. Complete might have been 607 * set if requeue raced with timeout, which then marked it as 608 * complete. So be sure to clear complete again when we start 609 * the request, otherwise we'll ignore the completion event. 610 * 611 * Ensure that ->deadline is visible before we set STARTED, such that 612 * blk_mq_check_expired() is guaranteed to observe our ->deadline when 613 * it observes STARTED. 614 */ 615 smp_wmb(); 616 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 617 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { 618 /* 619 * Coherence order guarantees these consecutive stores to a 620 * single variable propagate in the specified order. Thus the 621 * clear_bit() is ordered _after_ the set bit. See 622 * blk_mq_check_expired(). 623 * 624 * (the bits must be part of the same byte for this to be 625 * true). 626 */ 627 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 628 } 629 630 if (q->dma_drain_size && blk_rq_bytes(rq)) { 631 /* 632 * Make sure space for the drain appears. We know we can do 633 * this because max_hw_segments has been adjusted to be one 634 * fewer than the device can handle. 635 */ 636 rq->nr_phys_segments++; 637 } 638 } 639 EXPORT_SYMBOL(blk_mq_start_request); 640 641 /* 642 * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE 643 * flag isn't set yet, so there may be a race with the timeout handler. 644 * But since rq->deadline has just been set in .queue_rq() in 645 * this situation, the race is not possible in practice because 646 * rq->timeout should be large enough to cover the window 647 * between blk_mq_start_request() being called from .queue_rq() and 648 * REQ_ATOM_STARTED being cleared here.
649 */ 650 static void __blk_mq_requeue_request(struct request *rq) 651 { 652 struct request_queue *q = rq->q; 653 654 blk_mq_put_driver_tag(rq); 655 656 trace_block_rq_requeue(q, rq); 657 wbt_requeue(q->rq_wb, &rq->issue_stat); 658 blk_mq_sched_requeue_request(rq); 659 660 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 661 if (q->dma_drain_size && blk_rq_bytes(rq)) 662 rq->nr_phys_segments--; 663 } 664 } 665 666 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 667 { 668 __blk_mq_requeue_request(rq); 669 670 BUG_ON(blk_queued_rq(rq)); 671 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 672 } 673 EXPORT_SYMBOL(blk_mq_requeue_request); 674 675 static void blk_mq_requeue_work(struct work_struct *work) 676 { 677 struct request_queue *q = 678 container_of(work, struct request_queue, requeue_work.work); 679 LIST_HEAD(rq_list); 680 struct request *rq, *next; 681 682 spin_lock_irq(&q->requeue_lock); 683 list_splice_init(&q->requeue_list, &rq_list); 684 spin_unlock_irq(&q->requeue_lock); 685 686 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 687 if (!(rq->rq_flags & RQF_SOFTBARRIER)) 688 continue; 689 690 rq->rq_flags &= ~RQF_SOFTBARRIER; 691 list_del_init(&rq->queuelist); 692 blk_mq_sched_insert_request(rq, true, false, false, true); 693 } 694 695 while (!list_empty(&rq_list)) { 696 rq = list_entry(rq_list.next, struct request, queuelist); 697 list_del_init(&rq->queuelist); 698 blk_mq_sched_insert_request(rq, false, false, false, true); 699 } 700 701 blk_mq_run_hw_queues(q, false); 702 } 703 704 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, 705 bool kick_requeue_list) 706 { 707 struct request_queue *q = rq->q; 708 unsigned long flags; 709 710 /* 711 * We abuse this flag that is otherwise used by the I/O scheduler to 712 * request head insertion from the workqueue. 713 */ 714 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 715 716 spin_lock_irqsave(&q->requeue_lock, flags); 717 if (at_head) { 718 rq->rq_flags |= RQF_SOFTBARRIER; 719 list_add(&rq->queuelist, &q->requeue_list); 720 } else { 721 list_add_tail(&rq->queuelist, &q->requeue_list); 722 } 723 spin_unlock_irqrestore(&q->requeue_lock, flags); 724 725 if (kick_requeue_list) 726 blk_mq_kick_requeue_list(q); 727 } 728 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 729 730 void blk_mq_kick_requeue_list(struct request_queue *q) 731 { 732 kblockd_schedule_delayed_work(&q->requeue_work, 0); 733 } 734 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 735 736 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 737 unsigned long msecs) 738 { 739 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 740 msecs_to_jiffies(msecs)); 741 } 742 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 743 744 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 745 { 746 if (tag < tags->nr_tags) { 747 prefetch(tags->rqs[tag]); 748 return tags->rqs[tag]; 749 } 750 751 return NULL; 752 } 753 EXPORT_SYMBOL(blk_mq_tag_to_rq); 754 755 struct blk_mq_timeout_data { 756 unsigned long next; 757 unsigned int next_set; 758 }; 759 760 void blk_mq_rq_timed_out(struct request *req, bool reserved) 761 { 762 const struct blk_mq_ops *ops = req->q->mq_ops; 763 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 764 765 /* 766 * We know that complete is set at this point. If STARTED isn't set 767 * anymore, then the request isn't active and the "timeout" should 768 * just be ignored. This can happen due to the bitflag ordering. 
769 * Timeout first checks if STARTED is set, and if it is, assumes 770 * the request is active. But if we race with completion, then 771 * both flags will get cleared. So check here again, and ignore 772 * a timeout event with a request that isn't active. 773 */ 774 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 775 return; 776 777 if (ops->timeout) 778 ret = ops->timeout(req, reserved); 779 780 switch (ret) { 781 case BLK_EH_HANDLED: 782 __blk_mq_complete_request(req); 783 break; 784 case BLK_EH_RESET_TIMER: 785 blk_add_timer(req); 786 blk_clear_rq_complete(req); 787 break; 788 case BLK_EH_NOT_HANDLED: 789 break; 790 default: 791 printk(KERN_ERR "block: bad eh return: %d\n", ret); 792 break; 793 } 794 } 795 796 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 797 struct request *rq, void *priv, bool reserved) 798 { 799 struct blk_mq_timeout_data *data = priv; 800 unsigned long deadline; 801 802 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 803 return; 804 805 /* 806 * Ensures that if we see STARTED we must also see our 807 * up-to-date deadline, see blk_mq_start_request(). 808 */ 809 smp_rmb(); 810 811 deadline = READ_ONCE(rq->deadline); 812 813 /* 814 * The rq being checked may have been freed and reallocated 815 * out already here, we avoid this race by checking rq->deadline 816 * and REQ_ATOM_COMPLETE flag together: 817 * 818 * - if rq->deadline is observed as new value because of 819 * reusing, the rq won't be timed out because of timing. 820 * - if rq->deadline is observed as previous value, 821 * REQ_ATOM_COMPLETE flag won't be cleared in reuse path 822 * because we put a barrier between setting rq->deadline 823 * and clearing the flag in blk_mq_start_request(), so 824 * this rq won't be timed out too. 825 */ 826 if (time_after_eq(jiffies, deadline)) { 827 if (!blk_mark_rq_complete(rq)) { 828 /* 829 * Again coherence order ensures that consecutive reads 830 * from the same variable must be in that order. This 831 * ensures that if we see COMPLETE clear, we must then 832 * see STARTED set and we'll ignore this timeout. 833 * 834 * (There's also the MB implied by the test_and_clear()) 835 */ 836 blk_mq_rq_timed_out(rq, reserved); 837 } 838 } else if (!data->next_set || time_after(data->next, deadline)) { 839 data->next = deadline; 840 data->next_set = 1; 841 } 842 } 843 844 static void blk_mq_timeout_work(struct work_struct *work) 845 { 846 struct request_queue *q = 847 container_of(work, struct request_queue, timeout_work); 848 struct blk_mq_timeout_data data = { 849 .next = 0, 850 .next_set = 0, 851 }; 852 int i; 853 854 /* A deadlock might occur if a request is stuck requiring a 855 * timeout at the same time a queue freeze is waiting 856 * completion, since the timeout code would not be able to 857 * acquire the queue reference here. 858 * 859 * That's why we don't use blk_queue_enter here; instead, we use 860 * percpu_ref_tryget directly, because we need to be able to 861 * obtain a reference even in the short window between the queue 862 * starting to freeze, by dropping the first reference in 863 * blk_freeze_queue_start, and the moment the last request is 864 * consumed, marked by the instant q_usage_counter reaches 865 * zero. 
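 *
 * If the tryget fails here, q_usage_counter has already reached zero: the
 * queue is fully frozen (or being torn down) and every request has
 * completed, so there is nothing left to time out and we return early.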
866 */ 867 if (!percpu_ref_tryget(&q->q_usage_counter)) 868 return; 869 870 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 871 872 if (data.next_set) { 873 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 874 mod_timer(&q->timeout, data.next); 875 } else { 876 struct blk_mq_hw_ctx *hctx; 877 878 queue_for_each_hw_ctx(q, hctx, i) { 879 /* the hctx may be unmapped, so check it here */ 880 if (blk_mq_hw_queue_mapped(hctx)) 881 blk_mq_tag_idle(hctx); 882 } 883 } 884 blk_queue_exit(q); 885 } 886 887 struct flush_busy_ctx_data { 888 struct blk_mq_hw_ctx *hctx; 889 struct list_head *list; 890 }; 891 892 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 893 { 894 struct flush_busy_ctx_data *flush_data = data; 895 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 896 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 897 898 sbitmap_clear_bit(sb, bitnr); 899 spin_lock(&ctx->lock); 900 list_splice_tail_init(&ctx->rq_list, flush_data->list); 901 spin_unlock(&ctx->lock); 902 return true; 903 } 904 905 /* 906 * Process software queues that have been marked busy, splicing them 907 * to the for-dispatch list. 908 */ 909 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 910 { 911 struct flush_busy_ctx_data data = { 912 .hctx = hctx, 913 .list = list, 914 }; 915 916 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 917 } 918 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 919 920 struct dispatch_rq_data { 921 struct blk_mq_hw_ctx *hctx; 922 struct request *rq; 923 }; 924 925 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, 926 void *data) 927 { 928 struct dispatch_rq_data *dispatch_data = data; 929 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 930 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 931 932 spin_lock(&ctx->lock); 933 if (unlikely(!list_empty(&ctx->rq_list))) { 934 dispatch_data->rq = list_entry_rq(ctx->rq_list.next); 935 list_del_init(&dispatch_data->rq->queuelist); 936 if (list_empty(&ctx->rq_list)) 937 sbitmap_clear_bit(sb, bitnr); 938 } 939 spin_unlock(&ctx->lock); 940 941 return !dispatch_data->rq; 942 } 943 944 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 945 struct blk_mq_ctx *start) 946 { 947 unsigned off = start ? start->index_hw : 0; 948 struct dispatch_rq_data data = { 949 .hctx = hctx, 950 .rq = NULL, 951 }; 952 953 __sbitmap_for_each_set(&hctx->ctx_map, off, 954 dispatch_rq_from_ctx, &data); 955 956 return data.rq; 957 } 958 959 static inline unsigned int queued_to_index(unsigned int queued) 960 { 961 if (!queued) 962 return 0; 963 964 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 965 } 966 967 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 968 bool wait) 969 { 970 struct blk_mq_alloc_data data = { 971 .q = rq->q, 972 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), 973 .flags = wait ?
0 : BLK_MQ_REQ_NOWAIT, 974 }; 975 976 might_sleep_if(wait); 977 978 if (rq->tag != -1) 979 goto done; 980 981 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 982 data.flags |= BLK_MQ_REQ_RESERVED; 983 984 rq->tag = blk_mq_get_tag(&data); 985 if (rq->tag >= 0) { 986 if (blk_mq_tag_busy(data.hctx)) { 987 rq->rq_flags |= RQF_MQ_INFLIGHT; 988 atomic_inc(&data.hctx->nr_active); 989 } 990 data.hctx->tags->rqs[rq->tag] = rq; 991 } 992 993 done: 994 if (hctx) 995 *hctx = data.hctx; 996 return rq->tag != -1; 997 } 998 999 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1000 int flags, void *key) 1001 { 1002 struct blk_mq_hw_ctx *hctx; 1003 1004 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1005 1006 list_del_init(&wait->entry); 1007 blk_mq_run_hw_queue(hctx, true); 1008 return 1; 1009 } 1010 1011 /* 1012 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1013 * the tag wakeups. For non-shared tags, we can simply mark us as needing a 1014 * restart. For both cases, take care to check the condition again after 1015 * marking us as waiting. 1016 */ 1017 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, 1018 struct request *rq) 1019 { 1020 struct blk_mq_hw_ctx *this_hctx = *hctx; 1021 bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; 1022 struct sbq_wait_state *ws; 1023 wait_queue_entry_t *wait; 1024 bool ret; 1025 1026 if (!shared_tags) { 1027 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) 1028 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); 1029 } else { 1030 wait = &this_hctx->dispatch_wait; 1031 if (!list_empty_careful(&wait->entry)) 1032 return false; 1033 1034 spin_lock(&this_hctx->lock); 1035 if (!list_empty(&wait->entry)) { 1036 spin_unlock(&this_hctx->lock); 1037 return false; 1038 } 1039 1040 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); 1041 add_wait_queue(&ws->wait, wait); 1042 } 1043 1044 /* 1045 * It's possible that a tag was freed in the window between the 1046 * allocation failure and adding the hardware queue to the wait 1047 * queue. 1048 */ 1049 ret = blk_mq_get_driver_tag(rq, hctx, false); 1050 1051 if (!shared_tags) { 1052 /* 1053 * Don't clear RESTART here, someone else could have set it. 1054 * At most this will cost an extra queue run. 1055 */ 1056 return ret; 1057 } else { 1058 if (!ret) { 1059 spin_unlock(&this_hctx->lock); 1060 return false; 1061 } 1062 1063 /* 1064 * We got a tag, remove ourselves from the wait queue to ensure 1065 * someone else gets the wakeup. 1066 */ 1067 spin_lock_irq(&ws->wait.lock); 1068 list_del_init(&wait->entry); 1069 spin_unlock_irq(&ws->wait.lock); 1070 spin_unlock(&this_hctx->lock); 1071 return true; 1072 } 1073 } 1074 1075 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, 1076 bool got_budget) 1077 { 1078 struct blk_mq_hw_ctx *hctx; 1079 struct request *rq, *nxt; 1080 bool no_tag = false; 1081 int errors, queued; 1082 1083 if (list_empty(list)) 1084 return false; 1085 1086 WARN_ON(!list_is_singular(list) && got_budget); 1087 1088 /* 1089 * Now process all the entries, sending them to the driver. 1090 */ 1091 errors = queued = 0; 1092 do { 1093 struct blk_mq_queue_data bd; 1094 blk_status_t ret; 1095 1096 rq = list_first_entry(list, struct request, queuelist); 1097 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { 1098 /* 1099 * The initial allocation attempt failed, so we need to 1100 * rerun the hardware queue when a tag is freed. The 1101 * waitqueue takes care of that.
If the queue is run 1102 * before we add this entry back on the dispatch list, 1103 * we'll re-run it below. 1104 */ 1105 if (!blk_mq_mark_tag_wait(&hctx, rq)) { 1106 if (got_budget) 1107 blk_mq_put_dispatch_budget(hctx); 1108 /* 1109 * For non-shared tags, the RESTART check 1110 * will suffice. 1111 */ 1112 if (hctx->flags & BLK_MQ_F_TAG_SHARED) 1113 no_tag = true; 1114 break; 1115 } 1116 } 1117 1118 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { 1119 blk_mq_put_driver_tag(rq); 1120 break; 1121 } 1122 1123 list_del_init(&rq->queuelist); 1124 1125 bd.rq = rq; 1126 1127 /* 1128 * Flag last if we have no more requests, or if we have more 1129 * but can't assign a driver tag to it. 1130 */ 1131 if (list_empty(list)) 1132 bd.last = true; 1133 else { 1134 nxt = list_first_entry(list, struct request, queuelist); 1135 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); 1136 } 1137 1138 ret = q->mq_ops->queue_rq(hctx, &bd); 1139 if (ret == BLK_STS_RESOURCE) { 1140 /* 1141 * If an I/O scheduler has been configured and we got a 1142 * driver tag for the next request already, free it 1143 * again. 1144 */ 1145 if (!list_empty(list)) { 1146 nxt = list_first_entry(list, struct request, queuelist); 1147 blk_mq_put_driver_tag(nxt); 1148 } 1149 list_add(&rq->queuelist, list); 1150 __blk_mq_requeue_request(rq); 1151 break; 1152 } 1153 1154 if (unlikely(ret != BLK_STS_OK)) { 1155 errors++; 1156 blk_mq_end_request(rq, BLK_STS_IOERR); 1157 continue; 1158 } 1159 1160 queued++; 1161 } while (!list_empty(list)); 1162 1163 hctx->dispatched[queued_to_index(queued)]++; 1164 1165 /* 1166 * Any items that need requeuing? Stuff them into hctx->dispatch, 1167 * that is where we will continue on next queue run. 1168 */ 1169 if (!list_empty(list)) { 1170 spin_lock(&hctx->lock); 1171 list_splice_init(list, &hctx->dispatch); 1172 spin_unlock(&hctx->lock); 1173 1174 /* 1175 * If SCHED_RESTART was set by the caller of this function and 1176 * it is no longer set that means that it was cleared by another 1177 * thread and hence that a queue rerun is needed. 1178 * 1179 * If 'no_tag' is set, that means that we failed getting 1180 * a driver tag with an I/O scheduler attached. If our dispatch 1181 * waitqueue is no longer active, ensure that we run the queue 1182 * AFTER adding our entries back to the list. 1183 * 1184 * If no I/O scheduler has been configured it is possible that 1185 * the hardware queue got stopped and restarted before requests 1186 * were pushed back onto the dispatch list. Rerun the queue to 1187 * avoid starvation. Notes: 1188 * - blk_mq_run_hw_queue() checks whether or not a queue has 1189 * been stopped before rerunning a queue. 1190 * - Some but not all block drivers stop a queue before 1191 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1192 * and dm-rq. 1193 */ 1194 if (!blk_mq_sched_needs_restart(hctx) || 1195 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1196 blk_mq_run_hw_queue(hctx, true); 1197 } 1198 1199 return (queued + errors) != 0; 1200 } 1201 1202 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1203 { 1204 int srcu_idx; 1205 1206 /* 1207 * We should be running this queue from one of the CPUs that 1208 * are mapped to it. 1209 */ 1210 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 1211 cpu_online(hctx->next_cpu)); 1212 1213 /* 1214 * We can't run the queue inline with ints disabled. Ensure that 1215 * we catch bad users of this early. 
1216 */ 1217 WARN_ON_ONCE(in_interrupt()); 1218 1219 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1220 rcu_read_lock(); 1221 blk_mq_sched_dispatch_requests(hctx); 1222 rcu_read_unlock(); 1223 } else { 1224 might_sleep(); 1225 1226 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1227 blk_mq_sched_dispatch_requests(hctx); 1228 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1229 } 1230 } 1231 1232 /* 1233 * It'd be great if the workqueue API had a way to pass 1234 * in a mask and had some smarts for more clever placement. 1235 * For now we just round-robin here, switching for every 1236 * BLK_MQ_CPU_WORK_BATCH queued items. 1237 */ 1238 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1239 { 1240 if (hctx->queue->nr_hw_queues == 1) 1241 return WORK_CPU_UNBOUND; 1242 1243 if (--hctx->next_cpu_batch <= 0) { 1244 int next_cpu; 1245 1246 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 1247 if (next_cpu >= nr_cpu_ids) 1248 next_cpu = cpumask_first(hctx->cpumask); 1249 1250 hctx->next_cpu = next_cpu; 1251 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1252 } 1253 1254 return hctx->next_cpu; 1255 } 1256 1257 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1258 unsigned long msecs) 1259 { 1260 if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) 1261 return; 1262 1263 if (unlikely(blk_mq_hctx_stopped(hctx))) 1264 return; 1265 1266 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 1267 int cpu = get_cpu(); 1268 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1269 __blk_mq_run_hw_queue(hctx); 1270 put_cpu(); 1271 return; 1272 } 1273 1274 put_cpu(); 1275 } 1276 1277 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1278 &hctx->run_work, 1279 msecs_to_jiffies(msecs)); 1280 } 1281 1282 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1283 { 1284 __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1285 } 1286 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1287 1288 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1289 { 1290 if (blk_mq_hctx_has_pending(hctx)) { 1291 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1292 return true; 1293 } 1294 1295 return false; 1296 } 1297 EXPORT_SYMBOL(blk_mq_run_hw_queue); 1298 1299 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1300 { 1301 struct blk_mq_hw_ctx *hctx; 1302 int i; 1303 1304 queue_for_each_hw_ctx(q, hctx, i) { 1305 if (blk_mq_hctx_stopped(hctx)) 1306 continue; 1307 1308 blk_mq_run_hw_queue(hctx, async); 1309 } 1310 } 1311 EXPORT_SYMBOL(blk_mq_run_hw_queues); 1312 1313 /** 1314 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1315 * @q: request queue. 1316 * 1317 * The caller is responsible for serializing this function against 1318 * blk_mq_{start,stop}_hw_queue(). 1319 */ 1320 bool blk_mq_queue_stopped(struct request_queue *q) 1321 { 1322 struct blk_mq_hw_ctx *hctx; 1323 int i; 1324 1325 queue_for_each_hw_ctx(q, hctx, i) 1326 if (blk_mq_hctx_stopped(hctx)) 1327 return true; 1328 1329 return false; 1330 } 1331 EXPORT_SYMBOL(blk_mq_queue_stopped); 1332 1333 /* 1334 * This function is often used for pausing .queue_rq() by driver when 1335 * there isn't enough resource or some conditions aren't satisfied, and 1336 * BLK_STS_RESOURCE is usually returned. 1337 * 1338 * We do not guarantee that dispatch can be drained or blocked 1339 * after blk_mq_stop_hw_queue() returns. Please use 1340 * blk_mq_quiesce_queue() for that requirement. 
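 *
 * A queue stopped this way can be restarted with blk_mq_start_hw_queue()
 * or blk_mq_start_stopped_hw_queues(), both of which clear
 * BLK_MQ_S_STOPPED.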
1341 */ 1342 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1343 { 1344 cancel_delayed_work(&hctx->run_work); 1345 1346 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1347 } 1348 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1349 1350 /* 1351 * This function is often used for pausing .queue_rq() by driver when 1352 * there isn't enough resource or some conditions aren't satisfied, and 1353 * BLK_STS_RESOURCE is usually returned. 1354 * 1355 * We do not guarantee that dispatch can be drained or blocked 1356 * after blk_mq_stop_hw_queues() returns. Please use 1357 * blk_mq_quiesce_queue() for that requirement. 1358 */ 1359 void blk_mq_stop_hw_queues(struct request_queue *q) 1360 { 1361 struct blk_mq_hw_ctx *hctx; 1362 int i; 1363 1364 queue_for_each_hw_ctx(q, hctx, i) 1365 blk_mq_stop_hw_queue(hctx); 1366 } 1367 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 1368 1369 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 1370 { 1371 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1372 1373 blk_mq_run_hw_queue(hctx, false); 1374 } 1375 EXPORT_SYMBOL(blk_mq_start_hw_queue); 1376 1377 void blk_mq_start_hw_queues(struct request_queue *q) 1378 { 1379 struct blk_mq_hw_ctx *hctx; 1380 int i; 1381 1382 queue_for_each_hw_ctx(q, hctx, i) 1383 blk_mq_start_hw_queue(hctx); 1384 } 1385 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1386 1387 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1388 { 1389 if (!blk_mq_hctx_stopped(hctx)) 1390 return; 1391 1392 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1393 blk_mq_run_hw_queue(hctx, async); 1394 } 1395 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 1396 1397 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1398 { 1399 struct blk_mq_hw_ctx *hctx; 1400 int i; 1401 1402 queue_for_each_hw_ctx(q, hctx, i) 1403 blk_mq_start_stopped_hw_queue(hctx, async); 1404 } 1405 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1406 1407 static void blk_mq_run_work_fn(struct work_struct *work) 1408 { 1409 struct blk_mq_hw_ctx *hctx; 1410 1411 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1412 1413 /* 1414 * If we are stopped, don't run the queue. The exception is if 1415 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear 1416 * the STOPPED bit and run it. 1417 */ 1418 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) { 1419 if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state)) 1420 return; 1421 1422 clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); 1423 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1424 } 1425 1426 __blk_mq_run_hw_queue(hctx); 1427 } 1428 1429 1430 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1431 { 1432 if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx))) 1433 return; 1434 1435 /* 1436 * Stop the hw queue, then modify currently delayed work. 1437 * This should prevent us from running the queue prematurely. 1438 * Mark the queue as auto-clearing STOPPED when it runs. 
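 * (blk_mq_run_work_fn() checks BLK_MQ_S_START_ON_RUN and, when it is set,
 * clears both it and BLK_MQ_S_STOPPED before dispatching.)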
1439 */ 1440 blk_mq_stop_hw_queue(hctx); 1441 set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); 1442 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1443 &hctx->run_work, 1444 msecs_to_jiffies(msecs)); 1445 } 1446 EXPORT_SYMBOL(blk_mq_delay_queue); 1447 1448 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1449 struct request *rq, 1450 bool at_head) 1451 { 1452 struct blk_mq_ctx *ctx = rq->mq_ctx; 1453 1454 lockdep_assert_held(&ctx->lock); 1455 1456 trace_block_rq_insert(hctx->queue, rq); 1457 1458 if (at_head) 1459 list_add(&rq->queuelist, &ctx->rq_list); 1460 else 1461 list_add_tail(&rq->queuelist, &ctx->rq_list); 1462 } 1463 1464 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1465 bool at_head) 1466 { 1467 struct blk_mq_ctx *ctx = rq->mq_ctx; 1468 1469 lockdep_assert_held(&ctx->lock); 1470 1471 __blk_mq_insert_req_list(hctx, rq, at_head); 1472 blk_mq_hctx_mark_pending(hctx, ctx); 1473 } 1474 1475 /* 1476 * Should only be used carefully, when the caller knows we want to 1477 * bypass a potential IO scheduler on the target device. 1478 */ 1479 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) 1480 { 1481 struct blk_mq_ctx *ctx = rq->mq_ctx; 1482 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); 1483 1484 spin_lock(&hctx->lock); 1485 list_add_tail(&rq->queuelist, &hctx->dispatch); 1486 spin_unlock(&hctx->lock); 1487 1488 if (run_queue) 1489 blk_mq_run_hw_queue(hctx, false); 1490 } 1491 1492 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1493 struct list_head *list) 1494 1495 { 1496 /* 1497 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1498 * offline now 1499 */ 1500 spin_lock(&ctx->lock); 1501 while (!list_empty(list)) { 1502 struct request *rq; 1503 1504 rq = list_first_entry(list, struct request, queuelist); 1505 BUG_ON(rq->mq_ctx != ctx); 1506 list_del_init(&rq->queuelist); 1507 __blk_mq_insert_req_list(hctx, rq, false); 1508 } 1509 blk_mq_hctx_mark_pending(hctx, ctx); 1510 spin_unlock(&ctx->lock); 1511 } 1512 1513 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1514 { 1515 struct request *rqa = container_of(a, struct request, queuelist); 1516 struct request *rqb = container_of(b, struct request, queuelist); 1517 1518 return !(rqa->mq_ctx < rqb->mq_ctx || 1519 (rqa->mq_ctx == rqb->mq_ctx && 1520 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1521 } 1522 1523 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1524 { 1525 struct blk_mq_ctx *this_ctx; 1526 struct request_queue *this_q; 1527 struct request *rq; 1528 LIST_HEAD(list); 1529 LIST_HEAD(ctx_list); 1530 unsigned int depth; 1531 1532 list_splice_init(&plug->mq_list, &list); 1533 1534 list_sort(NULL, &list, plug_ctx_cmp); 1535 1536 this_q = NULL; 1537 this_ctx = NULL; 1538 depth = 0; 1539 1540 while (!list_empty(&list)) { 1541 rq = list_entry_rq(list.next); 1542 list_del_init(&rq->queuelist); 1543 BUG_ON(!rq->q); 1544 if (rq->mq_ctx != this_ctx) { 1545 if (this_ctx) { 1546 trace_block_unplug(this_q, depth, from_schedule); 1547 blk_mq_sched_insert_requests(this_q, this_ctx, 1548 &ctx_list, 1549 from_schedule); 1550 } 1551 1552 this_ctx = rq->mq_ctx; 1553 this_q = rq->q; 1554 depth = 0; 1555 } 1556 1557 depth++; 1558 list_add_tail(&rq->queuelist, &ctx_list); 1559 } 1560 1561 /* 1562 * If 'this_ctx' is set, we know we have entries to complete 1563 * on 'ctx_list'. Do those. 
1564 */ 1565 if (this_ctx) { 1566 trace_block_unplug(this_q, depth, from_schedule); 1567 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, 1568 from_schedule); 1569 } 1570 } 1571 1572 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1573 { 1574 blk_init_request_from_bio(rq, bio); 1575 1576 blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); 1577 1578 blk_account_io_start(rq, true); 1579 } 1580 1581 static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, 1582 struct blk_mq_ctx *ctx, 1583 struct request *rq) 1584 { 1585 spin_lock(&ctx->lock); 1586 __blk_mq_insert_request(hctx, rq, false); 1587 spin_unlock(&ctx->lock); 1588 } 1589 1590 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) 1591 { 1592 if (rq->tag != -1) 1593 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); 1594 1595 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); 1596 } 1597 1598 static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1599 struct request *rq, 1600 blk_qc_t *cookie, bool may_sleep) 1601 { 1602 struct request_queue *q = rq->q; 1603 struct blk_mq_queue_data bd = { 1604 .rq = rq, 1605 .last = true, 1606 }; 1607 blk_qc_t new_cookie; 1608 blk_status_t ret; 1609 bool run_queue = true; 1610 1611 /* RCU or SRCU read lock is needed before checking quiesced flag */ 1612 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 1613 run_queue = false; 1614 goto insert; 1615 } 1616 1617 if (q->elevator) 1618 goto insert; 1619 1620 if (!blk_mq_get_driver_tag(rq, NULL, false)) 1621 goto insert; 1622 1623 if (!blk_mq_get_dispatch_budget(hctx)) { 1624 blk_mq_put_driver_tag(rq); 1625 goto insert; 1626 } 1627 1628 new_cookie = request_to_qc_t(hctx, rq); 1629 1630 /* 1631 * For OK queue, we are done. For error, kill it. 
Any other 1632 * error (busy), just add it to our list as we previously 1633 * would have done 1634 */ 1635 ret = q->mq_ops->queue_rq(hctx, &bd); 1636 switch (ret) { 1637 case BLK_STS_OK: 1638 *cookie = new_cookie; 1639 return; 1640 case BLK_STS_RESOURCE: 1641 __blk_mq_requeue_request(rq); 1642 goto insert; 1643 default: 1644 *cookie = BLK_QC_T_NONE; 1645 blk_mq_end_request(rq, ret); 1646 return; 1647 } 1648 1649 insert: 1650 blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); 1651 } 1652 1653 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1654 struct request *rq, blk_qc_t *cookie) 1655 { 1656 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1657 rcu_read_lock(); 1658 __blk_mq_try_issue_directly(hctx, rq, cookie, false); 1659 rcu_read_unlock(); 1660 } else { 1661 unsigned int srcu_idx; 1662 1663 might_sleep(); 1664 1665 srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); 1666 __blk_mq_try_issue_directly(hctx, rq, cookie, true); 1667 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); 1668 } 1669 } 1670 1671 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1672 { 1673 const int is_sync = op_is_sync(bio->bi_opf); 1674 const int is_flush_fua = op_is_flush(bio->bi_opf); 1675 struct blk_mq_alloc_data data = { .flags = 0 }; 1676 struct request *rq; 1677 unsigned int request_count = 0; 1678 struct blk_plug *plug; 1679 struct request *same_queue_rq = NULL; 1680 blk_qc_t cookie; 1681 unsigned int wb_acct; 1682 1683 blk_queue_bounce(q, &bio); 1684 1685 blk_queue_split(q, &bio); 1686 1687 if (!bio_integrity_prep(bio)) 1688 return BLK_QC_T_NONE; 1689 1690 if (!is_flush_fua && !blk_queue_nomerges(q) && 1691 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1692 return BLK_QC_T_NONE; 1693 1694 if (blk_mq_sched_bio_merge(q, bio)) 1695 return BLK_QC_T_NONE; 1696 1697 wb_acct = wbt_wait(q->rq_wb, bio, NULL); 1698 1699 trace_block_getrq(q, bio, bio->bi_opf); 1700 1701 rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); 1702 if (unlikely(!rq)) { 1703 __wbt_done(q->rq_wb, wb_acct); 1704 if (bio->bi_opf & REQ_NOWAIT) 1705 bio_wouldblock_error(bio); 1706 return BLK_QC_T_NONE; 1707 } 1708 1709 wbt_track(&rq->issue_stat, wb_acct); 1710 1711 cookie = request_to_qc_t(data.hctx, rq); 1712 1713 plug = current->plug; 1714 if (unlikely(is_flush_fua)) { 1715 blk_mq_put_ctx(data.ctx); 1716 blk_mq_bio_to_request(rq, bio); 1717 1718 /* bypass scheduler for flush rq */ 1719 blk_insert_flush(rq); 1720 blk_mq_run_hw_queue(data.hctx, true); 1721 } else if (plug && q->nr_hw_queues == 1) { 1722 struct request *last = NULL; 1723 1724 blk_mq_put_ctx(data.ctx); 1725 blk_mq_bio_to_request(rq, bio); 1726 1727 /* 1728 * @request_count may become stale because of schedule 1729 * out, so check the list again. 1730 */ 1731 if (list_empty(&plug->mq_list)) 1732 request_count = 0; 1733 else if (blk_queue_nomerges(q)) 1734 request_count = blk_plug_queued_count(q); 1735 1736 if (!request_count) 1737 trace_block_plug(q); 1738 else 1739 last = list_entry_rq(plug->mq_list.prev); 1740 1741 if (request_count >= BLK_MAX_REQUEST_COUNT || (last && 1742 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1743 blk_flush_plug_list(plug, false); 1744 trace_block_plug(q); 1745 } 1746 1747 list_add_tail(&rq->queuelist, &plug->mq_list); 1748 } else if (plug && !blk_queue_nomerges(q)) { 1749 blk_mq_bio_to_request(rq, bio); 1750 1751 /* 1752 * We do limited plugging. If the bio can be merged, do that. 1753 * Otherwise the existing request in the plug list will be 1754 * issued. 
So the plug list will have one request at most. 1755 * The plug list might get flushed before this. If that happens, 1756 * the plug list is empty, and same_queue_rq is invalid. 1757 */ 1758 if (list_empty(&plug->mq_list)) 1759 same_queue_rq = NULL; 1760 if (same_queue_rq) 1761 list_del_init(&same_queue_rq->queuelist); 1762 list_add_tail(&rq->queuelist, &plug->mq_list); 1763 1764 blk_mq_put_ctx(data.ctx); 1765 1766 if (same_queue_rq) { 1767 data.hctx = blk_mq_map_queue(q, 1768 same_queue_rq->mq_ctx->cpu); 1769 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 1770 &cookie); 1771 } 1772 } else if (q->nr_hw_queues > 1 && is_sync) { 1773 blk_mq_put_ctx(data.ctx); 1774 blk_mq_bio_to_request(rq, bio); 1775 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 1776 } else if (q->elevator) { 1777 blk_mq_put_ctx(data.ctx); 1778 blk_mq_bio_to_request(rq, bio); 1779 blk_mq_sched_insert_request(rq, false, true, true, true); 1780 } else { 1781 blk_mq_put_ctx(data.ctx); 1782 blk_mq_bio_to_request(rq, bio); 1783 blk_mq_queue_io(data.hctx, data.ctx, rq); 1784 blk_mq_run_hw_queue(data.hctx, true); 1785 } 1786 1787 return cookie; 1788 } 1789 1790 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 1791 unsigned int hctx_idx) 1792 { 1793 struct page *page; 1794 1795 if (tags->rqs && set->ops->exit_request) { 1796 int i; 1797 1798 for (i = 0; i < tags->nr_tags; i++) { 1799 struct request *rq = tags->static_rqs[i]; 1800 1801 if (!rq) 1802 continue; 1803 set->ops->exit_request(set, rq, hctx_idx); 1804 tags->static_rqs[i] = NULL; 1805 } 1806 } 1807 1808 while (!list_empty(&tags->page_list)) { 1809 page = list_first_entry(&tags->page_list, struct page, lru); 1810 list_del_init(&page->lru); 1811 /* 1812 * Remove kmemleak object previously allocated in 1813 * blk_mq_alloc_rqs().
1814 */ 1815 kmemleak_free(page_address(page)); 1816 __free_pages(page, page->private); 1817 } 1818 } 1819 1820 void blk_mq_free_rq_map(struct blk_mq_tags *tags) 1821 { 1822 kfree(tags->rqs); 1823 tags->rqs = NULL; 1824 kfree(tags->static_rqs); 1825 tags->static_rqs = NULL; 1826 1827 blk_mq_free_tags(tags); 1828 } 1829 1830 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 1831 unsigned int hctx_idx, 1832 unsigned int nr_tags, 1833 unsigned int reserved_tags) 1834 { 1835 struct blk_mq_tags *tags; 1836 int node; 1837 1838 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1839 if (node == NUMA_NO_NODE) 1840 node = set->numa_node; 1841 1842 tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 1843 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1844 if (!tags) 1845 return NULL; 1846 1847 tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1848 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1849 node); 1850 if (!tags->rqs) { 1851 blk_mq_free_tags(tags); 1852 return NULL; 1853 } 1854 1855 tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1856 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1857 node); 1858 if (!tags->static_rqs) { 1859 kfree(tags->rqs); 1860 blk_mq_free_tags(tags); 1861 return NULL; 1862 } 1863 1864 return tags; 1865 } 1866 1867 static size_t order_to_size(unsigned int order) 1868 { 1869 return (size_t)PAGE_SIZE << order; 1870 } 1871 1872 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 1873 unsigned int hctx_idx, unsigned int depth) 1874 { 1875 unsigned int i, j, entries_per_page, max_order = 4; 1876 size_t rq_size, left; 1877 int node; 1878 1879 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1880 if (node == NUMA_NO_NODE) 1881 node = set->numa_node; 1882 1883 INIT_LIST_HEAD(&tags->page_list); 1884 1885 /* 1886 * rq_size is the size of the request plus driver payload, rounded 1887 * to the cacheline size 1888 */ 1889 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1890 cache_line_size()); 1891 left = rq_size * depth; 1892 1893 for (i = 0; i < depth; ) { 1894 int this_order = max_order; 1895 struct page *page; 1896 int to_do; 1897 void *p; 1898 1899 while (this_order && left < order_to_size(this_order - 1)) 1900 this_order--; 1901 1902 do { 1903 page = alloc_pages_node(node, 1904 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1905 this_order); 1906 if (page) 1907 break; 1908 if (!this_order--) 1909 break; 1910 if (order_to_size(this_order) < rq_size) 1911 break; 1912 } while (1); 1913 1914 if (!page) 1915 goto fail; 1916 1917 page->private = this_order; 1918 list_add_tail(&page->lru, &tags->page_list); 1919 1920 p = page_address(page); 1921 /* 1922 * Allow kmemleak to scan these pages as they contain pointers 1923 * to additional allocations like via ops->init_request(). 1924 */ 1925 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 1926 entries_per_page = order_to_size(this_order) / rq_size; 1927 to_do = min(entries_per_page, depth - i); 1928 left -= to_do * rq_size; 1929 for (j = 0; j < to_do; j++) { 1930 struct request *rq = p; 1931 1932 tags->static_rqs[i] = rq; 1933 if (set->ops->init_request) { 1934 if (set->ops->init_request(set, rq, hctx_idx, 1935 node)) { 1936 tags->static_rqs[i] = NULL; 1937 goto fail; 1938 } 1939 } 1940 1941 p += rq_size; 1942 i++; 1943 } 1944 } 1945 return 0; 1946 1947 fail: 1948 blk_mq_free_rqs(set, tags, hctx_idx); 1949 return -ENOMEM; 1950 } 1951 1952 /* 1953 * 'cpu' is going away. 
splice any existing rq_list entries from this 1954 * software queue to the hw queue dispatch list, and ensure that it 1955 * gets run. 1956 */ 1957 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 1958 { 1959 struct blk_mq_hw_ctx *hctx; 1960 struct blk_mq_ctx *ctx; 1961 LIST_HEAD(tmp); 1962 1963 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 1964 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1965 1966 spin_lock(&ctx->lock); 1967 if (!list_empty(&ctx->rq_list)) { 1968 list_splice_init(&ctx->rq_list, &tmp); 1969 blk_mq_hctx_clear_pending(hctx, ctx); 1970 } 1971 spin_unlock(&ctx->lock); 1972 1973 if (list_empty(&tmp)) 1974 return 0; 1975 1976 spin_lock(&hctx->lock); 1977 list_splice_tail_init(&tmp, &hctx->dispatch); 1978 spin_unlock(&hctx->lock); 1979 1980 blk_mq_run_hw_queue(hctx, true); 1981 return 0; 1982 } 1983 1984 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 1985 { 1986 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 1987 &hctx->cpuhp_dead); 1988 } 1989 1990 /* hctx->ctxs will be freed in queue's release handler */ 1991 static void blk_mq_exit_hctx(struct request_queue *q, 1992 struct blk_mq_tag_set *set, 1993 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1994 { 1995 blk_mq_debugfs_unregister_hctx(hctx); 1996 1997 blk_mq_tag_idle(hctx); 1998 1999 if (set->ops->exit_request) 2000 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 2001 2002 blk_mq_sched_exit_hctx(q, hctx, hctx_idx); 2003 2004 if (set->ops->exit_hctx) 2005 set->ops->exit_hctx(hctx, hctx_idx); 2006 2007 if (hctx->flags & BLK_MQ_F_BLOCKING) 2008 cleanup_srcu_struct(hctx->queue_rq_srcu); 2009 2010 blk_mq_remove_cpuhp(hctx); 2011 blk_free_flush_queue(hctx->fq); 2012 sbitmap_free(&hctx->ctx_map); 2013 } 2014 2015 static void blk_mq_exit_hw_queues(struct request_queue *q, 2016 struct blk_mq_tag_set *set, int nr_queue) 2017 { 2018 struct blk_mq_hw_ctx *hctx; 2019 unsigned int i; 2020 2021 queue_for_each_hw_ctx(q, hctx, i) { 2022 if (i == nr_queue) 2023 break; 2024 blk_mq_exit_hctx(q, set, hctx, i); 2025 } 2026 } 2027 2028 static int blk_mq_init_hctx(struct request_queue *q, 2029 struct blk_mq_tag_set *set, 2030 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2031 { 2032 int node; 2033 2034 node = hctx->numa_node; 2035 if (node == NUMA_NO_NODE) 2036 node = hctx->numa_node = set->numa_node; 2037 2038 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 2039 spin_lock_init(&hctx->lock); 2040 INIT_LIST_HEAD(&hctx->dispatch); 2041 hctx->queue = q; 2042 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 2043 2044 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2045 2046 hctx->tags = set->tags[hctx_idx]; 2047 2048 /* 2049 * Allocate space for all possible cpus to avoid allocation at 2050 * runtime 2051 */ 2052 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 2053 GFP_KERNEL, node); 2054 if (!hctx->ctxs) 2055 goto unregister_cpu_notifier; 2056 2057 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, 2058 node)) 2059 goto free_ctxs; 2060 2061 hctx->nr_ctx = 0; 2062 2063 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2064 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2065 2066 if (set->ops->init_hctx && 2067 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2068 goto free_bitmap; 2069 2070 if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) 2071 goto exit_hctx; 2072 2073 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 2074 if (!hctx->fq) 2075 goto sched_exit_hctx; 
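	/* Give the driver a chance to initialize its private data for this hctx's flush request. */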

	if (set->ops->init_request &&
	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
				   node))
		goto free_fq;

	if (hctx->flags & BLK_MQ_F_BLOCKING)
		init_srcu_struct(hctx->queue_rq_srcu);

	blk_mq_debugfs_register_hctx(q, hctx);

	return 0;

 free_fq:
	kfree(hctx->fq);
 sched_exit_hctx:
	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
 free_bitmap:
	sbitmap_free(&hctx->ctx_map);
 free_ctxs:
	kfree(hctx->ctxs);
 unregister_cpu_notifier:
	blk_mq_remove_cpuhp(hctx);
	return -1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't present, the cpu is mapped to first hctx */
		if (!cpu_present(i))
			continue;

		hctx = blk_mq_map_queue(q, i);

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = local_memory_node(cpu_to_node(i));
	}
}

static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
{
	int ret = 0;

	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
					set->queue_depth, set->reserved_tags);
	if (!set->tags[hctx_idx])
		return false;

	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
			       set->queue_depth);
	if (!ret)
		return true;

	blk_mq_free_rq_map(set->tags[hctx_idx]);
	set->tags[hctx_idx] = NULL;
	return false;
}

static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
					 unsigned int hctx_idx)
{
	if (set->tags[hctx_idx]) {
		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
		blk_mq_free_rq_map(set->tags[hctx_idx]);
		set->tags[hctx_idx] = NULL;
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i, hctx_idx;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct blk_mq_tag_set *set = q->tag_set;

	/*
	 * Avoid others reading incomplete hctx->cpumask through sysfs
	 */
	mutex_lock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues.
	 *
	 * If the cpu isn't present, the cpu is mapped to first hctx.
	 */
	for_each_present_cpu(i) {
		hctx_idx = q->mq_map[i];
		/* unmapped hw queue can be remapped after CPU topo changed */
		if (!set->tags[hctx_idx] &&
		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
			/*
			 * If tags initialization fails for some hctx,
			 * that hctx won't be brought online.
In this 2193 * case, remap the current ctx to hctx[0] which 2194 * is guaranteed to always have tags allocated 2195 */ 2196 q->mq_map[i] = 0; 2197 } 2198 2199 ctx = per_cpu_ptr(q->queue_ctx, i); 2200 hctx = blk_mq_map_queue(q, i); 2201 2202 cpumask_set_cpu(i, hctx->cpumask); 2203 ctx->index_hw = hctx->nr_ctx; 2204 hctx->ctxs[hctx->nr_ctx++] = ctx; 2205 } 2206 2207 mutex_unlock(&q->sysfs_lock); 2208 2209 queue_for_each_hw_ctx(q, hctx, i) { 2210 /* 2211 * If no software queues are mapped to this hardware queue, 2212 * disable it and free the request entries. 2213 */ 2214 if (!hctx->nr_ctx) { 2215 /* Never unmap queue 0. We need it as a 2216 * fallback in case of a new remap fails 2217 * allocation 2218 */ 2219 if (i && set->tags[i]) 2220 blk_mq_free_map_and_requests(set, i); 2221 2222 hctx->tags = NULL; 2223 continue; 2224 } 2225 2226 hctx->tags = set->tags[i]; 2227 WARN_ON(!hctx->tags); 2228 2229 /* 2230 * Set the map size to the number of mapped software queues. 2231 * This is more accurate and more efficient than looping 2232 * over all possibly mapped software queues. 2233 */ 2234 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 2235 2236 /* 2237 * Initialize batch roundrobin counts 2238 */ 2239 hctx->next_cpu = cpumask_first(hctx->cpumask); 2240 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2241 } 2242 } 2243 2244 /* 2245 * Caller needs to ensure that we're either frozen/quiesced, or that 2246 * the queue isn't live yet. 2247 */ 2248 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 2249 { 2250 struct blk_mq_hw_ctx *hctx; 2251 int i; 2252 2253 queue_for_each_hw_ctx(q, hctx, i) { 2254 if (shared) { 2255 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 2256 atomic_inc(&q->shared_hctx_restart); 2257 hctx->flags |= BLK_MQ_F_TAG_SHARED; 2258 } else { 2259 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 2260 atomic_dec(&q->shared_hctx_restart); 2261 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 2262 } 2263 } 2264 } 2265 2266 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, 2267 bool shared) 2268 { 2269 struct request_queue *q; 2270 2271 lockdep_assert_held(&set->tag_list_lock); 2272 2273 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2274 blk_mq_freeze_queue(q); 2275 queue_set_hctx_shared(q, shared); 2276 blk_mq_unfreeze_queue(q); 2277 } 2278 } 2279 2280 static void blk_mq_del_queue_tag_set(struct request_queue *q) 2281 { 2282 struct blk_mq_tag_set *set = q->tag_set; 2283 2284 mutex_lock(&set->tag_list_lock); 2285 list_del_rcu(&q->tag_set_list); 2286 INIT_LIST_HEAD(&q->tag_set_list); 2287 if (list_is_singular(&set->tag_list)) { 2288 /* just transitioned to unshared */ 2289 set->flags &= ~BLK_MQ_F_TAG_SHARED; 2290 /* update existing queue */ 2291 blk_mq_update_tag_set_depth(set, false); 2292 } 2293 mutex_unlock(&set->tag_list_lock); 2294 2295 synchronize_rcu(); 2296 } 2297 2298 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2299 struct request_queue *q) 2300 { 2301 q->tag_set = set; 2302 2303 mutex_lock(&set->tag_list_lock); 2304 2305 /* 2306 * Check to see if we're transitioning to shared (from 1 to 2 queues). 
2307 */ 2308 if (!list_empty(&set->tag_list) && 2309 !(set->flags & BLK_MQ_F_TAG_SHARED)) { 2310 set->flags |= BLK_MQ_F_TAG_SHARED; 2311 /* update existing queue */ 2312 blk_mq_update_tag_set_depth(set, true); 2313 } 2314 if (set->flags & BLK_MQ_F_TAG_SHARED) 2315 queue_set_hctx_shared(q, true); 2316 list_add_tail_rcu(&q->tag_set_list, &set->tag_list); 2317 2318 mutex_unlock(&set->tag_list_lock); 2319 } 2320 2321 /* 2322 * It is the actual release handler for mq, but we do it from 2323 * request queue's release handler for avoiding use-after-free 2324 * and headache because q->mq_kobj shouldn't have been introduced, 2325 * but we can't group ctx/kctx kobj without it. 2326 */ 2327 void blk_mq_release(struct request_queue *q) 2328 { 2329 struct blk_mq_hw_ctx *hctx; 2330 unsigned int i; 2331 2332 /* hctx kobj stays in hctx */ 2333 queue_for_each_hw_ctx(q, hctx, i) { 2334 if (!hctx) 2335 continue; 2336 kobject_put(&hctx->kobj); 2337 } 2338 2339 q->mq_map = NULL; 2340 2341 kfree(q->queue_hw_ctx); 2342 2343 /* 2344 * release .mq_kobj and sw queue's kobject now because 2345 * both share lifetime with request queue. 2346 */ 2347 blk_mq_sysfs_deinit(q); 2348 2349 free_percpu(q->queue_ctx); 2350 } 2351 2352 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 2353 { 2354 struct request_queue *uninit_q, *q; 2355 2356 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 2357 if (!uninit_q) 2358 return ERR_PTR(-ENOMEM); 2359 2360 q = blk_mq_init_allocated_queue(set, uninit_q); 2361 if (IS_ERR(q)) 2362 blk_cleanup_queue(uninit_q); 2363 2364 return q; 2365 } 2366 EXPORT_SYMBOL(blk_mq_init_queue); 2367 2368 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) 2369 { 2370 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2371 2372 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), 2373 __alignof__(struct blk_mq_hw_ctx)) != 2374 sizeof(struct blk_mq_hw_ctx)); 2375 2376 if (tag_set->flags & BLK_MQ_F_BLOCKING) 2377 hw_ctx_size += sizeof(struct srcu_struct); 2378 2379 return hw_ctx_size; 2380 } 2381 2382 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 2383 struct request_queue *q) 2384 { 2385 int i, j; 2386 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2387 2388 blk_mq_sysfs_unregister(q); 2389 for (i = 0; i < set->nr_hw_queues; i++) { 2390 int node; 2391 2392 if (hctxs[i]) 2393 continue; 2394 2395 node = blk_mq_hw_queue_to_node(q->mq_map, i); 2396 hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), 2397 GFP_KERNEL, node); 2398 if (!hctxs[i]) 2399 break; 2400 2401 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 2402 node)) { 2403 kfree(hctxs[i]); 2404 hctxs[i] = NULL; 2405 break; 2406 } 2407 2408 atomic_set(&hctxs[i]->nr_active, 0); 2409 hctxs[i]->numa_node = node; 2410 hctxs[i]->queue_num = i; 2411 2412 if (blk_mq_init_hctx(q, set, hctxs[i], i)) { 2413 free_cpumask_var(hctxs[i]->cpumask); 2414 kfree(hctxs[i]); 2415 hctxs[i] = NULL; 2416 break; 2417 } 2418 blk_mq_hctx_kobj_init(hctxs[i]); 2419 } 2420 for (j = i; j < q->nr_hw_queues; j++) { 2421 struct blk_mq_hw_ctx *hctx = hctxs[j]; 2422 2423 if (hctx) { 2424 if (hctx->tags) 2425 blk_mq_free_map_and_requests(set, j); 2426 blk_mq_exit_hctx(q, set, hctx, j); 2427 kobject_put(&hctx->kobj); 2428 hctxs[j] = NULL; 2429 2430 } 2431 } 2432 q->nr_hw_queues = i; 2433 blk_mq_sysfs_register(q); 2434 } 2435 2436 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 2437 struct request_queue *q) 2438 { 2439 /* mark the queue as mq asap */ 2440 q->mq_ops = set->ops; 2441 2442 
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, 2443 blk_mq_poll_stats_bkt, 2444 BLK_MQ_POLL_STATS_BKTS, q); 2445 if (!q->poll_cb) 2446 goto err_exit; 2447 2448 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2449 if (!q->queue_ctx) 2450 goto err_exit; 2451 2452 /* init q->mq_kobj and sw queues' kobjects */ 2453 blk_mq_sysfs_init(q); 2454 2455 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), 2456 GFP_KERNEL, set->numa_node); 2457 if (!q->queue_hw_ctx) 2458 goto err_percpu; 2459 2460 q->mq_map = set->mq_map; 2461 2462 blk_mq_realloc_hw_ctxs(set, q); 2463 if (!q->nr_hw_queues) 2464 goto err_hctxs; 2465 2466 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 2467 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 2468 2469 q->nr_queues = nr_cpu_ids; 2470 2471 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2472 2473 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 2474 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 2475 2476 q->sg_reserved_size = INT_MAX; 2477 2478 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 2479 INIT_LIST_HEAD(&q->requeue_list); 2480 spin_lock_init(&q->requeue_lock); 2481 2482 blk_queue_make_request(q, blk_mq_make_request); 2483 if (q->mq_ops->poll) 2484 q->poll_fn = blk_mq_poll; 2485 2486 /* 2487 * Do this after blk_queue_make_request() overrides it... 2488 */ 2489 q->nr_requests = set->queue_depth; 2490 2491 /* 2492 * Default to classic polling 2493 */ 2494 q->poll_nsec = -1; 2495 2496 if (set->ops->complete) 2497 blk_queue_softirq_done(q, set->ops->complete); 2498 2499 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2500 blk_mq_add_queue_tag_set(set, q); 2501 blk_mq_map_swqueue(q); 2502 2503 if (!(set->flags & BLK_MQ_F_NO_SCHED)) { 2504 int ret; 2505 2506 ret = blk_mq_sched_init(q); 2507 if (ret) 2508 return ERR_PTR(ret); 2509 } 2510 2511 return q; 2512 2513 err_hctxs: 2514 kfree(q->queue_hw_ctx); 2515 err_percpu: 2516 free_percpu(q->queue_ctx); 2517 err_exit: 2518 q->mq_ops = NULL; 2519 return ERR_PTR(-ENOMEM); 2520 } 2521 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2522 2523 void blk_mq_free_queue(struct request_queue *q) 2524 { 2525 struct blk_mq_tag_set *set = q->tag_set; 2526 2527 blk_mq_del_queue_tag_set(q); 2528 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2529 } 2530 2531 /* Basically redo blk_mq_init_queue with queue frozen */ 2532 static void blk_mq_queue_reinit(struct request_queue *q) 2533 { 2534 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); 2535 2536 blk_mq_debugfs_unregister_hctxs(q); 2537 blk_mq_sysfs_unregister(q); 2538 2539 /* 2540 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 2541 * we should change hctx numa_node according to the new topology (this 2542 * involves freeing and re-allocating memory, worth doing?) 2543 */ 2544 blk_mq_map_swqueue(q); 2545 2546 blk_mq_sysfs_register(q); 2547 blk_mq_debugfs_register_hctxs(q); 2548 } 2549 2550 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2551 { 2552 int i; 2553 2554 for (i = 0; i < set->nr_hw_queues; i++) 2555 if (!__blk_mq_alloc_rq_map(set, i)) 2556 goto out_unwind; 2557 2558 return 0; 2559 2560 out_unwind: 2561 while (--i >= 0) 2562 blk_mq_free_rq_map(set->tags[i]); 2563 2564 return -ENOMEM; 2565 } 2566 2567 /* 2568 * Allocate the request maps associated with this tag_set. Note that this 2569 * may reduce the depth asked for, if memory is tight. set->queue_depth 2570 * will be updated to reflect the allocated depth. 
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
			depth, set->queue_depth);

	return 0;
}

static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
	if (set->ops->map_queues)
		return set->ops->map_queues(set);
	else
		return blk_mq_map_queues(set);
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the adjusted
 * value will be stored in set->queue_depth. See the usage sketch at the
 * end of this file.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	int ret;

	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->ops->queue_rq)
		return -EINVAL;

	if (!set->ops->get_budget ^ !set->ops->put_budget)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}
	/*
	 * There is no use for more h/w queues than cpus.
2652 */ 2653 if (set->nr_hw_queues > nr_cpu_ids) 2654 set->nr_hw_queues = nr_cpu_ids; 2655 2656 set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), 2657 GFP_KERNEL, set->numa_node); 2658 if (!set->tags) 2659 return -ENOMEM; 2660 2661 ret = -ENOMEM; 2662 set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, 2663 GFP_KERNEL, set->numa_node); 2664 if (!set->mq_map) 2665 goto out_free_tags; 2666 2667 ret = blk_mq_update_queue_map(set); 2668 if (ret) 2669 goto out_free_mq_map; 2670 2671 ret = blk_mq_alloc_rq_maps(set); 2672 if (ret) 2673 goto out_free_mq_map; 2674 2675 mutex_init(&set->tag_list_lock); 2676 INIT_LIST_HEAD(&set->tag_list); 2677 2678 return 0; 2679 2680 out_free_mq_map: 2681 kfree(set->mq_map); 2682 set->mq_map = NULL; 2683 out_free_tags: 2684 kfree(set->tags); 2685 set->tags = NULL; 2686 return ret; 2687 } 2688 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 2689 2690 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 2691 { 2692 int i; 2693 2694 for (i = 0; i < nr_cpu_ids; i++) 2695 blk_mq_free_map_and_requests(set, i); 2696 2697 kfree(set->mq_map); 2698 set->mq_map = NULL; 2699 2700 kfree(set->tags); 2701 set->tags = NULL; 2702 } 2703 EXPORT_SYMBOL(blk_mq_free_tag_set); 2704 2705 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 2706 { 2707 struct blk_mq_tag_set *set = q->tag_set; 2708 struct blk_mq_hw_ctx *hctx; 2709 int i, ret; 2710 2711 if (!set) 2712 return -EINVAL; 2713 2714 blk_mq_freeze_queue(q); 2715 2716 ret = 0; 2717 queue_for_each_hw_ctx(q, hctx, i) { 2718 if (!hctx->tags) 2719 continue; 2720 /* 2721 * If we're using an MQ scheduler, just update the scheduler 2722 * queue depth. This is similar to what the old code would do. 2723 */ 2724 if (!hctx->sched_tags) { 2725 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, 2726 false); 2727 } else { 2728 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 2729 nr, true); 2730 } 2731 if (ret) 2732 break; 2733 } 2734 2735 if (!ret) 2736 q->nr_requests = nr; 2737 2738 blk_mq_unfreeze_queue(q); 2739 2740 return ret; 2741 } 2742 2743 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 2744 int nr_hw_queues) 2745 { 2746 struct request_queue *q; 2747 2748 lockdep_assert_held(&set->tag_list_lock); 2749 2750 if (nr_hw_queues > nr_cpu_ids) 2751 nr_hw_queues = nr_cpu_ids; 2752 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 2753 return; 2754 2755 list_for_each_entry(q, &set->tag_list, tag_set_list) 2756 blk_mq_freeze_queue(q); 2757 2758 set->nr_hw_queues = nr_hw_queues; 2759 blk_mq_update_queue_map(set); 2760 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2761 blk_mq_realloc_hw_ctxs(set, q); 2762 blk_mq_queue_reinit(q); 2763 } 2764 2765 list_for_each_entry(q, &set->tag_list, tag_set_list) 2766 blk_mq_unfreeze_queue(q); 2767 } 2768 2769 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 2770 { 2771 mutex_lock(&set->tag_list_lock); 2772 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 2773 mutex_unlock(&set->tag_list_lock); 2774 } 2775 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 2776 2777 /* Enable polling stats and return whether they were already enabled. 
*/ 2778 static bool blk_poll_stats_enable(struct request_queue *q) 2779 { 2780 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 2781 test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) 2782 return true; 2783 blk_stat_add_callback(q, q->poll_cb); 2784 return false; 2785 } 2786 2787 static void blk_mq_poll_stats_start(struct request_queue *q) 2788 { 2789 /* 2790 * We don't arm the callback if polling stats are not enabled or the 2791 * callback is already active. 2792 */ 2793 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 2794 blk_stat_is_active(q->poll_cb)) 2795 return; 2796 2797 blk_stat_activate_msecs(q->poll_cb, 100); 2798 } 2799 2800 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) 2801 { 2802 struct request_queue *q = cb->data; 2803 int bucket; 2804 2805 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { 2806 if (cb->stat[bucket].nr_samples) 2807 q->poll_stat[bucket] = cb->stat[bucket]; 2808 } 2809 } 2810 2811 static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 2812 struct blk_mq_hw_ctx *hctx, 2813 struct request *rq) 2814 { 2815 unsigned long ret = 0; 2816 int bucket; 2817 2818 /* 2819 * If stats collection isn't on, don't sleep but turn it on for 2820 * future users 2821 */ 2822 if (!blk_poll_stats_enable(q)) 2823 return 0; 2824 2825 /* 2826 * As an optimistic guess, use half of the mean service time 2827 * for this type of request. We can (and should) make this smarter. 2828 * For instance, if the completion latencies are tight, we can 2829 * get closer than just half the mean. This is especially 2830 * important on devices where the completion latencies are longer 2831 * than ~10 usec. We do use the stats for the relevant IO size 2832 * if available which does lead to better estimates. 2833 */ 2834 bucket = blk_mq_poll_stats_bkt(rq); 2835 if (bucket < 0) 2836 return ret; 2837 2838 if (q->poll_stat[bucket].nr_samples) 2839 ret = (q->poll_stat[bucket].mean + 1) / 2; 2840 2841 return ret; 2842 } 2843 2844 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, 2845 struct blk_mq_hw_ctx *hctx, 2846 struct request *rq) 2847 { 2848 struct hrtimer_sleeper hs; 2849 enum hrtimer_mode mode; 2850 unsigned int nsecs; 2851 ktime_t kt; 2852 2853 if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) 2854 return false; 2855 2856 /* 2857 * poll_nsec can be: 2858 * 2859 * -1: don't ever hybrid sleep 2860 * 0: use half of prev avg 2861 * >0: use this specific value 2862 */ 2863 if (q->poll_nsec == -1) 2864 return false; 2865 else if (q->poll_nsec > 0) 2866 nsecs = q->poll_nsec; 2867 else 2868 nsecs = blk_mq_poll_nsecs(q, hctx, rq); 2869 2870 if (!nsecs) 2871 return false; 2872 2873 set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 2874 2875 /* 2876 * This will be replaced with the stats tracking code, using 2877 * 'avg_completion_time / 2' as the pre-sleep target. 
2878 */ 2879 kt = nsecs; 2880 2881 mode = HRTIMER_MODE_REL; 2882 hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); 2883 hrtimer_set_expires(&hs.timer, kt); 2884 2885 hrtimer_init_sleeper(&hs, current); 2886 do { 2887 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 2888 break; 2889 set_current_state(TASK_UNINTERRUPTIBLE); 2890 hrtimer_start_expires(&hs.timer, mode); 2891 if (hs.task) 2892 io_schedule(); 2893 hrtimer_cancel(&hs.timer); 2894 mode = HRTIMER_MODE_ABS; 2895 } while (hs.task && !signal_pending(current)); 2896 2897 __set_current_state(TASK_RUNNING); 2898 destroy_hrtimer_on_stack(&hs.timer); 2899 return true; 2900 } 2901 2902 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) 2903 { 2904 struct request_queue *q = hctx->queue; 2905 long state; 2906 2907 /* 2908 * If we sleep, have the caller restart the poll loop to reset 2909 * the state. Like for the other success return cases, the 2910 * caller is responsible for checking if the IO completed. If 2911 * the IO isn't complete, we'll get called again and will go 2912 * straight to the busy poll loop. 2913 */ 2914 if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) 2915 return true; 2916 2917 hctx->poll_considered++; 2918 2919 state = current->state; 2920 while (!need_resched()) { 2921 int ret; 2922 2923 hctx->poll_invoked++; 2924 2925 ret = q->mq_ops->poll(hctx, rq->tag); 2926 if (ret > 0) { 2927 hctx->poll_success++; 2928 set_current_state(TASK_RUNNING); 2929 return true; 2930 } 2931 2932 if (signal_pending_state(state, current)) 2933 set_current_state(TASK_RUNNING); 2934 2935 if (current->state == TASK_RUNNING) 2936 return true; 2937 if (ret < 0) 2938 break; 2939 cpu_relax(); 2940 } 2941 2942 return false; 2943 } 2944 2945 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) 2946 { 2947 struct blk_mq_hw_ctx *hctx; 2948 struct request *rq; 2949 2950 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 2951 return false; 2952 2953 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 2954 if (!blk_qc_t_is_internal(cookie)) 2955 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 2956 else { 2957 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); 2958 /* 2959 * With scheduling, if the request has completed, we'll 2960 * get a NULL return here, as we clear the sched tag when 2961 * that happens. The request still remains valid, like always, 2962 * so we should be safe with just the NULL check. 2963 */ 2964 if (!rq) 2965 return false; 2966 } 2967 2968 return __blk_mq_poll(hctx, rq); 2969 } 2970 2971 static int __init blk_mq_init(void) 2972 { 2973 /* 2974 * See comment in block/blk.h rq_atomic_flags enum 2975 */ 2976 BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != 2977 (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); 2978 2979 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 2980 blk_mq_hctx_notify_dead); 2981 return 0; 2982 } 2983 subsys_initcall(blk_mq_init); 2984
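
/*
 * Illustrative sketch only (not part of blk-mq): how the request packing in
 * blk_mq_alloc_rqs() works out. The helper below repeats the same
 * round_up()/order_to_size() arithmetic; the numbers in the comment assume
 * a roughly 320-byte struct request and 64-byte cache lines, which are
 * example values rather than guarantees.
 */
static size_t __maybe_unused blk_mq_example_rqs_per_alloc(unsigned int cmd_size,
							  unsigned int order)
{
	/* Request plus driver payload, rounded up to the cache line size */
	size_t rq_size = round_up(sizeof(struct request) + cmd_size,
				  cache_line_size());

	/*
	 * e.g. with a 320-byte request, cmd_size == 64 and 64-byte lines,
	 * rq_size is 384, so an order-0 page packs 4096 / 384 = 10 requests.
	 */
	return order_to_size(order) / rq_size;
}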
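
/*
 * Illustrative sketch only of the freeze/update/unfreeze pattern that
 * blk_mq_update_tag_set_depth() and blk_mq_update_nr_requests() rely on:
 * draining q_usage_counter guarantees that no request is in flight while
 * per-queue structures are changed. "example_mutate" is a caller-supplied
 * stand-in for the actual modification.
 */
static void __maybe_unused example_frozen_update(struct request_queue *q,
				void (*example_mutate)(struct request_queue *q))
{
	blk_mq_freeze_queue(q);		/* wait for q_usage_counter to drain */
	example_mutate(q);
	blk_mq_unfreeze_queue(q);
}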
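
/*
 * Illustrative sketch only: how a hypothetical driver pairs
 * blk_mq_alloc_tag_set() and blk_mq_init_queue() with their teardown
 * counterparts. None of the example_* identifiers exist in the tree; a
 * real driver also sizes nr_hw_queues, queue_depth, cmd_size and
 * numa_node to match its hardware, and issues bd->rq to that hardware
 * instead of completing it immediately.
 */
static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	/* A real ->queue_rq() hands bd->rq to the device here. */
	blk_mq_start_request(bd->rq);
	blk_mq_end_request(bd->rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops example_mq_ops = {
	.queue_rq	= example_queue_rq,
};

static int __maybe_unused example_setup_queue(struct blk_mq_tag_set *set,
					      struct request_queue **qp)
{
	struct request_queue *q;
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = &example_mq_ops;
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	set->numa_node = NUMA_NO_NODE;
	set->flags = BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(set);	/* may shrink set->queue_depth */
	if (ret)
		return ret;

	q = blk_mq_init_queue(set);
	if (IS_ERR(q)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(q);
	}

	*qp = q;
	return 0;
}

/*
 * Teardown for the sketch above runs in the opposite order:
 * blk_cleanup_queue(q) first, then blk_mq_free_tag_set(set).
 */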
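
/*
 * Illustrative sketch only: resizing the per-queue request depth at run
 * time, much as the queue/nr_requests sysfs attribute does.
 * blk_mq_update_nr_requests() freezes and unfreezes the queue internally;
 * "example_set_nr_requests" is a stand-in name, and the BLKDEV_MIN_RQ
 * clamp mirrors the sysfs store path.
 */
static int __maybe_unused example_set_nr_requests(struct request_queue *q,
						  unsigned int nr)
{
	if (nr < BLKDEV_MIN_RQ)
		nr = BLKDEV_MIN_RQ;

	return blk_mq_update_nr_requests(q, nr);
}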
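
/*
 * Illustrative sketch only: a driver that renegotiates its interrupt
 * vectors (say, after a controller reset) resizes the hardware queue
 * count like this. "new_nr_vecs" is a stand-in value;
 * blk_mq_update_nr_hw_queues() clamps it to nr_cpu_ids, freezes every
 * queue sharing the tag set, remaps the software queues and unfreezes.
 */
static void __maybe_unused example_resize_hw_queues(struct blk_mq_tag_set *set,
						    int new_nr_vecs)
{
	if (new_nr_vecs < 1)
		new_nr_vecs = 1;

	blk_mq_update_nr_hw_queues(set, new_nr_vecs);
}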
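
/*
 * Illustrative sketch only of the hybrid-polling heuristic above: with
 * q->poll_nsec == 0 the sleep target is half of the mean completion time
 * recorded for the matching stats bucket. "mean_ns" stands in for
 * q->poll_stat[bucket].mean.
 */
static u64 __maybe_unused example_hybrid_sleep_target(u64 mean_ns)
{
	/* Same rounding as blk_mq_poll_nsecs() */
	return (mean_ns + 1) / 2;
}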
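
/*
 * Illustrative sketch only: how a synchronous submitter spins on a
 * completion cookie. blk_poll() (in blk-core.c) forwards to the
 * blk_mq_poll() poll_fn installed above; "cookie" is the blk_qc_t that
 * the submission path returned.
 */
static bool __maybe_unused example_poll_for_completion(struct request_queue *q,
						       blk_qc_t cookie)
{
	if (!blk_qc_t_valid(cookie))
		return false;

	return blk_poll(q, cookie);
}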