1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/sched/topology.h> 24 #include <linux/sched/signal.h> 25 #include <linux/delay.h> 26 #include <linux/crash_dump.h> 27 #include <linux/prefetch.h> 28 29 #include <trace/events/block.h> 30 31 #include <linux/blk-mq.h> 32 #include "blk.h" 33 #include "blk-mq.h" 34 #include "blk-mq-debugfs.h" 35 #include "blk-mq-tag.h" 36 #include "blk-pm.h" 37 #include "blk-stat.h" 38 #include "blk-mq-sched.h" 39 #include "blk-rq-qos.h" 40 41 static void blk_mq_poll_stats_start(struct request_queue *q); 42 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 43 44 static int blk_mq_poll_stats_bkt(const struct request *rq) 45 { 46 int ddir, bytes, bucket; 47 48 ddir = rq_data_dir(rq); 49 bytes = blk_rq_bytes(rq); 50 51 bucket = ddir + 2*(ilog2(bytes) - 9); 52 53 if (bucket < 0) 54 return -1; 55 else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 56 return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 57 58 return bucket; 59 } 60 61 /* 62 * Check if any of the ctx's have pending work in this hardware queue 63 */ 64 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 65 { 66 return !list_empty_careful(&hctx->dispatch) || 67 sbitmap_any_bit_set(&hctx->ctx_map) || 68 blk_mq_sched_has_work(hctx); 69 } 70 71 /* 72 * Mark this ctx as having pending work in this hardware queue 73 */ 74 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 75 struct blk_mq_ctx *ctx) 76 { 77 const int bit = ctx->index_hw[hctx->type]; 78 79 if (!sbitmap_test_bit(&hctx->ctx_map, bit)) 80 sbitmap_set_bit(&hctx->ctx_map, bit); 81 } 82 83 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 84 struct blk_mq_ctx *ctx) 85 { 86 const int bit = ctx->index_hw[hctx->type]; 87 88 sbitmap_clear_bit(&hctx->ctx_map, bit); 89 } 90 91 struct mq_inflight { 92 struct hd_struct *part; 93 unsigned int *inflight; 94 }; 95 96 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 97 struct request *rq, void *priv, 98 bool reserved) 99 { 100 struct mq_inflight *mi = priv; 101 102 /* 103 * index[0] counts the specific partition that was asked for. 
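* (Only inflight[0] is filled in here; the per-direction split is done by blk_mq_check_inflight_rw() below.)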
104 */ 105 if (rq->part == mi->part) 106 mi->inflight[0]++; 107 108 return true; 109 } 110 111 unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) 112 { 113 unsigned inflight[2]; 114 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 115 116 inflight[0] = inflight[1] = 0; 117 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 118 119 return inflight[0]; 120 } 121 122 static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, 123 struct request *rq, void *priv, 124 bool reserved) 125 { 126 struct mq_inflight *mi = priv; 127 128 if (rq->part == mi->part) 129 mi->inflight[rq_data_dir(rq)]++; 130 131 return true; 132 } 133 134 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 135 unsigned int inflight[2]) 136 { 137 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 138 139 inflight[0] = inflight[1] = 0; 140 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); 141 } 142 143 void blk_freeze_queue_start(struct request_queue *q) 144 { 145 int freeze_depth; 146 147 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 148 if (freeze_depth == 1) { 149 percpu_ref_kill(&q->q_usage_counter); 150 if (queue_is_mq(q)) 151 blk_mq_run_hw_queues(q, false); 152 } 153 } 154 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 155 156 void blk_mq_freeze_queue_wait(struct request_queue *q) 157 { 158 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 159 } 160 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 161 162 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 163 unsigned long timeout) 164 { 165 return wait_event_timeout(q->mq_freeze_wq, 166 percpu_ref_is_zero(&q->q_usage_counter), 167 timeout); 168 } 169 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 170 171 /* 172 * Guarantee no request is in use, so we can change any data structure of 173 * the queue afterward. 174 */ 175 void blk_freeze_queue(struct request_queue *q) 176 { 177 /* 178 * In the !blk_mq case we are only calling this to kill the 179 * q_usage_counter, otherwise this increases the freeze depth 180 * and waits for it to return to zero. For this reason there is 181 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 182 * exported to drivers as the only user for unfreeze is blk_mq. 183 */ 184 blk_freeze_queue_start(q); 185 blk_mq_freeze_queue_wait(q); 186 } 187 188 void blk_mq_freeze_queue(struct request_queue *q) 189 { 190 /* 191 * ...just an alias to keep freeze and unfreeze actions balanced 192 * in the blk_mq_* namespace 193 */ 194 blk_freeze_queue(q); 195 } 196 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 197 198 void blk_mq_unfreeze_queue(struct request_queue *q) 199 { 200 int freeze_depth; 201 202 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 203 WARN_ON_ONCE(freeze_depth < 0); 204 if (!freeze_depth) { 205 percpu_ref_resurrect(&q->q_usage_counter); 206 wake_up_all(&q->mq_freeze_wq); 207 } 208 } 209 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 210 211 /* 212 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the 213 * mpt3sas driver such that this function can be removed. 214 */ 215 void blk_mq_quiesce_queue_nowait(struct request_queue *q) 216 { 217 blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); 218 } 219 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); 220 221 /** 222 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished 223 * @q: request queue. 224 * 225 * Note: this function does not prevent that the struct request end_io() 226 * callback function is invoked. 
Once this function is returned, we make 227 * sure no dispatch can happen until the queue is unquiesced via 228 * blk_mq_unquiesce_queue(). 229 */ 230 void blk_mq_quiesce_queue(struct request_queue *q) 231 { 232 struct blk_mq_hw_ctx *hctx; 233 unsigned int i; 234 bool rcu = false; 235 236 blk_mq_quiesce_queue_nowait(q); 237 238 queue_for_each_hw_ctx(q, hctx, i) { 239 if (hctx->flags & BLK_MQ_F_BLOCKING) 240 synchronize_srcu(hctx->srcu); 241 else 242 rcu = true; 243 } 244 if (rcu) 245 synchronize_rcu(); 246 } 247 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 248 249 /* 250 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 251 * @q: request queue. 252 * 253 * This function recovers queue into the state before quiescing 254 * which is done by blk_mq_quiesce_queue. 255 */ 256 void blk_mq_unquiesce_queue(struct request_queue *q) 257 { 258 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 259 260 /* dispatch requests which are inserted during quiescing */ 261 blk_mq_run_hw_queues(q, true); 262 } 263 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 264 265 void blk_mq_wake_waiters(struct request_queue *q) 266 { 267 struct blk_mq_hw_ctx *hctx; 268 unsigned int i; 269 270 queue_for_each_hw_ctx(q, hctx, i) 271 if (blk_mq_hw_queue_mapped(hctx)) 272 blk_mq_tag_wakeup_all(hctx->tags, true); 273 } 274 275 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 276 { 277 return blk_mq_has_free_tags(hctx->tags); 278 } 279 EXPORT_SYMBOL(blk_mq_can_queue); 280 281 /* 282 * Only need start/end time stamping if we have stats enabled, or using 283 * an IO scheduler. 284 */ 285 static inline bool blk_mq_need_time_stamp(struct request *rq) 286 { 287 return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; 288 } 289 290 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 291 unsigned int tag, unsigned int op) 292 { 293 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 294 struct request *rq = tags->static_rqs[tag]; 295 req_flags_t rq_flags = 0; 296 297 if (data->flags & BLK_MQ_REQ_INTERNAL) { 298 rq->tag = -1; 299 rq->internal_tag = tag; 300 } else { 301 if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { 302 rq_flags = RQF_MQ_INFLIGHT; 303 atomic_inc(&data->hctx->nr_active); 304 } 305 rq->tag = tag; 306 rq->internal_tag = -1; 307 data->hctx->tags->rqs[rq->tag] = rq; 308 } 309 310 /* csd/requeue_work/fifo_time is initialized before use */ 311 rq->q = data->q; 312 rq->mq_ctx = data->ctx; 313 rq->mq_hctx = data->hctx; 314 rq->rq_flags = rq_flags; 315 rq->cmd_flags = op; 316 if (data->flags & BLK_MQ_REQ_PREEMPT) 317 rq->rq_flags |= RQF_PREEMPT; 318 if (blk_queue_io_stat(data->q)) 319 rq->rq_flags |= RQF_IO_STAT; 320 INIT_LIST_HEAD(&rq->queuelist); 321 INIT_HLIST_NODE(&rq->hash); 322 RB_CLEAR_NODE(&rq->rb_node); 323 rq->rq_disk = NULL; 324 rq->part = NULL; 325 if (blk_mq_need_time_stamp(rq)) 326 rq->start_time_ns = ktime_get_ns(); 327 else 328 rq->start_time_ns = 0; 329 rq->io_start_time_ns = 0; 330 rq->nr_phys_segments = 0; 331 #if defined(CONFIG_BLK_DEV_INTEGRITY) 332 rq->nr_integrity_segments = 0; 333 #endif 334 /* tag was already set */ 335 rq->extra_len = 0; 336 WRITE_ONCE(rq->deadline, 0); 337 338 rq->timeout = 0; 339 340 rq->end_io = NULL; 341 rq->end_io_data = NULL; 342 343 data->ctx->rq_dispatched[op_is_sync(op)]++; 344 refcount_set(&rq->ref, 1); 345 return rq; 346 } 347 348 static struct request *blk_mq_get_request(struct request_queue *q, 349 struct bio *bio, 350 struct blk_mq_alloc_data *data) 351 { 352 struct elevator_queue *e = q->elevator; 353 struct request *rq; 354 unsigned int tag; 
355 bool put_ctx_on_error = false; 356 357 blk_queue_enter_live(q); 358 data->q = q; 359 if (likely(!data->ctx)) { 360 data->ctx = blk_mq_get_ctx(q); 361 put_ctx_on_error = true; 362 } 363 if (likely(!data->hctx)) 364 data->hctx = blk_mq_map_queue(q, data->cmd_flags, 365 data->ctx); 366 if (data->cmd_flags & REQ_NOWAIT) 367 data->flags |= BLK_MQ_REQ_NOWAIT; 368 369 if (e) { 370 data->flags |= BLK_MQ_REQ_INTERNAL; 371 372 /* 373 * Flush requests are special and go directly to the 374 * dispatch list. Don't include reserved tags in the 375 * limiting, as it isn't useful. 376 */ 377 if (!op_is_flush(data->cmd_flags) && 378 e->type->ops.limit_depth && 379 !(data->flags & BLK_MQ_REQ_RESERVED)) 380 e->type->ops.limit_depth(data->cmd_flags, data); 381 } else { 382 blk_mq_tag_busy(data->hctx); 383 } 384 385 tag = blk_mq_get_tag(data); 386 if (tag == BLK_MQ_TAG_FAIL) { 387 if (put_ctx_on_error) { 388 blk_mq_put_ctx(data->ctx); 389 data->ctx = NULL; 390 } 391 blk_queue_exit(q); 392 return NULL; 393 } 394 395 rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); 396 if (!op_is_flush(data->cmd_flags)) { 397 rq->elv.icq = NULL; 398 if (e && e->type->ops.prepare_request) { 399 if (e->type->icq_cache) 400 blk_mq_sched_assign_ioc(rq); 401 402 e->type->ops.prepare_request(rq, bio); 403 rq->rq_flags |= RQF_ELVPRIV; 404 } 405 } 406 data->hctx->queued++; 407 return rq; 408 } 409 410 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 411 blk_mq_req_flags_t flags) 412 { 413 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; 414 struct request *rq; 415 int ret; 416 417 ret = blk_queue_enter(q, flags); 418 if (ret) 419 return ERR_PTR(ret); 420 421 rq = blk_mq_get_request(q, NULL, &alloc_data); 422 blk_queue_exit(q); 423 424 if (!rq) 425 return ERR_PTR(-EWOULDBLOCK); 426 427 blk_mq_put_ctx(alloc_data.ctx); 428 429 rq->__data_len = 0; 430 rq->__sector = (sector_t) -1; 431 rq->bio = rq->biotail = NULL; 432 return rq; 433 } 434 EXPORT_SYMBOL(blk_mq_alloc_request); 435 436 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 437 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 438 { 439 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; 440 struct request *rq; 441 unsigned int cpu; 442 int ret; 443 444 /* 445 * If the tag allocator sleeps we could get an allocation for a 446 * different hardware context. No need to complicate the low level 447 * allocator for this for the rare use case of a command tied to 448 * a specific queue. 449 */ 450 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 451 return ERR_PTR(-EINVAL); 452 453 if (hctx_idx >= q->nr_hw_queues) 454 return ERR_PTR(-EIO); 455 456 ret = blk_queue_enter(q, flags); 457 if (ret) 458 return ERR_PTR(ret); 459 460 /* 461 * Check if the hardware context is actually mapped to anything. 462 * If not tell the caller that it should skip this queue. 
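* (Returning -EXDEV below lets the caller tell an unmapped hctx apart from an ordinary allocation failure.)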
463 */ 464 alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; 465 if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { 466 blk_queue_exit(q); 467 return ERR_PTR(-EXDEV); 468 } 469 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); 470 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 471 472 rq = blk_mq_get_request(q, NULL, &alloc_data); 473 blk_queue_exit(q); 474 475 if (!rq) 476 return ERR_PTR(-EWOULDBLOCK); 477 478 return rq; 479 } 480 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 481 482 static void __blk_mq_free_request(struct request *rq) 483 { 484 struct request_queue *q = rq->q; 485 struct blk_mq_ctx *ctx = rq->mq_ctx; 486 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 487 const int sched_tag = rq->internal_tag; 488 489 blk_pm_mark_last_busy(rq); 490 rq->mq_hctx = NULL; 491 if (rq->tag != -1) 492 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 493 if (sched_tag != -1) 494 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); 495 blk_mq_sched_restart(hctx); 496 blk_queue_exit(q); 497 } 498 499 void blk_mq_free_request(struct request *rq) 500 { 501 struct request_queue *q = rq->q; 502 struct elevator_queue *e = q->elevator; 503 struct blk_mq_ctx *ctx = rq->mq_ctx; 504 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 505 506 if (rq->rq_flags & RQF_ELVPRIV) { 507 if (e && e->type->ops.finish_request) 508 e->type->ops.finish_request(rq); 509 if (rq->elv.icq) { 510 put_io_context(rq->elv.icq->ioc); 511 rq->elv.icq = NULL; 512 } 513 } 514 515 ctx->rq_completed[rq_is_sync(rq)]++; 516 if (rq->rq_flags & RQF_MQ_INFLIGHT) 517 atomic_dec(&hctx->nr_active); 518 519 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 520 laptop_io_completion(q->backing_dev_info); 521 522 rq_qos_done(q, rq); 523 524 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 525 if (refcount_dec_and_test(&rq->ref)) 526 __blk_mq_free_request(rq); 527 } 528 EXPORT_SYMBOL_GPL(blk_mq_free_request); 529 530 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 531 { 532 u64 now = 0; 533 534 if (blk_mq_need_time_stamp(rq)) 535 now = ktime_get_ns(); 536 537 if (rq->rq_flags & RQF_STATS) { 538 blk_mq_poll_stats_start(rq->q); 539 blk_stat_add(rq, now); 540 } 541 542 if (rq->internal_tag != -1) 543 blk_mq_sched_completed_request(rq, now); 544 545 blk_account_io_done(rq, now); 546 547 if (rq->end_io) { 548 rq_qos_done(rq->q, rq); 549 rq->end_io(rq, error); 550 } else { 551 blk_mq_free_request(rq); 552 } 553 } 554 EXPORT_SYMBOL(__blk_mq_end_request); 555 556 void blk_mq_end_request(struct request *rq, blk_status_t error) 557 { 558 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 559 BUG(); 560 __blk_mq_end_request(rq, error); 561 } 562 EXPORT_SYMBOL(blk_mq_end_request); 563 564 static void __blk_mq_complete_request_remote(void *data) 565 { 566 struct request *rq = data; 567 struct request_queue *q = rq->q; 568 569 q->mq_ops->complete(rq); 570 } 571 572 static void __blk_mq_complete_request(struct request *rq) 573 { 574 struct blk_mq_ctx *ctx = rq->mq_ctx; 575 struct request_queue *q = rq->q; 576 bool shared = false; 577 int cpu; 578 579 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); 580 /* 581 * On most single queue controllers there is only one irq vector 582 * for handling IO completion, and that irq's affinity is set 583 * to all possible CPUs. On most architectures, this affinity means the 584 * irq is handled on one specific CPU. 585 * 586 * So complete the IO request in softirq context in the single queue 587 * case so IO performance isn't degraded by irqs-off latency.
588 */ 589 if (q->nr_hw_queues == 1) { 590 __blk_complete_request(rq); 591 return; 592 } 593 594 /* 595 * For a polled request, always complete locally; it's pointless 596 * to redirect the completion. 597 */ 598 if ((rq->cmd_flags & REQ_HIPRI) || 599 !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { 600 q->mq_ops->complete(rq); 601 return; 602 } 603 604 cpu = get_cpu(); 605 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) 606 shared = cpus_share_cache(cpu, ctx->cpu); 607 608 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 609 rq->csd.func = __blk_mq_complete_request_remote; 610 rq->csd.info = rq; 611 rq->csd.flags = 0; 612 smp_call_function_single_async(ctx->cpu, &rq->csd); 613 } else { 614 q->mq_ops->complete(rq); 615 } 616 put_cpu(); 617 } 618 619 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) 620 __releases(hctx->srcu) 621 { 622 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) 623 rcu_read_unlock(); 624 else 625 srcu_read_unlock(hctx->srcu, srcu_idx); 626 } 627 628 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) 629 __acquires(hctx->srcu) 630 { 631 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 632 /* shut up gcc false positive */ 633 *srcu_idx = 0; 634 rcu_read_lock(); 635 } else 636 *srcu_idx = srcu_read_lock(hctx->srcu); 637 } 638 639 /** 640 * blk_mq_complete_request - end I/O on a request 641 * @rq: the request being processed 642 * 643 * Description: 644 * Ends all I/O on a request. It does not handle partial completions. 645 * The actual completion happens out-of-order, through an IPI handler. 646 **/ 647 bool blk_mq_complete_request(struct request *rq) 648 { 649 if (unlikely(blk_should_fake_timeout(rq->q))) 650 return false; 651 __blk_mq_complete_request(rq); 652 return true; 653 } 654 EXPORT_SYMBOL(blk_mq_complete_request); 655 656 int blk_mq_request_started(struct request *rq) 657 { 658 return blk_mq_rq_state(rq) != MQ_RQ_IDLE; 659 } 660 EXPORT_SYMBOL_GPL(blk_mq_request_started); 661 662 void blk_mq_start_request(struct request *rq) 663 { 664 struct request_queue *q = rq->q; 665 666 blk_mq_sched_started_request(rq); 667 668 trace_block_rq_issue(q, rq); 669 670 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 671 rq->io_start_time_ns = ktime_get_ns(); 672 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 673 rq->throtl_size = blk_rq_sectors(rq); 674 #endif 675 rq->rq_flags |= RQF_STATS; 676 rq_qos_issue(q, rq); 677 } 678 679 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 680 681 blk_add_timer(rq); 682 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); 683 684 if (q->dma_drain_size && blk_rq_bytes(rq)) { 685 /* 686 * Make sure space for the drain appears. We know we can do 687 * this because max_hw_segments has been adjusted to be one 688 * fewer than the device can handle.
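* (The extra drain segment is dropped again in __blk_mq_requeue_request() if the request gets requeued.)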
689 */ 690 rq->nr_phys_segments++; 691 } 692 } 693 EXPORT_SYMBOL(blk_mq_start_request); 694 695 static void __blk_mq_requeue_request(struct request *rq) 696 { 697 struct request_queue *q = rq->q; 698 699 blk_mq_put_driver_tag(rq); 700 701 trace_block_rq_requeue(q, rq); 702 rq_qos_requeue(q, rq); 703 704 if (blk_mq_request_started(rq)) { 705 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 706 rq->rq_flags &= ~RQF_TIMED_OUT; 707 if (q->dma_drain_size && blk_rq_bytes(rq)) 708 rq->nr_phys_segments--; 709 } 710 } 711 712 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 713 { 714 __blk_mq_requeue_request(rq); 715 716 /* this request will be re-inserted to io scheduler queue */ 717 blk_mq_sched_requeue_request(rq); 718 719 BUG_ON(!list_empty(&rq->queuelist)); 720 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 721 } 722 EXPORT_SYMBOL(blk_mq_requeue_request); 723 724 static void blk_mq_requeue_work(struct work_struct *work) 725 { 726 struct request_queue *q = 727 container_of(work, struct request_queue, requeue_work.work); 728 LIST_HEAD(rq_list); 729 struct request *rq, *next; 730 731 spin_lock_irq(&q->requeue_lock); 732 list_splice_init(&q->requeue_list, &rq_list); 733 spin_unlock_irq(&q->requeue_lock); 734 735 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 736 if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) 737 continue; 738 739 rq->rq_flags &= ~RQF_SOFTBARRIER; 740 list_del_init(&rq->queuelist); 741 /* 742 * If RQF_DONTPREP, rq has contained some driver specific 743 * data, so insert it to hctx dispatch list to avoid any 744 * merge. 745 */ 746 if (rq->rq_flags & RQF_DONTPREP) 747 blk_mq_request_bypass_insert(rq, false); 748 else 749 blk_mq_sched_insert_request(rq, true, false, false); 750 } 751 752 while (!list_empty(&rq_list)) { 753 rq = list_entry(rq_list.next, struct request, queuelist); 754 list_del_init(&rq->queuelist); 755 blk_mq_sched_insert_request(rq, false, false, false); 756 } 757 758 blk_mq_run_hw_queues(q, false); 759 } 760 761 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, 762 bool kick_requeue_list) 763 { 764 struct request_queue *q = rq->q; 765 unsigned long flags; 766 767 /* 768 * We abuse this flag that is otherwise used by the I/O scheduler to 769 * request head insertion from the workqueue. 
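* (blk_mq_requeue_work() clears RQF_SOFTBARRIER again before the request is reinserted.)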
770 */ 771 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 772 773 spin_lock_irqsave(&q->requeue_lock, flags); 774 if (at_head) { 775 rq->rq_flags |= RQF_SOFTBARRIER; 776 list_add(&rq->queuelist, &q->requeue_list); 777 } else { 778 list_add_tail(&rq->queuelist, &q->requeue_list); 779 } 780 spin_unlock_irqrestore(&q->requeue_lock, flags); 781 782 if (kick_requeue_list) 783 blk_mq_kick_requeue_list(q); 784 } 785 786 void blk_mq_kick_requeue_list(struct request_queue *q) 787 { 788 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); 789 } 790 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 791 792 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 793 unsigned long msecs) 794 { 795 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 796 msecs_to_jiffies(msecs)); 797 } 798 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 799 800 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 801 { 802 if (tag < tags->nr_tags) { 803 prefetch(tags->rqs[tag]); 804 return tags->rqs[tag]; 805 } 806 807 return NULL; 808 } 809 EXPORT_SYMBOL(blk_mq_tag_to_rq); 810 811 static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, 812 void *priv, bool reserved) 813 { 814 /* 815 * If we find a request that is inflight and the queue matches, 816 * we know the queue is busy. Return false to stop the iteration. 817 */ 818 if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { 819 bool *busy = priv; 820 821 *busy = true; 822 return false; 823 } 824 825 return true; 826 } 827 828 bool blk_mq_queue_inflight(struct request_queue *q) 829 { 830 bool busy = false; 831 832 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); 833 return busy; 834 } 835 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); 836 837 static void blk_mq_rq_timed_out(struct request *req, bool reserved) 838 { 839 req->rq_flags |= RQF_TIMED_OUT; 840 if (req->q->mq_ops->timeout) { 841 enum blk_eh_timer_return ret; 842 843 ret = req->q->mq_ops->timeout(req, reserved); 844 if (ret == BLK_EH_DONE) 845 return; 846 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 847 } 848 849 blk_add_timer(req); 850 } 851 852 static bool blk_mq_req_expired(struct request *rq, unsigned long *next) 853 { 854 unsigned long deadline; 855 856 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 857 return false; 858 if (rq->rq_flags & RQF_TIMED_OUT) 859 return false; 860 861 deadline = READ_ONCE(rq->deadline); 862 if (time_after_eq(jiffies, deadline)) 863 return true; 864 865 if (*next == 0) 866 *next = deadline; 867 else if (time_after(*next, deadline)) 868 *next = deadline; 869 return false; 870 } 871 872 static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 873 struct request *rq, void *priv, bool reserved) 874 { 875 unsigned long *next = priv; 876 877 /* 878 * Just do a quick check if it is expired before locking the request in 879 * so we're not unnecessarily synchronizing across CPUs. 880 */ 881 if (!blk_mq_req_expired(rq, next)) 882 return true; 883 884 /* 885 * We have reason to believe the request may be expired. Take a 886 * reference on the request to lock this request lifetime into its 887 * currently allocated context to prevent it from being reallocated in 888 * the event the completion bypasses this timeout handler. 889 * 890 * If the reference was already released, then the driver beat the 891 * timeout handler to posting a natural completion.
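* (In that case refcount_inc_not_zero() below fails and the request is simply skipped.)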
892 */ 893 if (!refcount_inc_not_zero(&rq->ref)) 894 return true; 895 896 /* 897 * The request is now locked and cannot be reallocated underneath the 898 * timeout handler's processing. Re-verify this exact request is truly 899 * expired; if it is not expired, then the request was completed and 900 * reallocated as a new request. 901 */ 902 if (blk_mq_req_expired(rq, next)) 903 blk_mq_rq_timed_out(rq, reserved); 904 if (refcount_dec_and_test(&rq->ref)) 905 __blk_mq_free_request(rq); 906 907 return true; 908 } 909 910 static void blk_mq_timeout_work(struct work_struct *work) 911 { 912 struct request_queue *q = 913 container_of(work, struct request_queue, timeout_work); 914 unsigned long next = 0; 915 struct blk_mq_hw_ctx *hctx; 916 int i; 917 918 /* A deadlock might occur if a request is stuck requiring a 919 * timeout at the same time a queue freeze is waiting 920 * completion, since the timeout code would not be able to 921 * acquire the queue reference here. 922 * 923 * That's why we don't use blk_queue_enter here; instead, we use 924 * percpu_ref_tryget directly, because we need to be able to 925 * obtain a reference even in the short window between the queue 926 * starting to freeze, by dropping the first reference in 927 * blk_freeze_queue_start, and the moment the last request is 928 * consumed, marked by the instant q_usage_counter reaches 929 * zero. 930 */ 931 if (!percpu_ref_tryget(&q->q_usage_counter)) 932 return; 933 934 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); 935 936 if (next != 0) { 937 mod_timer(&q->timeout, next); 938 } else { 939 /* 940 * Request timeouts are handled as a forward rolling timer. If 941 * we end up here it means that no requests are pending and 942 * also that no request has been pending for a while. Mark 943 * each hctx as idle. 
944 */ 945 queue_for_each_hw_ctx(q, hctx, i) { 946 /* the hctx may be unmapped, so check it here */ 947 if (blk_mq_hw_queue_mapped(hctx)) 948 blk_mq_tag_idle(hctx); 949 } 950 } 951 blk_queue_exit(q); 952 } 953 954 struct flush_busy_ctx_data { 955 struct blk_mq_hw_ctx *hctx; 956 struct list_head *list; 957 }; 958 959 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 960 { 961 struct flush_busy_ctx_data *flush_data = data; 962 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 963 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 964 enum hctx_type type = hctx->type; 965 966 spin_lock(&ctx->lock); 967 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); 968 sbitmap_clear_bit(sb, bitnr); 969 spin_unlock(&ctx->lock); 970 return true; 971 } 972 973 /* 974 * Process software queues that have been marked busy, splicing them 975 * to the for-dispatch 976 */ 977 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 978 { 979 struct flush_busy_ctx_data data = { 980 .hctx = hctx, 981 .list = list, 982 }; 983 984 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 985 } 986 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 987 988 struct dispatch_rq_data { 989 struct blk_mq_hw_ctx *hctx; 990 struct request *rq; 991 }; 992 993 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, 994 void *data) 995 { 996 struct dispatch_rq_data *dispatch_data = data; 997 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 998 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 999 enum hctx_type type = hctx->type; 1000 1001 spin_lock(&ctx->lock); 1002 if (!list_empty(&ctx->rq_lists[type])) { 1003 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); 1004 list_del_init(&dispatch_data->rq->queuelist); 1005 if (list_empty(&ctx->rq_lists[type])) 1006 sbitmap_clear_bit(sb, bitnr); 1007 } 1008 spin_unlock(&ctx->lock); 1009 1010 return !dispatch_data->rq; 1011 } 1012 1013 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1014 struct blk_mq_ctx *start) 1015 { 1016 unsigned off = start ? 
start->index_hw[hctx->type] : 0; 1017 struct dispatch_rq_data data = { 1018 .hctx = hctx, 1019 .rq = NULL, 1020 }; 1021 1022 __sbitmap_for_each_set(&hctx->ctx_map, off, 1023 dispatch_rq_from_ctx, &data); 1024 1025 return data.rq; 1026 } 1027 1028 static inline unsigned int queued_to_index(unsigned int queued) 1029 { 1030 if (!queued) 1031 return 0; 1032 1033 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 1034 } 1035 1036 bool blk_mq_get_driver_tag(struct request *rq) 1037 { 1038 struct blk_mq_alloc_data data = { 1039 .q = rq->q, 1040 .hctx = rq->mq_hctx, 1041 .flags = BLK_MQ_REQ_NOWAIT, 1042 .cmd_flags = rq->cmd_flags, 1043 }; 1044 bool shared; 1045 1046 if (rq->tag != -1) 1047 goto done; 1048 1049 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 1050 data.flags |= BLK_MQ_REQ_RESERVED; 1051 1052 shared = blk_mq_tag_busy(data.hctx); 1053 rq->tag = blk_mq_get_tag(&data); 1054 if (rq->tag >= 0) { 1055 if (shared) { 1056 rq->rq_flags |= RQF_MQ_INFLIGHT; 1057 atomic_inc(&data.hctx->nr_active); 1058 } 1059 data.hctx->tags->rqs[rq->tag] = rq; 1060 } 1061 1062 done: 1063 return rq->tag != -1; 1064 } 1065 1066 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1067 int flags, void *key) 1068 { 1069 struct blk_mq_hw_ctx *hctx; 1070 1071 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1072 1073 spin_lock(&hctx->dispatch_wait_lock); 1074 list_del_init(&wait->entry); 1075 spin_unlock(&hctx->dispatch_wait_lock); 1076 1077 blk_mq_run_hw_queue(hctx, true); 1078 return 1; 1079 } 1080 1081 /* 1082 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1083 * the tag wakeups. For non-shared tags, we can simply mark us needing a 1084 * restart. For both cases, take care to check the condition again after 1085 * marking us as waiting. 1086 */ 1087 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1088 struct request *rq) 1089 { 1090 struct wait_queue_head *wq; 1091 wait_queue_entry_t *wait; 1092 bool ret; 1093 1094 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { 1095 blk_mq_sched_mark_restart_hctx(hctx); 1096 1097 /* 1098 * It's possible that a tag was freed in the window between the 1099 * allocation failure and adding the hardware queue to the wait 1100 * queue. 1101 * 1102 * Don't clear RESTART here, someone else could have set it. 1103 * At most this will cost an extra queue run. 1104 */ 1105 return blk_mq_get_driver_tag(rq); 1106 } 1107 1108 wait = &hctx->dispatch_wait; 1109 if (!list_empty_careful(&wait->entry)) 1110 return false; 1111 1112 wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; 1113 1114 spin_lock_irq(&wq->lock); 1115 spin_lock(&hctx->dispatch_wait_lock); 1116 if (!list_empty(&wait->entry)) { 1117 spin_unlock(&hctx->dispatch_wait_lock); 1118 spin_unlock_irq(&wq->lock); 1119 return false; 1120 } 1121 1122 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 1123 __add_wait_queue(wq, wait); 1124 1125 /* 1126 * It's possible that a tag was freed in the window between the 1127 * allocation failure and adding the hardware queue to the wait 1128 * queue. 1129 */ 1130 ret = blk_mq_get_driver_tag(rq); 1131 if (!ret) { 1132 spin_unlock(&hctx->dispatch_wait_lock); 1133 spin_unlock_irq(&wq->lock); 1134 return false; 1135 } 1136 1137 /* 1138 * We got a tag, remove ourselves from the wait queue to ensure 1139 * someone else gets the wakeup. 
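* (Removal happens under dispatch_wait_lock, the same lock taken by blk_mq_dispatch_wake() on the wakeup side.)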
1140 */ 1141 list_del_init(&wait->entry); 1142 spin_unlock(&hctx->dispatch_wait_lock); 1143 spin_unlock_irq(&wq->lock); 1144 1145 return true; 1146 } 1147 1148 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 1149 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 1150 /* 1151 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): 1152 * - EWMA is one simple way to compute running average value 1153 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially 1154 * - take 4 as factor for avoiding to get too small(0) result, and this 1155 * factor doesn't matter because EWMA decreases exponentially 1156 */ 1157 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) 1158 { 1159 unsigned int ewma; 1160 1161 if (hctx->queue->elevator) 1162 return; 1163 1164 ewma = hctx->dispatch_busy; 1165 1166 if (!ewma && !busy) 1167 return; 1168 1169 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; 1170 if (busy) 1171 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; 1172 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; 1173 1174 hctx->dispatch_busy = ewma; 1175 } 1176 1177 #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 1178 1179 /* 1180 * Returns true if we did some work AND can potentially do more. 1181 */ 1182 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, 1183 bool got_budget) 1184 { 1185 struct blk_mq_hw_ctx *hctx; 1186 struct request *rq, *nxt; 1187 bool no_tag = false; 1188 int errors, queued; 1189 blk_status_t ret = BLK_STS_OK; 1190 1191 if (list_empty(list)) 1192 return false; 1193 1194 WARN_ON(!list_is_singular(list) && got_budget); 1195 1196 /* 1197 * Now process all the entries, sending them to the driver. 1198 */ 1199 errors = queued = 0; 1200 do { 1201 struct blk_mq_queue_data bd; 1202 1203 rq = list_first_entry(list, struct request, queuelist); 1204 1205 hctx = rq->mq_hctx; 1206 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) 1207 break; 1208 1209 if (!blk_mq_get_driver_tag(rq)) { 1210 /* 1211 * The initial allocation attempt failed, so we need to 1212 * rerun the hardware queue when a tag is freed. The 1213 * waitqueue takes care of that. If the queue is run 1214 * before we add this entry back on the dispatch list, 1215 * we'll re-run it below. 1216 */ 1217 if (!blk_mq_mark_tag_wait(hctx, rq)) { 1218 blk_mq_put_dispatch_budget(hctx); 1219 /* 1220 * For non-shared tags, the RESTART check 1221 * will suffice. 1222 */ 1223 if (hctx->flags & BLK_MQ_F_TAG_SHARED) 1224 no_tag = true; 1225 break; 1226 } 1227 } 1228 1229 list_del_init(&rq->queuelist); 1230 1231 bd.rq = rq; 1232 1233 /* 1234 * Flag last if we have no more requests, or if we have more 1235 * but can't assign a driver tag to it. 1236 */ 1237 if (list_empty(list)) 1238 bd.last = true; 1239 else { 1240 nxt = list_first_entry(list, struct request, queuelist); 1241 bd.last = !blk_mq_get_driver_tag(nxt); 1242 } 1243 1244 ret = q->mq_ops->queue_rq(hctx, &bd); 1245 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { 1246 /* 1247 * If an I/O scheduler has been configured and we got a 1248 * driver tag for the next request already, free it 1249 * again. 
1250 */ 1251 if (!list_empty(list)) { 1252 nxt = list_first_entry(list, struct request, queuelist); 1253 blk_mq_put_driver_tag(nxt); 1254 } 1255 list_add(&rq->queuelist, list); 1256 __blk_mq_requeue_request(rq); 1257 break; 1258 } 1259 1260 if (unlikely(ret != BLK_STS_OK)) { 1261 errors++; 1262 blk_mq_end_request(rq, BLK_STS_IOERR); 1263 continue; 1264 } 1265 1266 queued++; 1267 } while (!list_empty(list)); 1268 1269 hctx->dispatched[queued_to_index(queued)]++; 1270 1271 /* 1272 * Any items that need requeuing? Stuff them into hctx->dispatch, 1273 * that is where we will continue on next queue run. 1274 */ 1275 if (!list_empty(list)) { 1276 bool needs_restart; 1277 1278 /* 1279 * If we didn't flush the entire list, we could have told 1280 * the driver there was more coming, but that turned out to 1281 * be a lie. 1282 */ 1283 if (q->mq_ops->commit_rqs) 1284 q->mq_ops->commit_rqs(hctx); 1285 1286 spin_lock(&hctx->lock); 1287 list_splice_init(list, &hctx->dispatch); 1288 spin_unlock(&hctx->lock); 1289 1290 /* 1291 * If SCHED_RESTART was set by the caller of this function and 1292 * it is no longer set that means that it was cleared by another 1293 * thread and hence that a queue rerun is needed. 1294 * 1295 * If 'no_tag' is set, that means that we failed getting 1296 * a driver tag with an I/O scheduler attached. If our dispatch 1297 * waitqueue is no longer active, ensure that we run the queue 1298 * AFTER adding our entries back to the list. 1299 * 1300 * If no I/O scheduler has been configured it is possible that 1301 * the hardware queue got stopped and restarted before requests 1302 * were pushed back onto the dispatch list. Rerun the queue to 1303 * avoid starvation. Notes: 1304 * - blk_mq_run_hw_queue() checks whether or not a queue has 1305 * been stopped before rerunning a queue. 1306 * - Some but not all block drivers stop a queue before 1307 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1308 * and dm-rq. 1309 * 1310 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 1311 * bit is set, run queue after a delay to avoid IO stalls 1312 * that could otherwise occur if the queue is idle. 1313 */ 1314 needs_restart = blk_mq_sched_needs_restart(hctx); 1315 if (!needs_restart || 1316 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1317 blk_mq_run_hw_queue(hctx, true); 1318 else if (needs_restart && (ret == BLK_STS_RESOURCE)) 1319 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 1320 1321 blk_mq_update_dispatch_busy(hctx, true); 1322 return false; 1323 } else 1324 blk_mq_update_dispatch_busy(hctx, false); 1325 1326 /* 1327 * If the host/device is unable to accept more work, inform the 1328 * caller of that. 1329 */ 1330 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) 1331 return false; 1332 1333 return (queued + errors) != 0; 1334 } 1335 1336 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1337 { 1338 int srcu_idx; 1339 1340 /* 1341 * We should be running this queue from one of the CPUs that 1342 * are mapped to it. 
1343 * 1344 * There are at least two related races now between setting 1345 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running 1346 * __blk_mq_run_hw_queue(): 1347 * 1348 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), 1349 * but later it becomes online; then this warning is 1350 * harmless 1351 * 1352 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), 1353 * but later it becomes offline; then the warning can't be 1354 * triggered, and we depend on the blk-mq timeout handler to 1355 * handle requests dispatched to this hctx 1356 */ 1357 if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 1358 cpu_online(hctx->next_cpu)) { 1359 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", 1360 raw_smp_processor_id(), 1361 cpumask_empty(hctx->cpumask) ? "inactive": "active"); 1362 dump_stack(); 1363 } 1364 1365 /* 1366 * We can't run the queue inline with ints disabled. Ensure that 1367 * we catch bad users of this early. 1368 */ 1369 WARN_ON_ONCE(in_interrupt()); 1370 1371 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1372 1373 hctx_lock(hctx, &srcu_idx); 1374 blk_mq_sched_dispatch_requests(hctx); 1375 hctx_unlock(hctx, srcu_idx); 1376 } 1377 1378 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) 1379 { 1380 int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); 1381 1382 if (cpu >= nr_cpu_ids) 1383 cpu = cpumask_first(hctx->cpumask); 1384 return cpu; 1385 } 1386 1387 /* 1388 * It'd be great if the workqueue API had a way to pass 1389 * in a mask and had some smarts for more clever placement. 1390 * For now we just round-robin here, switching for every 1391 * BLK_MQ_CPU_WORK_BATCH queued items. 1392 */ 1393 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1394 { 1395 bool tried = false; 1396 int next_cpu = hctx->next_cpu; 1397 1398 if (hctx->queue->nr_hw_queues == 1) 1399 return WORK_CPU_UNBOUND; 1400 1401 if (--hctx->next_cpu_batch <= 0) { 1402 select_cpu: 1403 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, 1404 cpu_online_mask); 1405 if (next_cpu >= nr_cpu_ids) 1406 next_cpu = blk_mq_first_mapped_cpu(hctx); 1407 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1408 } 1409 1410 /* 1411 * Fall back to an unbound schedule if we can't find an online CPU 1412 * for this hctx; this should only happen on the CPU hotplug (CPU DEAD) path. 1413 */ 1414 if (!cpu_online(next_cpu)) { 1415 if (!tried) { 1416 tried = true; 1417 goto select_cpu; 1418 } 1419 1420 /* 1421 * Make sure to re-select the CPU next time once CPUs 1422 * in hctx->cpumask come online again.
1423 */ 1424 hctx->next_cpu = next_cpu; 1425 hctx->next_cpu_batch = 1; 1426 return WORK_CPU_UNBOUND; 1427 } 1428 1429 hctx->next_cpu = next_cpu; 1430 return next_cpu; 1431 } 1432 1433 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1434 unsigned long msecs) 1435 { 1436 if (unlikely(blk_mq_hctx_stopped(hctx))) 1437 return; 1438 1439 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 1440 int cpu = get_cpu(); 1441 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1442 __blk_mq_run_hw_queue(hctx); 1443 put_cpu(); 1444 return; 1445 } 1446 1447 put_cpu(); 1448 } 1449 1450 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 1451 msecs_to_jiffies(msecs)); 1452 } 1453 1454 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1455 { 1456 __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1457 } 1458 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1459 1460 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1461 { 1462 int srcu_idx; 1463 bool need_run; 1464 1465 /* 1466 * When queue is quiesced, we may be switching io scheduler, or 1467 * updating nr_hw_queues, or other things, and we can't run queue 1468 * any more, even __blk_mq_hctx_has_pending() can't be called safely. 1469 * 1470 * And queue will be rerun in blk_mq_unquiesce_queue() if it is 1471 * quiesced. 1472 */ 1473 hctx_lock(hctx, &srcu_idx); 1474 need_run = !blk_queue_quiesced(hctx->queue) && 1475 blk_mq_hctx_has_pending(hctx); 1476 hctx_unlock(hctx, srcu_idx); 1477 1478 if (need_run) { 1479 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1480 return true; 1481 } 1482 1483 return false; 1484 } 1485 EXPORT_SYMBOL(blk_mq_run_hw_queue); 1486 1487 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1488 { 1489 struct blk_mq_hw_ctx *hctx; 1490 int i; 1491 1492 queue_for_each_hw_ctx(q, hctx, i) { 1493 if (blk_mq_hctx_stopped(hctx)) 1494 continue; 1495 1496 blk_mq_run_hw_queue(hctx, async); 1497 } 1498 } 1499 EXPORT_SYMBOL(blk_mq_run_hw_queues); 1500 1501 /** 1502 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1503 * @q: request queue. 1504 * 1505 * The caller is responsible for serializing this function against 1506 * blk_mq_{start,stop}_hw_queue(). 1507 */ 1508 bool blk_mq_queue_stopped(struct request_queue *q) 1509 { 1510 struct blk_mq_hw_ctx *hctx; 1511 int i; 1512 1513 queue_for_each_hw_ctx(q, hctx, i) 1514 if (blk_mq_hctx_stopped(hctx)) 1515 return true; 1516 1517 return false; 1518 } 1519 EXPORT_SYMBOL(blk_mq_queue_stopped); 1520 1521 /* 1522 * This function is often used for pausing .queue_rq() by driver when 1523 * there isn't enough resource or some conditions aren't satisfied, and 1524 * BLK_STS_RESOURCE is usually returned. 1525 * 1526 * We do not guarantee that dispatch can be drained or blocked 1527 * after blk_mq_stop_hw_queue() returns. Please use 1528 * blk_mq_quiesce_queue() for that requirement. 1529 */ 1530 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1531 { 1532 cancel_delayed_work(&hctx->run_work); 1533 1534 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1535 } 1536 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1537 1538 /* 1539 * This function is often used for pausing .queue_rq() by driver when 1540 * there isn't enough resource or some conditions aren't satisfied, and 1541 * BLK_STS_RESOURCE is usually returned. 1542 * 1543 * We do not guarantee that dispatch can be drained or blocked 1544 * after blk_mq_stop_hw_queues() returns. Please use 1545 * blk_mq_quiesce_queue() for that requirement. 
1546 */ 1547 void blk_mq_stop_hw_queues(struct request_queue *q) 1548 { 1549 struct blk_mq_hw_ctx *hctx; 1550 int i; 1551 1552 queue_for_each_hw_ctx(q, hctx, i) 1553 blk_mq_stop_hw_queue(hctx); 1554 } 1555 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 1556 1557 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 1558 { 1559 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1560 1561 blk_mq_run_hw_queue(hctx, false); 1562 } 1563 EXPORT_SYMBOL(blk_mq_start_hw_queue); 1564 1565 void blk_mq_start_hw_queues(struct request_queue *q) 1566 { 1567 struct blk_mq_hw_ctx *hctx; 1568 int i; 1569 1570 queue_for_each_hw_ctx(q, hctx, i) 1571 blk_mq_start_hw_queue(hctx); 1572 } 1573 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1574 1575 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1576 { 1577 if (!blk_mq_hctx_stopped(hctx)) 1578 return; 1579 1580 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1581 blk_mq_run_hw_queue(hctx, async); 1582 } 1583 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 1584 1585 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1586 { 1587 struct blk_mq_hw_ctx *hctx; 1588 int i; 1589 1590 queue_for_each_hw_ctx(q, hctx, i) 1591 blk_mq_start_stopped_hw_queue(hctx, async); 1592 } 1593 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1594 1595 static void blk_mq_run_work_fn(struct work_struct *work) 1596 { 1597 struct blk_mq_hw_ctx *hctx; 1598 1599 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1600 1601 /* 1602 * If we are stopped, don't run the queue. 1603 */ 1604 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1605 return; 1606 1607 __blk_mq_run_hw_queue(hctx); 1608 } 1609 1610 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1611 struct request *rq, 1612 bool at_head) 1613 { 1614 struct blk_mq_ctx *ctx = rq->mq_ctx; 1615 enum hctx_type type = hctx->type; 1616 1617 lockdep_assert_held(&ctx->lock); 1618 1619 trace_block_rq_insert(hctx->queue, rq); 1620 1621 if (at_head) 1622 list_add(&rq->queuelist, &ctx->rq_lists[type]); 1623 else 1624 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); 1625 } 1626 1627 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1628 bool at_head) 1629 { 1630 struct blk_mq_ctx *ctx = rq->mq_ctx; 1631 1632 lockdep_assert_held(&ctx->lock); 1633 1634 __blk_mq_insert_req_list(hctx, rq, at_head); 1635 blk_mq_hctx_mark_pending(hctx, ctx); 1636 } 1637 1638 /* 1639 * Should only be used carefully, when the caller knows we want to 1640 * bypass a potential IO scheduler on the target device. 
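* (The request goes straight onto hctx->dispatch, so elevator insert hooks never see it.)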
1641 */ 1642 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) 1643 { 1644 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1645 1646 spin_lock(&hctx->lock); 1647 list_add_tail(&rq->queuelist, &hctx->dispatch); 1648 spin_unlock(&hctx->lock); 1649 1650 if (run_queue) 1651 blk_mq_run_hw_queue(hctx, false); 1652 } 1653 1654 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1655 struct list_head *list) 1656 1657 { 1658 struct request *rq; 1659 enum hctx_type type = hctx->type; 1660 1661 /* 1662 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1663 * offline now 1664 */ 1665 list_for_each_entry(rq, list, queuelist) { 1666 BUG_ON(rq->mq_ctx != ctx); 1667 trace_block_rq_insert(hctx->queue, rq); 1668 } 1669 1670 spin_lock(&ctx->lock); 1671 list_splice_tail_init(list, &ctx->rq_lists[type]); 1672 blk_mq_hctx_mark_pending(hctx, ctx); 1673 spin_unlock(&ctx->lock); 1674 } 1675 1676 static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) 1677 { 1678 struct request *rqa = container_of(a, struct request, queuelist); 1679 struct request *rqb = container_of(b, struct request, queuelist); 1680 1681 if (rqa->mq_ctx < rqb->mq_ctx) 1682 return -1; 1683 else if (rqa->mq_ctx > rqb->mq_ctx) 1684 return 1; 1685 else if (rqa->mq_hctx < rqb->mq_hctx) 1686 return -1; 1687 else if (rqa->mq_hctx > rqb->mq_hctx) 1688 return 1; 1689 1690 return blk_rq_pos(rqa) > blk_rq_pos(rqb); 1691 } 1692 1693 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1694 { 1695 struct blk_mq_hw_ctx *this_hctx; 1696 struct blk_mq_ctx *this_ctx; 1697 struct request_queue *this_q; 1698 struct request *rq; 1699 LIST_HEAD(list); 1700 LIST_HEAD(rq_list); 1701 unsigned int depth; 1702 1703 list_splice_init(&plug->mq_list, &list); 1704 plug->rq_count = 0; 1705 1706 if (plug->rq_count > 2 && plug->multiple_queues) 1707 list_sort(NULL, &list, plug_rq_cmp); 1708 1709 this_q = NULL; 1710 this_hctx = NULL; 1711 this_ctx = NULL; 1712 depth = 0; 1713 1714 while (!list_empty(&list)) { 1715 rq = list_entry_rq(list.next); 1716 list_del_init(&rq->queuelist); 1717 BUG_ON(!rq->q); 1718 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { 1719 if (this_hctx) { 1720 trace_block_unplug(this_q, depth, !from_schedule); 1721 blk_mq_sched_insert_requests(this_hctx, this_ctx, 1722 &rq_list, 1723 from_schedule); 1724 } 1725 1726 this_q = rq->q; 1727 this_ctx = rq->mq_ctx; 1728 this_hctx = rq->mq_hctx; 1729 depth = 0; 1730 } 1731 1732 depth++; 1733 list_add_tail(&rq->queuelist, &rq_list); 1734 } 1735 1736 /* 1737 * If 'this_hctx' is set, we know we have entries to complete 1738 * on 'rq_list'. Do those. 1739 */ 1740 if (this_hctx) { 1741 trace_block_unplug(this_q, depth, !from_schedule); 1742 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, 1743 from_schedule); 1744 } 1745 } 1746 1747 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1748 { 1749 blk_init_request_from_bio(rq, bio); 1750 1751 blk_account_io_start(rq, true); 1752 } 1753 1754 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 1755 struct request *rq, 1756 blk_qc_t *cookie, bool last) 1757 { 1758 struct request_queue *q = rq->q; 1759 struct blk_mq_queue_data bd = { 1760 .rq = rq, 1761 .last = last, 1762 }; 1763 blk_qc_t new_cookie; 1764 blk_status_t ret; 1765 1766 new_cookie = request_to_qc_t(hctx, rq); 1767 1768 /* 1769 * For OK queue, we are done. For error, caller may kill it. 
1770 * Any other error (busy), just add it to our list as we 1771 * previously would have done. 1772 */ 1773 ret = q->mq_ops->queue_rq(hctx, &bd); 1774 switch (ret) { 1775 case BLK_STS_OK: 1776 blk_mq_update_dispatch_busy(hctx, false); 1777 *cookie = new_cookie; 1778 break; 1779 case BLK_STS_RESOURCE: 1780 case BLK_STS_DEV_RESOURCE: 1781 blk_mq_update_dispatch_busy(hctx, true); 1782 __blk_mq_requeue_request(rq); 1783 break; 1784 default: 1785 blk_mq_update_dispatch_busy(hctx, false); 1786 *cookie = BLK_QC_T_NONE; 1787 break; 1788 } 1789 1790 return ret; 1791 } 1792 1793 blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1794 struct request *rq, 1795 blk_qc_t *cookie, 1796 bool bypass, bool last) 1797 { 1798 struct request_queue *q = rq->q; 1799 bool run_queue = true; 1800 blk_status_t ret = BLK_STS_RESOURCE; 1801 int srcu_idx; 1802 bool force = false; 1803 1804 hctx_lock(hctx, &srcu_idx); 1805 /* 1806 * hctx_lock is needed before checking quiesced flag. 1807 * 1808 * When queue is stopped or quiesced, ignore 'bypass', insert 1809 * and return BLK_STS_OK to caller, and avoid driver to try to 1810 * dispatch again. 1811 */ 1812 if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { 1813 run_queue = false; 1814 bypass = false; 1815 goto out_unlock; 1816 } 1817 1818 if (unlikely(q->elevator && !bypass)) 1819 goto out_unlock; 1820 1821 if (!blk_mq_get_dispatch_budget(hctx)) 1822 goto out_unlock; 1823 1824 if (!blk_mq_get_driver_tag(rq)) { 1825 blk_mq_put_dispatch_budget(hctx); 1826 goto out_unlock; 1827 } 1828 1829 /* 1830 * Always add a request that has been through 1831 *.queue_rq() to the hardware dispatch list. 1832 */ 1833 force = true; 1834 ret = __blk_mq_issue_directly(hctx, rq, cookie, last); 1835 out_unlock: 1836 hctx_unlock(hctx, srcu_idx); 1837 switch (ret) { 1838 case BLK_STS_OK: 1839 break; 1840 case BLK_STS_DEV_RESOURCE: 1841 case BLK_STS_RESOURCE: 1842 if (force) { 1843 blk_mq_request_bypass_insert(rq, run_queue); 1844 /* 1845 * We have to return BLK_STS_OK for the DM 1846 * to avoid livelock. Otherwise, we return 1847 * the real result to indicate whether the 1848 * request is direct-issued successfully. 1849 */ 1850 ret = bypass ? BLK_STS_OK : ret; 1851 } else if (!bypass) { 1852 blk_mq_sched_insert_request(rq, false, 1853 run_queue, false); 1854 } 1855 break; 1856 default: 1857 if (!bypass) 1858 blk_mq_end_request(rq, ret); 1859 break; 1860 } 1861 1862 return ret; 1863 } 1864 1865 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 1866 struct list_head *list) 1867 { 1868 blk_qc_t unused; 1869 blk_status_t ret = BLK_STS_OK; 1870 1871 while (!list_empty(list)) { 1872 struct request *rq = list_first_entry(list, struct request, 1873 queuelist); 1874 1875 list_del_init(&rq->queuelist); 1876 if (ret == BLK_STS_OK) 1877 ret = blk_mq_try_issue_directly(hctx, rq, &unused, 1878 false, 1879 list_empty(list)); 1880 else 1881 blk_mq_sched_insert_request(rq, false, true, false); 1882 } 1883 1884 /* 1885 * If we didn't flush the entire list, we could have told 1886 * the driver there was more coming, but that turned out to 1887 * be a lie. 
1888 */ 1889 if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) 1890 hctx->queue->mq_ops->commit_rqs(hctx); 1891 } 1892 1893 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 1894 { 1895 list_add_tail(&rq->queuelist, &plug->mq_list); 1896 plug->rq_count++; 1897 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { 1898 struct request *tmp; 1899 1900 tmp = list_first_entry(&plug->mq_list, struct request, 1901 queuelist); 1902 if (tmp->q != rq->q) 1903 plug->multiple_queues = true; 1904 } 1905 } 1906 1907 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1908 { 1909 const int is_sync = op_is_sync(bio->bi_opf); 1910 const int is_flush_fua = op_is_flush(bio->bi_opf); 1911 struct blk_mq_alloc_data data = { .flags = 0}; 1912 struct request *rq; 1913 struct blk_plug *plug; 1914 struct request *same_queue_rq = NULL; 1915 blk_qc_t cookie; 1916 1917 blk_queue_bounce(q, &bio); 1918 1919 blk_queue_split(q, &bio); 1920 1921 if (!bio_integrity_prep(bio)) 1922 return BLK_QC_T_NONE; 1923 1924 if (!is_flush_fua && !blk_queue_nomerges(q) && 1925 blk_attempt_plug_merge(q, bio, &same_queue_rq)) 1926 return BLK_QC_T_NONE; 1927 1928 if (blk_mq_sched_bio_merge(q, bio)) 1929 return BLK_QC_T_NONE; 1930 1931 rq_qos_throttle(q, bio); 1932 1933 data.cmd_flags = bio->bi_opf; 1934 rq = blk_mq_get_request(q, bio, &data); 1935 if (unlikely(!rq)) { 1936 rq_qos_cleanup(q, bio); 1937 if (bio->bi_opf & REQ_NOWAIT) 1938 bio_wouldblock_error(bio); 1939 return BLK_QC_T_NONE; 1940 } 1941 1942 trace_block_getrq(q, bio, bio->bi_opf); 1943 1944 rq_qos_track(q, rq, bio); 1945 1946 cookie = request_to_qc_t(data.hctx, rq); 1947 1948 plug = current->plug; 1949 if (unlikely(is_flush_fua)) { 1950 blk_mq_put_ctx(data.ctx); 1951 blk_mq_bio_to_request(rq, bio); 1952 1953 /* bypass scheduler for flush rq */ 1954 blk_insert_flush(rq); 1955 blk_mq_run_hw_queue(data.hctx, true); 1956 } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { 1957 /* 1958 * Use plugging if we have a ->commit_rqs() hook as well, as 1959 * we know the driver uses bd->last in a smart fashion. 1960 */ 1961 unsigned int request_count = plug->rq_count; 1962 struct request *last = NULL; 1963 1964 blk_mq_put_ctx(data.ctx); 1965 blk_mq_bio_to_request(rq, bio); 1966 1967 if (!request_count) 1968 trace_block_plug(q); 1969 else 1970 last = list_entry_rq(plug->mq_list.prev); 1971 1972 if (request_count >= BLK_MAX_REQUEST_COUNT || (last && 1973 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1974 blk_flush_plug_list(plug, false); 1975 trace_block_plug(q); 1976 } 1977 1978 blk_add_rq_to_plug(plug, rq); 1979 } else if (plug && !blk_queue_nomerges(q)) { 1980 blk_mq_bio_to_request(rq, bio); 1981 1982 /* 1983 * We do limited plugging. If the bio can be merged, do that. 1984 * Otherwise the existing request in the plug list will be 1985 * issued. So the plug list will have one request at most 1986 * The plug list might get flushed before this. If that happens, 1987 * the plug list is empty, and same_queue_rq is invalid. 
1988 */ 1989 if (list_empty(&plug->mq_list)) 1990 same_queue_rq = NULL; 1991 if (same_queue_rq) { 1992 list_del_init(&same_queue_rq->queuelist); 1993 plug->rq_count--; 1994 } 1995 blk_add_rq_to_plug(plug, rq); 1996 1997 blk_mq_put_ctx(data.ctx); 1998 1999 if (same_queue_rq) { 2000 data.hctx = same_queue_rq->mq_hctx; 2001 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 2002 &cookie, false, true); 2003 } 2004 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && 2005 !data.hctx->dispatch_busy)) { 2006 blk_mq_put_ctx(data.ctx); 2007 blk_mq_bio_to_request(rq, bio); 2008 blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); 2009 } else { 2010 blk_mq_put_ctx(data.ctx); 2011 blk_mq_bio_to_request(rq, bio); 2012 blk_mq_sched_insert_request(rq, false, true, true); 2013 } 2014 2015 return cookie; 2016 } 2017 2018 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2019 unsigned int hctx_idx) 2020 { 2021 struct page *page; 2022 2023 if (tags->rqs && set->ops->exit_request) { 2024 int i; 2025 2026 for (i = 0; i < tags->nr_tags; i++) { 2027 struct request *rq = tags->static_rqs[i]; 2028 2029 if (!rq) 2030 continue; 2031 set->ops->exit_request(set, rq, hctx_idx); 2032 tags->static_rqs[i] = NULL; 2033 } 2034 } 2035 2036 while (!list_empty(&tags->page_list)) { 2037 page = list_first_entry(&tags->page_list, struct page, lru); 2038 list_del_init(&page->lru); 2039 /* 2040 * Remove kmemleak object previously allocated in 2041 * blk_mq_init_rq_map(). 2042 */ 2043 kmemleak_free(page_address(page)); 2044 __free_pages(page, page->private); 2045 } 2046 } 2047 2048 void blk_mq_free_rq_map(struct blk_mq_tags *tags) 2049 { 2050 kfree(tags->rqs); 2051 tags->rqs = NULL; 2052 kfree(tags->static_rqs); 2053 tags->static_rqs = NULL; 2054 2055 blk_mq_free_tags(tags); 2056 } 2057 2058 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 2059 unsigned int hctx_idx, 2060 unsigned int nr_tags, 2061 unsigned int reserved_tags) 2062 { 2063 struct blk_mq_tags *tags; 2064 int node; 2065 2066 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2067 if (node == NUMA_NO_NODE) 2068 node = set->numa_node; 2069 2070 tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 2071 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 2072 if (!tags) 2073 return NULL; 2074 2075 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2076 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2077 node); 2078 if (!tags->rqs) { 2079 blk_mq_free_tags(tags); 2080 return NULL; 2081 } 2082 2083 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2084 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2085 node); 2086 if (!tags->static_rqs) { 2087 kfree(tags->rqs); 2088 blk_mq_free_tags(tags); 2089 return NULL; 2090 } 2091 2092 return tags; 2093 } 2094 2095 static size_t order_to_size(unsigned int order) 2096 { 2097 return (size_t)PAGE_SIZE << order; 2098 } 2099 2100 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 2101 unsigned int hctx_idx, int node) 2102 { 2103 int ret; 2104 2105 if (set->ops->init_request) { 2106 ret = set->ops->init_request(set, rq, hctx_idx, node); 2107 if (ret) 2108 return ret; 2109 } 2110 2111 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 2112 return 0; 2113 } 2114 2115 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2116 unsigned int hctx_idx, unsigned int depth) 2117 { 2118 unsigned int i, j, entries_per_page, max_order = 4; 2119 size_t rq_size, left; 2120 int node; 2121 2122 node = 
blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2123 if (node == NUMA_NO_NODE) 2124 node = set->numa_node; 2125 2126 INIT_LIST_HEAD(&tags->page_list); 2127 2128 /* 2129 * rq_size is the size of the request plus driver payload, rounded 2130 * to the cacheline size 2131 */ 2132 rq_size = round_up(sizeof(struct request) + set->cmd_size, 2133 cache_line_size()); 2134 left = rq_size * depth; 2135 2136 for (i = 0; i < depth; ) { 2137 int this_order = max_order; 2138 struct page *page; 2139 int to_do; 2140 void *p; 2141 2142 while (this_order && left < order_to_size(this_order - 1)) 2143 this_order--; 2144 2145 do { 2146 page = alloc_pages_node(node, 2147 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 2148 this_order); 2149 if (page) 2150 break; 2151 if (!this_order--) 2152 break; 2153 if (order_to_size(this_order) < rq_size) 2154 break; 2155 } while (1); 2156 2157 if (!page) 2158 goto fail; 2159 2160 page->private = this_order; 2161 list_add_tail(&page->lru, &tags->page_list); 2162 2163 p = page_address(page); 2164 /* 2165 * Allow kmemleak to scan these pages as they contain pointers 2166 * to additional allocations like via ops->init_request(). 2167 */ 2168 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 2169 entries_per_page = order_to_size(this_order) / rq_size; 2170 to_do = min(entries_per_page, depth - i); 2171 left -= to_do * rq_size; 2172 for (j = 0; j < to_do; j++) { 2173 struct request *rq = p; 2174 2175 tags->static_rqs[i] = rq; 2176 if (blk_mq_init_request(set, rq, hctx_idx, node)) { 2177 tags->static_rqs[i] = NULL; 2178 goto fail; 2179 } 2180 2181 p += rq_size; 2182 i++; 2183 } 2184 } 2185 return 0; 2186 2187 fail: 2188 blk_mq_free_rqs(set, tags, hctx_idx); 2189 return -ENOMEM; 2190 } 2191 2192 /* 2193 * 'cpu' is going away. splice any existing rq_list entries from this 2194 * software queue to the hw queue dispatch list, and ensure that it 2195 * gets run. 
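 * (This is the CPUHP_BLK_MQ_DEAD callback registered from blk_mq_init()
 * at the bottom of this file.)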
2196 */ 2197 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 2198 { 2199 struct blk_mq_hw_ctx *hctx; 2200 struct blk_mq_ctx *ctx; 2201 LIST_HEAD(tmp); 2202 enum hctx_type type; 2203 2204 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2205 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2206 type = hctx->type; 2207 2208 spin_lock(&ctx->lock); 2209 if (!list_empty(&ctx->rq_lists[type])) { 2210 list_splice_init(&ctx->rq_lists[type], &tmp); 2211 blk_mq_hctx_clear_pending(hctx, ctx); 2212 } 2213 spin_unlock(&ctx->lock); 2214 2215 if (list_empty(&tmp)) 2216 return 0; 2217 2218 spin_lock(&hctx->lock); 2219 list_splice_tail_init(&tmp, &hctx->dispatch); 2220 spin_unlock(&hctx->lock); 2221 2222 blk_mq_run_hw_queue(hctx, true); 2223 return 0; 2224 } 2225 2226 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 2227 { 2228 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 2229 &hctx->cpuhp_dead); 2230 } 2231 2232 /* hctx->ctxs will be freed in queue's release handler */ 2233 static void blk_mq_exit_hctx(struct request_queue *q, 2234 struct blk_mq_tag_set *set, 2235 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 2236 { 2237 if (blk_mq_hw_queue_mapped(hctx)) 2238 blk_mq_tag_idle(hctx); 2239 2240 if (set->ops->exit_request) 2241 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 2242 2243 if (set->ops->exit_hctx) 2244 set->ops->exit_hctx(hctx, hctx_idx); 2245 2246 if (hctx->flags & BLK_MQ_F_BLOCKING) 2247 cleanup_srcu_struct(hctx->srcu); 2248 2249 blk_mq_remove_cpuhp(hctx); 2250 blk_free_flush_queue(hctx->fq); 2251 sbitmap_free(&hctx->ctx_map); 2252 } 2253 2254 static void blk_mq_exit_hw_queues(struct request_queue *q, 2255 struct blk_mq_tag_set *set, int nr_queue) 2256 { 2257 struct blk_mq_hw_ctx *hctx; 2258 unsigned int i; 2259 2260 queue_for_each_hw_ctx(q, hctx, i) { 2261 if (i == nr_queue) 2262 break; 2263 blk_mq_debugfs_unregister_hctx(hctx); 2264 blk_mq_exit_hctx(q, set, hctx, i); 2265 } 2266 } 2267 2268 static int blk_mq_init_hctx(struct request_queue *q, 2269 struct blk_mq_tag_set *set, 2270 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2271 { 2272 int node; 2273 2274 node = hctx->numa_node; 2275 if (node == NUMA_NO_NODE) 2276 node = hctx->numa_node = set->numa_node; 2277 2278 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 2279 spin_lock_init(&hctx->lock); 2280 INIT_LIST_HEAD(&hctx->dispatch); 2281 hctx->queue = q; 2282 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 2283 2284 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2285 2286 hctx->tags = set->tags[hctx_idx]; 2287 2288 /* 2289 * Allocate space for all possible cpus to avoid allocation at 2290 * runtime 2291 */ 2292 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 2293 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); 2294 if (!hctx->ctxs) 2295 goto unregister_cpu_notifier; 2296 2297 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 2298 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node)) 2299 goto free_ctxs; 2300 2301 hctx->nr_ctx = 0; 2302 2303 spin_lock_init(&hctx->dispatch_wait_lock); 2304 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2305 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2306 2307 if (set->ops->init_hctx && 2308 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2309 goto free_bitmap; 2310 2311 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, 2312 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 2313 if (!hctx->fq) 2314 goto exit_hctx; 2315 2316 if 
(blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
		goto free_fq;

	if (hctx->flags & BLK_MQ_F_BLOCKING)
		init_srcu_struct(hctx->srcu);

	return 0;

 free_fq:
	kfree(hctx->fq);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
 free_bitmap:
	sbitmap_free(&hctx->ctx_map);
 free_ctxs:
	kfree(hctx->ctxs);
 unregister_cpu_notifier:
	blk_mq_remove_cpuhp(hctx);
	return -1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	struct blk_mq_tag_set *set = q->tag_set;
	unsigned int i, j;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;
		int k;

		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
			INIT_LIST_HEAD(&__ctx->rq_lists[k]);

		__ctx->queue = q;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		for (j = 0; j < set->nr_maps; j++) {
			hctx = blk_mq_map_queue_type(q, j, i);
			if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				hctx->numa_node = local_memory_node(cpu_to_node(i));
		}
	}
}

static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
{
	int ret = 0;

	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
					set->queue_depth, set->reserved_tags);
	if (!set->tags[hctx_idx])
		return false;

	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
				set->queue_depth);
	if (!ret)
		return true;

	blk_mq_free_rq_map(set->tags[hctx_idx]);
	set->tags[hctx_idx] = NULL;
	return false;
}

static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
					 unsigned int hctx_idx)
{
	if (set->tags && set->tags[hctx_idx]) {
		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
		blk_mq_free_rq_map(set->tags[hctx_idx]);
		set->tags[hctx_idx] = NULL;
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i, j, hctx_idx;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct blk_mq_tag_set *set = q->tag_set;

	/*
	 * Avoid others reading incomplete hctx->cpumask through sysfs
	 */
	mutex_lock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
		hctx->dispatch_from = NULL;
	}

	/*
	 * Map software to hardware queues.
	 *
	 * If the cpu isn't present, the cpu is mapped to the first hctx.
	 */
	for_each_possible_cpu(i) {
		hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
		/* unmapped hw queue can be remapped after CPU topo changed */
		if (!set->tags[hctx_idx] &&
		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
			/*
			 * If tags initialization fails for some hctx,
			 * that hctx won't be brought online.  In this
			 * case, remap the current ctx to hctx[0], which
			 * is guaranteed to always have tags allocated.
			 */
			set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
		}

		ctx = per_cpu_ptr(q->queue_ctx, i);
		for (j = 0; j < set->nr_maps; j++) {
			if (!set->map[j].nr_queues) {
				ctx->hctxs[j] = blk_mq_map_queue_type(q,
						HCTX_TYPE_DEFAULT, i);
				continue;
			}

			hctx = blk_mq_map_queue_type(q, j, i);
			ctx->hctxs[j] = hctx;
			/*
			 * If the CPU is already set in the mask, then we've
			 * mapped this one already. This can happen if
			 * devices share queues across queue maps.
			 */
			if (cpumask_test_cpu(i, hctx->cpumask))
				continue;

			cpumask_set_cpu(i, hctx->cpumask);
			hctx->type = j;
			ctx->index_hw[hctx->type] = hctx->nr_ctx;
			hctx->ctxs[hctx->nr_ctx++] = ctx;

			/*
			 * If the nr_ctx type overflows, we have exceeded the
			 * number of sw queues we can support.
			 */
			BUG_ON(!hctx->nr_ctx);
		}

		for (; j < HCTX_MAX_TYPES; j++)
			ctx->hctxs[j] = blk_mq_map_queue_type(q,
					HCTX_TYPE_DEFAULT, i);
	}

	mutex_unlock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			/*
			 * Never unmap queue 0.  We need it as a fallback
			 * in case a new remap fails to allocate.
			 */
			if (i && set->tags[i])
				blk_mq_free_map_and_requests(set, i);

			hctx->tags = NULL;
			continue;
		}

		hctx->tags = set->tags[i];
		WARN_ON(!hctx->tags);

		/*
		 * Set the map size to the number of mapped software queues.
		 * This is more accurate and more efficient than looping
		 * over all possibly mapped software queues.
		 */
		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

		/*
		 * Initialize batch round-robin counts
		 */
		hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}

/*
 * Caller needs to ensure that we're either frozen/quiesced, or that
 * the queue isn't live yet.
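 * blk_mq_update_tag_set_depth() below takes care of that by freezing each
 * queue in the tag set before updating the shared-tag flag on its hctxs.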
2509 */ 2510 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 2511 { 2512 struct blk_mq_hw_ctx *hctx; 2513 int i; 2514 2515 queue_for_each_hw_ctx(q, hctx, i) { 2516 if (shared) 2517 hctx->flags |= BLK_MQ_F_TAG_SHARED; 2518 else 2519 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 2520 } 2521 } 2522 2523 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, 2524 bool shared) 2525 { 2526 struct request_queue *q; 2527 2528 lockdep_assert_held(&set->tag_list_lock); 2529 2530 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2531 blk_mq_freeze_queue(q); 2532 queue_set_hctx_shared(q, shared); 2533 blk_mq_unfreeze_queue(q); 2534 } 2535 } 2536 2537 static void blk_mq_del_queue_tag_set(struct request_queue *q) 2538 { 2539 struct blk_mq_tag_set *set = q->tag_set; 2540 2541 mutex_lock(&set->tag_list_lock); 2542 list_del_rcu(&q->tag_set_list); 2543 if (list_is_singular(&set->tag_list)) { 2544 /* just transitioned to unshared */ 2545 set->flags &= ~BLK_MQ_F_TAG_SHARED; 2546 /* update existing queue */ 2547 blk_mq_update_tag_set_depth(set, false); 2548 } 2549 mutex_unlock(&set->tag_list_lock); 2550 INIT_LIST_HEAD(&q->tag_set_list); 2551 } 2552 2553 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2554 struct request_queue *q) 2555 { 2556 mutex_lock(&set->tag_list_lock); 2557 2558 /* 2559 * Check to see if we're transitioning to shared (from 1 to 2 queues). 2560 */ 2561 if (!list_empty(&set->tag_list) && 2562 !(set->flags & BLK_MQ_F_TAG_SHARED)) { 2563 set->flags |= BLK_MQ_F_TAG_SHARED; 2564 /* update existing queue */ 2565 blk_mq_update_tag_set_depth(set, true); 2566 } 2567 if (set->flags & BLK_MQ_F_TAG_SHARED) 2568 queue_set_hctx_shared(q, true); 2569 list_add_tail_rcu(&q->tag_set_list, &set->tag_list); 2570 2571 mutex_unlock(&set->tag_list_lock); 2572 } 2573 2574 /* All allocations will be freed in release handler of q->mq_kobj */ 2575 static int blk_mq_alloc_ctxs(struct request_queue *q) 2576 { 2577 struct blk_mq_ctxs *ctxs; 2578 int cpu; 2579 2580 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 2581 if (!ctxs) 2582 return -ENOMEM; 2583 2584 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2585 if (!ctxs->queue_ctx) 2586 goto fail; 2587 2588 for_each_possible_cpu(cpu) { 2589 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 2590 ctx->ctxs = ctxs; 2591 } 2592 2593 q->mq_kobj = &ctxs->kobj; 2594 q->queue_ctx = ctxs->queue_ctx; 2595 2596 return 0; 2597 fail: 2598 kfree(ctxs); 2599 return -ENOMEM; 2600 } 2601 2602 /* 2603 * It is the actual release handler for mq, but we do it from 2604 * request queue's release handler for avoiding use-after-free 2605 * and headache because q->mq_kobj shouldn't have been introduced, 2606 * but we can't group ctx/kctx kobj without it. 2607 */ 2608 void blk_mq_release(struct request_queue *q) 2609 { 2610 struct blk_mq_hw_ctx *hctx; 2611 unsigned int i; 2612 2613 /* hctx kobj stays in hctx */ 2614 queue_for_each_hw_ctx(q, hctx, i) { 2615 if (!hctx) 2616 continue; 2617 kobject_put(&hctx->kobj); 2618 } 2619 2620 kfree(q->queue_hw_ctx); 2621 2622 /* 2623 * release .mq_kobj and sw queue's kobject now because 2624 * both share lifetime with request queue. 
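	 * (Both were set up by blk_mq_alloc_ctxs() and blk_mq_sysfs_init()
	 * when the queue was created.)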
2625 */ 2626 blk_mq_sysfs_deinit(q); 2627 } 2628 2629 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 2630 { 2631 struct request_queue *uninit_q, *q; 2632 2633 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 2634 if (!uninit_q) 2635 return ERR_PTR(-ENOMEM); 2636 2637 q = blk_mq_init_allocated_queue(set, uninit_q); 2638 if (IS_ERR(q)) 2639 blk_cleanup_queue(uninit_q); 2640 2641 return q; 2642 } 2643 EXPORT_SYMBOL(blk_mq_init_queue); 2644 2645 /* 2646 * Helper for setting up a queue with mq ops, given queue depth, and 2647 * the passed in mq ops flags. 2648 */ 2649 struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, 2650 const struct blk_mq_ops *ops, 2651 unsigned int queue_depth, 2652 unsigned int set_flags) 2653 { 2654 struct request_queue *q; 2655 int ret; 2656 2657 memset(set, 0, sizeof(*set)); 2658 set->ops = ops; 2659 set->nr_hw_queues = 1; 2660 set->nr_maps = 1; 2661 set->queue_depth = queue_depth; 2662 set->numa_node = NUMA_NO_NODE; 2663 set->flags = set_flags; 2664 2665 ret = blk_mq_alloc_tag_set(set); 2666 if (ret) 2667 return ERR_PTR(ret); 2668 2669 q = blk_mq_init_queue(set); 2670 if (IS_ERR(q)) { 2671 blk_mq_free_tag_set(set); 2672 return q; 2673 } 2674 2675 return q; 2676 } 2677 EXPORT_SYMBOL(blk_mq_init_sq_queue); 2678 2679 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) 2680 { 2681 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2682 2683 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), 2684 __alignof__(struct blk_mq_hw_ctx)) != 2685 sizeof(struct blk_mq_hw_ctx)); 2686 2687 if (tag_set->flags & BLK_MQ_F_BLOCKING) 2688 hw_ctx_size += sizeof(struct srcu_struct); 2689 2690 return hw_ctx_size; 2691 } 2692 2693 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 2694 struct blk_mq_tag_set *set, struct request_queue *q, 2695 int hctx_idx, int node) 2696 { 2697 struct blk_mq_hw_ctx *hctx; 2698 2699 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), 2700 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2701 node); 2702 if (!hctx) 2703 return NULL; 2704 2705 if (!zalloc_cpumask_var_node(&hctx->cpumask, 2706 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2707 node)) { 2708 kfree(hctx); 2709 return NULL; 2710 } 2711 2712 atomic_set(&hctx->nr_active, 0); 2713 hctx->numa_node = node; 2714 hctx->queue_num = hctx_idx; 2715 2716 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) { 2717 free_cpumask_var(hctx->cpumask); 2718 kfree(hctx); 2719 return NULL; 2720 } 2721 blk_mq_hctx_kobj_init(hctx); 2722 2723 return hctx; 2724 } 2725 2726 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 2727 struct request_queue *q) 2728 { 2729 int i, j, end; 2730 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2731 2732 /* protect against switching io scheduler */ 2733 mutex_lock(&q->sysfs_lock); 2734 for (i = 0; i < set->nr_hw_queues; i++) { 2735 int node; 2736 struct blk_mq_hw_ctx *hctx; 2737 2738 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); 2739 /* 2740 * If the hw queue has been mapped to another numa node, 2741 * we need to realloc the hctx. If allocation fails, fallback 2742 * to use the previous one. 
 */
		if (hctxs[i] && (hctxs[i]->numa_node == node))
			continue;

		hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
		if (hctx) {
			if (hctxs[i]) {
				blk_mq_exit_hctx(q, set, hctxs[i], i);
				kobject_put(&hctxs[i]->kobj);
			}
			hctxs[i] = hctx;
		} else {
			if (hctxs[i])
				pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
					node, hctxs[i]->numa_node);
			else
				break;
		}
	}
	/*
	 * If increasing nr_hw_queues failed, free the newly allocated
	 * hctxs and keep the previous q->nr_hw_queues.
	 */
	if (i != set->nr_hw_queues) {
		j = q->nr_hw_queues;
		end = i;
	} else {
		j = i;
		end = q->nr_hw_queues;
		q->nr_hw_queues = set->nr_hw_queues;
	}

	for (; j < end; j++) {
		struct blk_mq_hw_ctx *hctx = hctxs[j];

		if (hctx) {
			if (hctx->tags)
				blk_mq_free_map_and_requests(set, j);
			blk_mq_exit_hctx(q, set, hctx, j);
			kobject_put(&hctx->kobj);
			hctxs[j] = NULL;
		}
	}
	mutex_unlock(&q->sysfs_lock);
}

/*
 * Maximum number of hardware queues we support. For single sets, we'll never
 * have more than the CPUs (software queues). For multiple sets, the tag_set
 * user may have set ->nr_hw_queues larger.
 */
static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
{
	if (set->nr_maps == 1)
		return nr_cpu_ids;

	return max(set->nr_hw_queues, nr_cpu_ids);
}

struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q)
{
	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
					     blk_mq_poll_stats_bkt,
					     BLK_MQ_POLL_STATS_BKTS, q);
	if (!q->poll_cb)
		goto err_exit;

	if (blk_mq_alloc_ctxs(q))
		goto err_exit;

	/* init q->mq_kobj and sw queues' kobjects */
	blk_mq_sysfs_init(q);

	q->nr_queues = nr_hw_queues(set);
	q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
				       GFP_KERNEL, set->numa_node);
	if (!q->queue_hw_ctx)
		goto err_sys_init;

	blk_mq_realloc_hw_ctxs(set, q);
	if (!q->nr_hw_queues)
		goto err_hctxs;

	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

	q->tag_set = set;

	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
	if (set->nr_maps > HCTX_TYPE_POLL &&
	    set->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, q);

	q->sg_reserved_size = INT_MAX;

	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	blk_queue_make_request(q, blk_mq_make_request);

	/*
	 * Do this after blk_queue_make_request() overrides it...
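	 * (blk_queue_make_request() installs a generic default nr_requests,
	 * so the tag set's actual queue depth is applied below.)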
2852 */ 2853 q->nr_requests = set->queue_depth; 2854 2855 /* 2856 * Default to classic polling 2857 */ 2858 q->poll_nsec = BLK_MQ_POLL_CLASSIC; 2859 2860 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2861 blk_mq_add_queue_tag_set(set, q); 2862 blk_mq_map_swqueue(q); 2863 2864 if (!(set->flags & BLK_MQ_F_NO_SCHED)) { 2865 int ret; 2866 2867 ret = elevator_init_mq(q); 2868 if (ret) 2869 return ERR_PTR(ret); 2870 } 2871 2872 return q; 2873 2874 err_hctxs: 2875 kfree(q->queue_hw_ctx); 2876 err_sys_init: 2877 blk_mq_sysfs_deinit(q); 2878 err_exit: 2879 q->mq_ops = NULL; 2880 return ERR_PTR(-ENOMEM); 2881 } 2882 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2883 2884 void blk_mq_free_queue(struct request_queue *q) 2885 { 2886 struct blk_mq_tag_set *set = q->tag_set; 2887 2888 blk_mq_del_queue_tag_set(q); 2889 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2890 } 2891 2892 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2893 { 2894 int i; 2895 2896 for (i = 0; i < set->nr_hw_queues; i++) 2897 if (!__blk_mq_alloc_rq_map(set, i)) 2898 goto out_unwind; 2899 2900 return 0; 2901 2902 out_unwind: 2903 while (--i >= 0) 2904 blk_mq_free_rq_map(set->tags[i]); 2905 2906 return -ENOMEM; 2907 } 2908 2909 /* 2910 * Allocate the request maps associated with this tag_set. Note that this 2911 * may reduce the depth asked for, if memory is tight. set->queue_depth 2912 * will be updated to reflect the allocated depth. 2913 */ 2914 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2915 { 2916 unsigned int depth; 2917 int err; 2918 2919 depth = set->queue_depth; 2920 do { 2921 err = __blk_mq_alloc_rq_maps(set); 2922 if (!err) 2923 break; 2924 2925 set->queue_depth >>= 1; 2926 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2927 err = -ENOMEM; 2928 break; 2929 } 2930 } while (set->queue_depth); 2931 2932 if (!set->queue_depth || err) { 2933 pr_err("blk-mq: failed to allocate request map\n"); 2934 return -ENOMEM; 2935 } 2936 2937 if (depth != set->queue_depth) 2938 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2939 depth, set->queue_depth); 2940 2941 return 0; 2942 } 2943 2944 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2945 { 2946 if (set->ops->map_queues && !is_kdump_kernel()) { 2947 int i; 2948 2949 /* 2950 * transport .map_queues is usually done in the following 2951 * way: 2952 * 2953 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 2954 * mask = get_cpu_mask(queue) 2955 * for_each_cpu(cpu, mask) 2956 * set->map[x].mq_map[cpu] = queue; 2957 * } 2958 * 2959 * When we need to remap, the table has to be cleared for 2960 * killing stale mapping since one CPU may not be mapped 2961 * to any hw queue. 2962 */ 2963 for (i = 0; i < set->nr_maps; i++) 2964 blk_mq_clear_mq_map(&set->map[i]); 2965 2966 return set->ops->map_queues(set); 2967 } else { 2968 BUG_ON(set->nr_maps > 1); 2969 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 2970 } 2971 } 2972 2973 /* 2974 * Alloc a tag set to be associated with one or more request queues. 2975 * May fail with EINVAL for various error conditions. May adjust the 2976 * requested depth down, if it's too large. In that case, the set 2977 * value will be stored in set->queue_depth. 
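 *
 * A driver typically fills in the set before calling this, roughly along
 * these lines (illustrative sketch only; the ops structure, depth, flags
 * and cmd_size shown here are driver specific):
 *
 *	memset(&set, 0, sizeof(set));
 *	set.ops = &mydrv_mq_ops;
 *	set.nr_hw_queues = 1;
 *	set.queue_depth = 64;
 *	set.numa_node = NUMA_NO_NODE;
 *	set.cmd_size = sizeof(struct mydrv_cmd);
 *	set.flags = BLK_MQ_F_SHOULD_MERGE;
 *	err = blk_mq_alloc_tag_set(&set);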
2978 */ 2979 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2980 { 2981 int i, ret; 2982 2983 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2984 2985 if (!set->nr_hw_queues) 2986 return -EINVAL; 2987 if (!set->queue_depth) 2988 return -EINVAL; 2989 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2990 return -EINVAL; 2991 2992 if (!set->ops->queue_rq) 2993 return -EINVAL; 2994 2995 if (!set->ops->get_budget ^ !set->ops->put_budget) 2996 return -EINVAL; 2997 2998 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2999 pr_info("blk-mq: reduced tag depth to %u\n", 3000 BLK_MQ_MAX_DEPTH); 3001 set->queue_depth = BLK_MQ_MAX_DEPTH; 3002 } 3003 3004 if (!set->nr_maps) 3005 set->nr_maps = 1; 3006 else if (set->nr_maps > HCTX_MAX_TYPES) 3007 return -EINVAL; 3008 3009 /* 3010 * If a crashdump is active, then we are potentially in a very 3011 * memory constrained environment. Limit us to 1 queue and 3012 * 64 tags to prevent using too much memory. 3013 */ 3014 if (is_kdump_kernel()) { 3015 set->nr_hw_queues = 1; 3016 set->nr_maps = 1; 3017 set->queue_depth = min(64U, set->queue_depth); 3018 } 3019 /* 3020 * There is no use for more h/w queues than cpus if we just have 3021 * a single map 3022 */ 3023 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 3024 set->nr_hw_queues = nr_cpu_ids; 3025 3026 set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), 3027 GFP_KERNEL, set->numa_node); 3028 if (!set->tags) 3029 return -ENOMEM; 3030 3031 ret = -ENOMEM; 3032 for (i = 0; i < set->nr_maps; i++) { 3033 set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 3034 sizeof(set->map[i].mq_map[0]), 3035 GFP_KERNEL, set->numa_node); 3036 if (!set->map[i].mq_map) 3037 goto out_free_mq_map; 3038 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; 3039 } 3040 3041 ret = blk_mq_update_queue_map(set); 3042 if (ret) 3043 goto out_free_mq_map; 3044 3045 ret = blk_mq_alloc_rq_maps(set); 3046 if (ret) 3047 goto out_free_mq_map; 3048 3049 mutex_init(&set->tag_list_lock); 3050 INIT_LIST_HEAD(&set->tag_list); 3051 3052 return 0; 3053 3054 out_free_mq_map: 3055 for (i = 0; i < set->nr_maps; i++) { 3056 kfree(set->map[i].mq_map); 3057 set->map[i].mq_map = NULL; 3058 } 3059 kfree(set->tags); 3060 set->tags = NULL; 3061 return ret; 3062 } 3063 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 3064 3065 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 3066 { 3067 int i, j; 3068 3069 for (i = 0; i < nr_hw_queues(set); i++) 3070 blk_mq_free_map_and_requests(set, i); 3071 3072 for (j = 0; j < set->nr_maps; j++) { 3073 kfree(set->map[j].mq_map); 3074 set->map[j].mq_map = NULL; 3075 } 3076 3077 kfree(set->tags); 3078 set->tags = NULL; 3079 } 3080 EXPORT_SYMBOL(blk_mq_free_tag_set); 3081 3082 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 3083 { 3084 struct blk_mq_tag_set *set = q->tag_set; 3085 struct blk_mq_hw_ctx *hctx; 3086 int i, ret; 3087 3088 if (!set) 3089 return -EINVAL; 3090 3091 if (q->nr_requests == nr) 3092 return 0; 3093 3094 blk_mq_freeze_queue(q); 3095 blk_mq_quiesce_queue(q); 3096 3097 ret = 0; 3098 queue_for_each_hw_ctx(q, hctx, i) { 3099 if (!hctx->tags) 3100 continue; 3101 /* 3102 * If we're using an MQ scheduler, just update the scheduler 3103 * queue depth. This is similar to what the old code would do. 
 */
		if (!hctx->sched_tags) {
			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
							false);
		} else {
			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
							nr, true);
		}
		if (ret)
			break;
	}

	if (!ret)
		q->nr_requests = nr;

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return ret;
}

/*
 * request_queue and elevator_type pair.
 * It is just used by __blk_mq_update_nr_hw_queues to cache
 * the elevator_type associated with a request_queue.
 */
struct blk_mq_qe_pair {
	struct list_head node;
	struct request_queue *q;
	struct elevator_type *type;
};

/*
 * Cache the elevator_type in the qe pair list and switch the
 * io scheduler to 'none'.
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
		struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;

	if (!q->elevator)
		return true;

	qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
	if (!qe)
		return false;

	INIT_LIST_HEAD(&qe->node);
	qe->q = q;
	qe->type = q->elevator->type;
	list_add(&qe->node, head);

	mutex_lock(&q->sysfs_lock);
	/*
	 * After elevator_switch_mq(), the previous elevator_queue will be
	 * released by elevator_release(), and the module reference taken by
	 * elevator_get() will be put as well.  Take an extra reference on
	 * the io scheduler module here to prevent it from being removed
	 * while we still need it.
	 */
	__module_get(qe->type->elevator_owner);
	elevator_switch_mq(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return true;
}

static void blk_mq_elv_switch_back(struct list_head *head,
		struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;
	struct elevator_type *t = NULL;

	list_for_each_entry(qe, head, node)
		if (qe->q == q) {
			t = qe->type;
			break;
		}

	if (!t)
		return;

	list_del(&qe->node);
	kfree(qe);

	mutex_lock(&q->sysfs_lock);
	elevator_switch_mq(q, t);
	mutex_unlock(&q->sysfs_lock);
}

static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
							int nr_hw_queues)
{
	struct request_queue *q;
	LIST_HEAD(head);
	int prev_nr_hw_queues;

	lockdep_assert_held(&set->tag_list_lock);

	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
		nr_hw_queues = nr_cpu_ids;
	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
		return;

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_freeze_queue(q);
	/*
	 * Sync with blk_mq_queue_tag_busy_iter.
	 */
	synchronize_rcu();
	/*
	 * Switch IO scheduler to 'none', cleaning up the data associated
	 * with the previous scheduler. We will switch back once we are done
	 * updating the new sw to hw queue mappings.
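	 * blk_mq_elv_switch_none() stashes the current elevator_type on
	 * 'head' so that blk_mq_elv_switch_back() can restore it at the
	 * switch_back label further down.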
3219 */ 3220 list_for_each_entry(q, &set->tag_list, tag_set_list) 3221 if (!blk_mq_elv_switch_none(&head, q)) 3222 goto switch_back; 3223 3224 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3225 blk_mq_debugfs_unregister_hctxs(q); 3226 blk_mq_sysfs_unregister(q); 3227 } 3228 3229 prev_nr_hw_queues = set->nr_hw_queues; 3230 set->nr_hw_queues = nr_hw_queues; 3231 blk_mq_update_queue_map(set); 3232 fallback: 3233 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3234 blk_mq_realloc_hw_ctxs(set, q); 3235 if (q->nr_hw_queues != set->nr_hw_queues) { 3236 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 3237 nr_hw_queues, prev_nr_hw_queues); 3238 set->nr_hw_queues = prev_nr_hw_queues; 3239 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 3240 goto fallback; 3241 } 3242 blk_mq_map_swqueue(q); 3243 } 3244 3245 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3246 blk_mq_sysfs_register(q); 3247 blk_mq_debugfs_register_hctxs(q); 3248 } 3249 3250 switch_back: 3251 list_for_each_entry(q, &set->tag_list, tag_set_list) 3252 blk_mq_elv_switch_back(&head, q); 3253 3254 list_for_each_entry(q, &set->tag_list, tag_set_list) 3255 blk_mq_unfreeze_queue(q); 3256 } 3257 3258 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 3259 { 3260 mutex_lock(&set->tag_list_lock); 3261 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 3262 mutex_unlock(&set->tag_list_lock); 3263 } 3264 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 3265 3266 /* Enable polling stats and return whether they were already enabled. */ 3267 static bool blk_poll_stats_enable(struct request_queue *q) 3268 { 3269 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3270 blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) 3271 return true; 3272 blk_stat_add_callback(q, q->poll_cb); 3273 return false; 3274 } 3275 3276 static void blk_mq_poll_stats_start(struct request_queue *q) 3277 { 3278 /* 3279 * We don't arm the callback if polling stats are not enabled or the 3280 * callback is already active. 3281 */ 3282 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3283 blk_stat_is_active(q->poll_cb)) 3284 return; 3285 3286 blk_stat_activate_msecs(q->poll_cb, 100); 3287 } 3288 3289 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) 3290 { 3291 struct request_queue *q = cb->data; 3292 int bucket; 3293 3294 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { 3295 if (cb->stat[bucket].nr_samples) 3296 q->poll_stat[bucket] = cb->stat[bucket]; 3297 } 3298 } 3299 3300 static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 3301 struct blk_mq_hw_ctx *hctx, 3302 struct request *rq) 3303 { 3304 unsigned long ret = 0; 3305 int bucket; 3306 3307 /* 3308 * If stats collection isn't on, don't sleep but turn it on for 3309 * future users 3310 */ 3311 if (!blk_poll_stats_enable(q)) 3312 return 0; 3313 3314 /* 3315 * As an optimistic guess, use half of the mean service time 3316 * for this type of request. We can (and should) make this smarter. 3317 * For instance, if the completion latencies are tight, we can 3318 * get closer than just half the mean. This is especially 3319 * important on devices where the completion latencies are longer 3320 * than ~10 usec. We do use the stats for the relevant IO size 3321 * if available which does lead to better estimates. 
3322 */ 3323 bucket = blk_mq_poll_stats_bkt(rq); 3324 if (bucket < 0) 3325 return ret; 3326 3327 if (q->poll_stat[bucket].nr_samples) 3328 ret = (q->poll_stat[bucket].mean + 1) / 2; 3329 3330 return ret; 3331 } 3332 3333 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, 3334 struct blk_mq_hw_ctx *hctx, 3335 struct request *rq) 3336 { 3337 struct hrtimer_sleeper hs; 3338 enum hrtimer_mode mode; 3339 unsigned int nsecs; 3340 ktime_t kt; 3341 3342 if (rq->rq_flags & RQF_MQ_POLL_SLEPT) 3343 return false; 3344 3345 /* 3346 * If we get here, hybrid polling is enabled. Hence poll_nsec can be: 3347 * 3348 * 0: use half of prev avg 3349 * >0: use this specific value 3350 */ 3351 if (q->poll_nsec > 0) 3352 nsecs = q->poll_nsec; 3353 else 3354 nsecs = blk_mq_poll_nsecs(q, hctx, rq); 3355 3356 if (!nsecs) 3357 return false; 3358 3359 rq->rq_flags |= RQF_MQ_POLL_SLEPT; 3360 3361 /* 3362 * This will be replaced with the stats tracking code, using 3363 * 'avg_completion_time / 2' as the pre-sleep target. 3364 */ 3365 kt = nsecs; 3366 3367 mode = HRTIMER_MODE_REL; 3368 hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); 3369 hrtimer_set_expires(&hs.timer, kt); 3370 3371 hrtimer_init_sleeper(&hs, current); 3372 do { 3373 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) 3374 break; 3375 set_current_state(TASK_UNINTERRUPTIBLE); 3376 hrtimer_start_expires(&hs.timer, mode); 3377 if (hs.task) 3378 io_schedule(); 3379 hrtimer_cancel(&hs.timer); 3380 mode = HRTIMER_MODE_ABS; 3381 } while (hs.task && !signal_pending(current)); 3382 3383 __set_current_state(TASK_RUNNING); 3384 destroy_hrtimer_on_stack(&hs.timer); 3385 return true; 3386 } 3387 3388 static bool blk_mq_poll_hybrid(struct request_queue *q, 3389 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) 3390 { 3391 struct request *rq; 3392 3393 if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) 3394 return false; 3395 3396 if (!blk_qc_t_is_internal(cookie)) 3397 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 3398 else { 3399 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); 3400 /* 3401 * With scheduling, if the request has completed, we'll 3402 * get a NULL return here, as we clear the sched tag when 3403 * that happens. The request still remains valid, like always, 3404 * so we should be safe with just the NULL check. 3405 */ 3406 if (!rq) 3407 return false; 3408 } 3409 3410 return blk_mq_poll_hybrid_sleep(q, hctx, rq); 3411 } 3412 3413 /** 3414 * blk_poll - poll for IO completions 3415 * @q: the queue 3416 * @cookie: cookie passed back at IO submission time 3417 * @spin: whether to spin for completions 3418 * 3419 * Description: 3420 * Poll for completions on the passed in queue. Returns number of 3421 * completed entries found. If @spin is true, then blk_poll will continue 3422 * looping until at least one completion is found, unless the task is 3423 * otherwise marked running (or we need to reschedule). 3424 */ 3425 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) 3426 { 3427 struct blk_mq_hw_ctx *hctx; 3428 long state; 3429 3430 if (!blk_qc_t_valid(cookie) || 3431 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 3432 return 0; 3433 3434 if (current->plug) 3435 blk_flush_plug_list(current->plug, false); 3436 3437 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 3438 3439 /* 3440 * If we sleep, have the caller restart the poll loop to reset 3441 * the state. Like for the other success return cases, the 3442 * caller is responsible for checking if the IO completed. 
If 3443 * the IO isn't complete, we'll get called again and will go 3444 * straight to the busy poll loop. 3445 */ 3446 if (blk_mq_poll_hybrid(q, hctx, cookie)) 3447 return 1; 3448 3449 hctx->poll_considered++; 3450 3451 state = current->state; 3452 do { 3453 int ret; 3454 3455 hctx->poll_invoked++; 3456 3457 ret = q->mq_ops->poll(hctx); 3458 if (ret > 0) { 3459 hctx->poll_success++; 3460 __set_current_state(TASK_RUNNING); 3461 return ret; 3462 } 3463 3464 if (signal_pending_state(state, current)) 3465 __set_current_state(TASK_RUNNING); 3466 3467 if (current->state == TASK_RUNNING) 3468 return 1; 3469 if (ret < 0 || !spin) 3470 break; 3471 cpu_relax(); 3472 } while (!need_resched()); 3473 3474 __set_current_state(TASK_RUNNING); 3475 return 0; 3476 } 3477 EXPORT_SYMBOL_GPL(blk_poll); 3478 3479 unsigned int blk_mq_rq_cpu(struct request *rq) 3480 { 3481 return rq->mq_ctx->cpu; 3482 } 3483 EXPORT_SYMBOL(blk_mq_rq_cpu); 3484 3485 static int __init blk_mq_init(void) 3486 { 3487 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 3488 blk_mq_hctx_notify_dead); 3489 return 0; 3490 } 3491 subsys_initcall(blk_mq_init); 3492