// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>

#include <trace/events/block.h>

#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
static DEFINE_MUTEX(blk_mq_cpuhp_lock);

static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
		blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
		struct list_head *list);
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
		struct io_comp_batch *iob, unsigned int flags);

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
	struct mq_inflight *mi = priv;

	if (rq->rq_flags & RQF_IO_STAT &&
	    (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
				   &mi);
	inflight[READ] = mi.inflight[READ];
	inflight[WRITE] = mi.inflight[WRITE];
}

#ifdef CONFIG_LOCKDEP
static bool blk_freeze_set_owner(struct request_queue *q,
				 struct task_struct *owner)
{
	if (!owner)
		return false;

	if (!q->mq_freeze_depth) {
		q->mq_freeze_owner = owner;
		q->mq_freeze_owner_depth = 1;
		q->mq_freeze_disk_dead = !q->disk ||
			test_bit(GD_DEAD, &q->disk->state) ||
			!blk_queue_registered(q);
		q->mq_freeze_queue_dying = blk_queue_dying(q);
		return true;
	}

	if (owner == q->mq_freeze_owner)
		q->mq_freeze_owner_depth += 1;
	return false;
}

/* verify the last unfreeze in owner context */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
	if (q->mq_freeze_owner != current)
		return false;
	if (--q->mq_freeze_owner_depth == 0) {
		q->mq_freeze_owner = NULL;
		return true;
	}
	return false;
}

#else

static bool blk_freeze_set_owner(struct request_queue *q,
				 struct task_struct *owner)
{
	return false;
}

static bool blk_unfreeze_check_owner(struct request_queue *q)
{
	return false;
}
#endif

bool __blk_freeze_queue_start(struct request_queue *q,
			      struct task_struct *owner)
{
	bool freeze;

	mutex_lock(&q->mq_freeze_lock);
	freeze = blk_freeze_set_owner(q, owner);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}

	return freeze;
}

void blk_freeze_queue_start(struct request_queue *q)
{
	if (__blk_freeze_queue_start(q, current))
		blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
				  percpu_ref_is_zero(&q->q_usage_counter),
				  timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);

bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	bool unfreeze;

	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	unfreeze = blk_unfreeze_check_owner(q);
	mutex_unlock(&q->mq_freeze_lock);

	return unfreeze;
}

void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
{
	if (__blk_mq_unfreeze_queue(q, false))
		blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);

/*
 * non_owner variant of blk_freeze_queue_start
 *
 * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
 * by the same task. This is fragile and should not be used if at all
 * possible.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
	__blk_freeze_queue_start(q, NULL);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);

/* non_owner variant of blk_mq_unfreeze_queue */
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set. This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
	if (set->flags & BLK_MQ_F_BLOCKING)
		synchronize_srcu(set->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback from being invoked. Once this function has returned, no
 * dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
297 */ 298 void blk_mq_quiesce_queue(struct request_queue *q) 299 { 300 blk_mq_quiesce_queue_nowait(q); 301 /* nothing to wait for non-mq queues */ 302 if (queue_is_mq(q)) 303 blk_mq_wait_quiesce_done(q->tag_set); 304 } 305 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 306 307 /* 308 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 309 * @q: request queue. 310 * 311 * This function recovers queue into the state before quiescing 312 * which is done by blk_mq_quiesce_queue. 313 */ 314 void blk_mq_unquiesce_queue(struct request_queue *q) 315 { 316 unsigned long flags; 317 bool run_queue = false; 318 319 spin_lock_irqsave(&q->queue_lock, flags); 320 if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { 321 ; 322 } else if (!--q->quiesce_depth) { 323 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 324 run_queue = true; 325 } 326 spin_unlock_irqrestore(&q->queue_lock, flags); 327 328 /* dispatch requests which are inserted during quiescing */ 329 if (run_queue) 330 blk_mq_run_hw_queues(q, true); 331 } 332 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 333 334 void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) 335 { 336 struct request_queue *q; 337 338 mutex_lock(&set->tag_list_lock); 339 list_for_each_entry(q, &set->tag_list, tag_set_list) { 340 if (!blk_queue_skip_tagset_quiesce(q)) 341 blk_mq_quiesce_queue_nowait(q); 342 } 343 mutex_unlock(&set->tag_list_lock); 344 345 blk_mq_wait_quiesce_done(set); 346 } 347 EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); 348 349 void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) 350 { 351 struct request_queue *q; 352 353 mutex_lock(&set->tag_list_lock); 354 list_for_each_entry(q, &set->tag_list, tag_set_list) { 355 if (!blk_queue_skip_tagset_quiesce(q)) 356 blk_mq_unquiesce_queue(q); 357 } 358 mutex_unlock(&set->tag_list_lock); 359 } 360 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); 361 362 void blk_mq_wake_waiters(struct request_queue *q) 363 { 364 struct blk_mq_hw_ctx *hctx; 365 unsigned long i; 366 367 queue_for_each_hw_ctx(q, hctx, i) 368 if (blk_mq_hw_queue_mapped(hctx)) 369 blk_mq_tag_wakeup_all(hctx->tags, true); 370 } 371 372 void blk_rq_init(struct request_queue *q, struct request *rq) 373 { 374 memset(rq, 0, sizeof(*rq)); 375 376 INIT_LIST_HEAD(&rq->queuelist); 377 rq->q = q; 378 rq->__sector = (sector_t) -1; 379 rq->phys_gap_bit = 0; 380 INIT_HLIST_NODE(&rq->hash); 381 RB_CLEAR_NODE(&rq->rb_node); 382 rq->tag = BLK_MQ_NO_TAG; 383 rq->internal_tag = BLK_MQ_NO_TAG; 384 rq->start_time_ns = blk_time_get_ns(); 385 blk_crypto_rq_set_defaults(rq); 386 } 387 EXPORT_SYMBOL(blk_rq_init); 388 389 /* Set start and alloc time when the allocated request is actually used */ 390 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) 391 { 392 #ifdef CONFIG_BLK_RQ_ALLOC_TIME 393 if (blk_queue_rq_alloc_time(rq->q)) 394 rq->alloc_time_ns = alloc_time_ns; 395 else 396 rq->alloc_time_ns = 0; 397 #endif 398 } 399 400 static inline void blk_mq_bio_issue_init(struct request_queue *q, 401 struct bio *bio) 402 { 403 #ifdef CONFIG_BLK_CGROUP 404 if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) 405 bio->issue_time_ns = blk_time_get_ns(); 406 #endif 407 } 408 409 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 410 struct blk_mq_tags *tags, unsigned int tag) 411 { 412 struct blk_mq_ctx *ctx = data->ctx; 413 struct blk_mq_hw_ctx *hctx = data->hctx; 414 struct request_queue *q = data->q; 415 struct request *rq = tags->static_rqs[tag]; 416 417 rq->q = q; 418 rq->mq_ctx = ctx; 419 rq->mq_hctx = hctx; 420 rq->cmd_flags = 
data->cmd_flags; 421 422 if (data->flags & BLK_MQ_REQ_PM) 423 data->rq_flags |= RQF_PM; 424 rq->rq_flags = data->rq_flags; 425 426 if (data->rq_flags & RQF_SCHED_TAGS) { 427 rq->tag = BLK_MQ_NO_TAG; 428 rq->internal_tag = tag; 429 } else { 430 rq->tag = tag; 431 rq->internal_tag = BLK_MQ_NO_TAG; 432 } 433 rq->timeout = 0; 434 435 rq->part = NULL; 436 rq->io_start_time_ns = 0; 437 rq->stats_sectors = 0; 438 rq->nr_phys_segments = 0; 439 rq->nr_integrity_segments = 0; 440 rq->end_io = NULL; 441 rq->end_io_data = NULL; 442 443 blk_crypto_rq_set_defaults(rq); 444 INIT_LIST_HEAD(&rq->queuelist); 445 /* tag was already set */ 446 WRITE_ONCE(rq->deadline, 0); 447 req_ref_set(rq, 1); 448 449 if (rq->rq_flags & RQF_USE_SCHED) { 450 struct elevator_queue *e = data->q->elevator; 451 452 INIT_HLIST_NODE(&rq->hash); 453 RB_CLEAR_NODE(&rq->rb_node); 454 455 if (e->type->ops.prepare_request) 456 e->type->ops.prepare_request(rq); 457 } 458 459 return rq; 460 } 461 462 static inline struct request * 463 __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) 464 { 465 unsigned int tag, tag_offset; 466 struct blk_mq_tags *tags; 467 struct request *rq; 468 unsigned long tag_mask; 469 int i, nr = 0; 470 471 tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); 472 if (unlikely(!tag_mask)) 473 return NULL; 474 475 tags = blk_mq_tags_from_data(data); 476 for (i = 0; tag_mask; i++) { 477 if (!(tag_mask & (1UL << i))) 478 continue; 479 tag = tag_offset + i; 480 prefetch(tags->static_rqs[tag]); 481 tag_mask &= ~(1UL << i); 482 rq = blk_mq_rq_ctx_init(data, tags, tag); 483 rq_list_add_head(data->cached_rqs, rq); 484 nr++; 485 } 486 if (!(data->rq_flags & RQF_SCHED_TAGS)) 487 blk_mq_add_active_requests(data->hctx, nr); 488 /* caller already holds a reference, add for remainder */ 489 percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); 490 data->nr_tags -= nr; 491 492 return rq_list_pop(data->cached_rqs); 493 } 494 495 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) 496 { 497 struct request_queue *q = data->q; 498 u64 alloc_time_ns = 0; 499 struct request *rq; 500 unsigned int tag; 501 502 /* alloc_time includes depth and tag waits */ 503 if (blk_queue_rq_alloc_time(q)) 504 alloc_time_ns = blk_time_get_ns(); 505 506 if (data->cmd_flags & REQ_NOWAIT) 507 data->flags |= BLK_MQ_REQ_NOWAIT; 508 509 retry: 510 data->ctx = blk_mq_get_ctx(q); 511 data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); 512 513 if (q->elevator) { 514 /* 515 * All requests use scheduler tags when an I/O scheduler is 516 * enabled for the queue. 517 */ 518 data->rq_flags |= RQF_SCHED_TAGS; 519 520 /* 521 * Flush/passthrough requests are special and go directly to the 522 * dispatch list. 523 */ 524 if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && 525 !blk_op_is_passthrough(data->cmd_flags)) { 526 struct elevator_mq_ops *ops = &q->elevator->type->ops; 527 528 WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); 529 530 data->rq_flags |= RQF_USE_SCHED; 531 if (ops->limit_depth) 532 ops->limit_depth(data->cmd_flags, data); 533 } 534 } else { 535 blk_mq_tag_busy(data->hctx); 536 } 537 538 if (data->flags & BLK_MQ_REQ_RESERVED) 539 data->rq_flags |= RQF_RESV; 540 541 /* 542 * Try batched alloc if we want more than 1 tag. 543 */ 544 if (data->nr_tags > 1) { 545 rq = __blk_mq_alloc_requests_batch(data); 546 if (rq) { 547 blk_mq_rq_time_init(rq, alloc_time_ns); 548 return rq; 549 } 550 data->nr_tags = 1; 551 } 552 553 /* 554 * Waiting allocations only fail because of an inactive hctx. 
In that 555 * case just retry the hctx assignment and tag allocation as CPU hotplug 556 * should have migrated us to an online CPU by now. 557 */ 558 tag = blk_mq_get_tag(data); 559 if (tag == BLK_MQ_NO_TAG) { 560 if (data->flags & BLK_MQ_REQ_NOWAIT) 561 return NULL; 562 /* 563 * Give up the CPU and sleep for a random short time to 564 * ensure that thread using a realtime scheduling class 565 * are migrated off the CPU, and thus off the hctx that 566 * is going away. 567 */ 568 msleep(3); 569 goto retry; 570 } 571 572 if (!(data->rq_flags & RQF_SCHED_TAGS)) 573 blk_mq_inc_active_requests(data->hctx); 574 rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); 575 blk_mq_rq_time_init(rq, alloc_time_ns); 576 return rq; 577 } 578 579 static struct request *blk_mq_rq_cache_fill(struct request_queue *q, 580 struct blk_plug *plug, 581 blk_opf_t opf, 582 blk_mq_req_flags_t flags) 583 { 584 struct blk_mq_alloc_data data = { 585 .q = q, 586 .flags = flags, 587 .shallow_depth = 0, 588 .cmd_flags = opf, 589 .rq_flags = 0, 590 .nr_tags = plug->nr_ios, 591 .cached_rqs = &plug->cached_rqs, 592 .ctx = NULL, 593 .hctx = NULL 594 }; 595 struct request *rq; 596 597 if (blk_queue_enter(q, flags)) 598 return NULL; 599 600 plug->nr_ios = 1; 601 602 rq = __blk_mq_alloc_requests(&data); 603 if (unlikely(!rq)) 604 blk_queue_exit(q); 605 return rq; 606 } 607 608 static struct request *blk_mq_alloc_cached_request(struct request_queue *q, 609 blk_opf_t opf, 610 blk_mq_req_flags_t flags) 611 { 612 struct blk_plug *plug = current->plug; 613 struct request *rq; 614 615 if (!plug) 616 return NULL; 617 618 if (rq_list_empty(&plug->cached_rqs)) { 619 if (plug->nr_ios == 1) 620 return NULL; 621 rq = blk_mq_rq_cache_fill(q, plug, opf, flags); 622 if (!rq) 623 return NULL; 624 } else { 625 rq = rq_list_peek(&plug->cached_rqs); 626 if (!rq || rq->q != q) 627 return NULL; 628 629 if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) 630 return NULL; 631 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) 632 return NULL; 633 634 rq_list_pop(&plug->cached_rqs); 635 blk_mq_rq_time_init(rq, blk_time_get_ns()); 636 } 637 638 rq->cmd_flags = opf; 639 INIT_LIST_HEAD(&rq->queuelist); 640 return rq; 641 } 642 643 struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, 644 blk_mq_req_flags_t flags) 645 { 646 struct request *rq; 647 648 rq = blk_mq_alloc_cached_request(q, opf, flags); 649 if (!rq) { 650 struct blk_mq_alloc_data data = { 651 .q = q, 652 .flags = flags, 653 .shallow_depth = 0, 654 .cmd_flags = opf, 655 .rq_flags = 0, 656 .nr_tags = 1, 657 .cached_rqs = NULL, 658 .ctx = NULL, 659 .hctx = NULL 660 }; 661 int ret; 662 663 ret = blk_queue_enter(q, flags); 664 if (ret) 665 return ERR_PTR(ret); 666 667 rq = __blk_mq_alloc_requests(&data); 668 if (!rq) 669 goto out_queue_exit; 670 } 671 rq->__data_len = 0; 672 rq->phys_gap_bit = 0; 673 rq->__sector = (sector_t) -1; 674 rq->bio = rq->biotail = NULL; 675 return rq; 676 out_queue_exit: 677 blk_queue_exit(q); 678 return ERR_PTR(-EWOULDBLOCK); 679 } 680 EXPORT_SYMBOL(blk_mq_alloc_request); 681 682 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 683 blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) 684 { 685 struct blk_mq_alloc_data data = { 686 .q = q, 687 .flags = flags, 688 .shallow_depth = 0, 689 .cmd_flags = opf, 690 .rq_flags = 0, 691 .nr_tags = 1, 692 .cached_rqs = NULL, 693 .ctx = NULL, 694 .hctx = NULL 695 }; 696 u64 alloc_time_ns = 0; 697 struct request *rq; 698 unsigned int cpu; 699 unsigned int tag; 700 int ret; 
701 702 /* alloc_time includes depth and tag waits */ 703 if (blk_queue_rq_alloc_time(q)) 704 alloc_time_ns = blk_time_get_ns(); 705 706 /* 707 * If the tag allocator sleeps we could get an allocation for a 708 * different hardware context. No need to complicate the low level 709 * allocator for this for the rare use case of a command tied to 710 * a specific queue. 711 */ 712 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || 713 WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) 714 return ERR_PTR(-EINVAL); 715 716 if (hctx_idx >= q->nr_hw_queues) 717 return ERR_PTR(-EIO); 718 719 ret = blk_queue_enter(q, flags); 720 if (ret) 721 return ERR_PTR(ret); 722 723 /* 724 * Check if the hardware context is actually mapped to anything. 725 * If not tell the caller that it should skip this queue. 726 */ 727 ret = -EXDEV; 728 data.hctx = xa_load(&q->hctx_table, hctx_idx); 729 if (!blk_mq_hw_queue_mapped(data.hctx)) 730 goto out_queue_exit; 731 cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); 732 if (cpu >= nr_cpu_ids) 733 goto out_queue_exit; 734 data.ctx = __blk_mq_get_ctx(q, cpu); 735 736 if (q->elevator) 737 data.rq_flags |= RQF_SCHED_TAGS; 738 else 739 blk_mq_tag_busy(data.hctx); 740 741 if (flags & BLK_MQ_REQ_RESERVED) 742 data.rq_flags |= RQF_RESV; 743 744 ret = -EWOULDBLOCK; 745 tag = blk_mq_get_tag(&data); 746 if (tag == BLK_MQ_NO_TAG) 747 goto out_queue_exit; 748 if (!(data.rq_flags & RQF_SCHED_TAGS)) 749 blk_mq_inc_active_requests(data.hctx); 750 rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); 751 blk_mq_rq_time_init(rq, alloc_time_ns); 752 rq->__data_len = 0; 753 rq->phys_gap_bit = 0; 754 rq->__sector = (sector_t) -1; 755 rq->bio = rq->biotail = NULL; 756 return rq; 757 758 out_queue_exit: 759 blk_queue_exit(q); 760 return ERR_PTR(ret); 761 } 762 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 763 764 static void blk_mq_finish_request(struct request *rq) 765 { 766 struct request_queue *q = rq->q; 767 768 blk_zone_finish_request(rq); 769 770 if (rq->rq_flags & RQF_USE_SCHED) { 771 q->elevator->type->ops.finish_request(rq); 772 /* 773 * For postflush request that may need to be 774 * completed twice, we should clear this flag 775 * to avoid double finish_request() on the rq. 
776 */ 777 rq->rq_flags &= ~RQF_USE_SCHED; 778 } 779 } 780 781 static void __blk_mq_free_request(struct request *rq) 782 { 783 struct request_queue *q = rq->q; 784 struct blk_mq_ctx *ctx = rq->mq_ctx; 785 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 786 const int sched_tag = rq->internal_tag; 787 788 blk_crypto_free_request(rq); 789 blk_pm_mark_last_busy(rq); 790 rq->mq_hctx = NULL; 791 792 if (rq->tag != BLK_MQ_NO_TAG) { 793 blk_mq_dec_active_requests(hctx); 794 blk_mq_put_tag(hctx->tags, ctx, rq->tag); 795 } 796 if (sched_tag != BLK_MQ_NO_TAG) 797 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); 798 blk_mq_sched_restart(hctx); 799 blk_queue_exit(q); 800 } 801 802 void blk_mq_free_request(struct request *rq) 803 { 804 struct request_queue *q = rq->q; 805 806 blk_mq_finish_request(rq); 807 808 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 809 laptop_io_completion(q->disk->bdi); 810 811 rq_qos_done(q, rq); 812 813 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 814 if (req_ref_put_and_test(rq)) 815 __blk_mq_free_request(rq); 816 } 817 EXPORT_SYMBOL_GPL(blk_mq_free_request); 818 819 void blk_mq_free_plug_rqs(struct blk_plug *plug) 820 { 821 struct request *rq; 822 823 while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) 824 blk_mq_free_request(rq); 825 } 826 827 void blk_dump_rq_flags(struct request *rq, char *msg) 828 { 829 printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, 830 rq->q->disk ? rq->q->disk->disk_name : "?", 831 (__force unsigned long long) rq->cmd_flags); 832 833 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 834 (unsigned long long)blk_rq_pos(rq), 835 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 836 printk(KERN_INFO " bio %p, biotail %p, len %u\n", 837 rq->bio, rq->biotail, blk_rq_bytes(rq)); 838 } 839 EXPORT_SYMBOL(blk_dump_rq_flags); 840 841 static void blk_account_io_completion(struct request *req, unsigned int bytes) 842 { 843 if (req->rq_flags & RQF_IO_STAT) { 844 const int sgrp = op_stat_group(req_op(req)); 845 846 part_stat_lock(); 847 part_stat_add(req->part, sectors[sgrp], bytes >> 9); 848 part_stat_unlock(); 849 } 850 } 851 852 static void blk_print_req_error(struct request *req, blk_status_t status) 853 { 854 printk_ratelimited(KERN_ERR 855 "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " 856 "phys_seg %u prio class %u\n", 857 blk_status_to_str(status), 858 req->q->disk ? req->q->disk->disk_name : "?", 859 blk_rq_pos(req), (__force u32)req_op(req), 860 blk_op_str(req_op(req)), 861 (__force u32)(req->cmd_flags & ~REQ_OP_MASK), 862 req->nr_phys_segments, 863 IOPRIO_PRIO_CLASS(req_get_ioprio(req))); 864 } 865 866 /* 867 * Fully end IO on a request. Does not support partial completions, or 868 * errors. 869 */ 870 static void blk_complete_request(struct request *req) 871 { 872 const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; 873 int total_bytes = blk_rq_bytes(req); 874 struct bio *bio = req->bio; 875 876 trace_block_rq_complete(req, BLK_STS_OK, total_bytes); 877 878 if (!bio) 879 return; 880 881 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) 882 blk_integrity_complete(req, total_bytes); 883 884 /* 885 * Upper layers may call blk_crypto_evict_key() anytime after the last 886 * bio_endio(). Therefore, the keyslot must be released before that. 
887 */ 888 blk_crypto_rq_put_keyslot(req); 889 890 blk_account_io_completion(req, total_bytes); 891 892 do { 893 struct bio *next = bio->bi_next; 894 895 /* Completion has already been traced */ 896 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 897 898 if (blk_req_bio_is_zone_append(req, bio)) 899 blk_zone_append_update_request_bio(req, bio); 900 901 if (!is_flush) 902 bio_endio(bio); 903 bio = next; 904 } while (bio); 905 906 /* 907 * Reset counters so that the request stacking driver 908 * can find how many bytes remain in the request 909 * later. 910 */ 911 if (!req->end_io) { 912 req->bio = NULL; 913 req->__data_len = 0; 914 } 915 } 916 917 /** 918 * blk_update_request - Complete multiple bytes without completing the request 919 * @req: the request being processed 920 * @error: block status code 921 * @nr_bytes: number of bytes to complete for @req 922 * 923 * Description: 924 * Ends I/O on a number of bytes attached to @req, but doesn't complete 925 * the request structure even if @req doesn't have leftover. 926 * If @req has leftover, sets it up for the next range of segments. 927 * 928 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 929 * %false return from this function. 930 * 931 * Note: 932 * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function 933 * except in the consistency check at the end of this function. 934 * 935 * Return: 936 * %false - this request doesn't have any more data 937 * %true - this request has more data 938 **/ 939 bool blk_update_request(struct request *req, blk_status_t error, 940 unsigned int nr_bytes) 941 { 942 bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; 943 bool quiet = req->rq_flags & RQF_QUIET; 944 int total_bytes; 945 946 trace_block_rq_complete(req, error, nr_bytes); 947 948 if (!req->bio) 949 return false; 950 951 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && 952 error == BLK_STS_OK) 953 blk_integrity_complete(req, nr_bytes); 954 955 /* 956 * Upper layers may call blk_crypto_evict_key() anytime after the last 957 * bio_endio(). Therefore, the keyslot must be released before that. 958 */ 959 if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) 960 __blk_crypto_rq_put_keyslot(req); 961 962 if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && 963 !test_bit(GD_DEAD, &req->q->disk->state)) { 964 blk_print_req_error(req, error); 965 trace_block_rq_error(req, error, nr_bytes); 966 } 967 968 blk_account_io_completion(req, nr_bytes); 969 970 total_bytes = 0; 971 while (req->bio) { 972 struct bio *bio = req->bio; 973 unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); 974 975 if (unlikely(error)) 976 bio->bi_status = error; 977 978 if (bio_bytes == bio->bi_iter.bi_size) { 979 req->bio = bio->bi_next; 980 } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { 981 /* 982 * Partial zone append completions cannot be supported 983 * as the BIO fragments may end up not being written 984 * sequentially. 
985 */ 986 bio->bi_status = BLK_STS_IOERR; 987 } 988 989 /* Completion has already been traced */ 990 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 991 if (unlikely(quiet)) 992 bio_set_flag(bio, BIO_QUIET); 993 994 bio_advance(bio, bio_bytes); 995 996 /* Don't actually finish bio if it's part of flush sequence */ 997 if (!bio->bi_iter.bi_size) { 998 if (blk_req_bio_is_zone_append(req, bio)) 999 blk_zone_append_update_request_bio(req, bio); 1000 if (!is_flush) 1001 bio_endio(bio); 1002 } 1003 1004 total_bytes += bio_bytes; 1005 nr_bytes -= bio_bytes; 1006 1007 if (!nr_bytes) 1008 break; 1009 } 1010 1011 /* 1012 * completely done 1013 */ 1014 if (!req->bio) { 1015 /* 1016 * Reset counters so that the request stacking driver 1017 * can find how many bytes remain in the request 1018 * later. 1019 */ 1020 req->__data_len = 0; 1021 return false; 1022 } 1023 1024 req->__data_len -= total_bytes; 1025 1026 /* update sector only for requests with clear definition of sector */ 1027 if (!blk_rq_is_passthrough(req)) 1028 req->__sector += total_bytes >> 9; 1029 1030 /* mixed attributes always follow the first bio */ 1031 if (req->rq_flags & RQF_MIXED_MERGE) { 1032 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1033 req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; 1034 } 1035 1036 if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { 1037 /* 1038 * If total number of sectors is less than the first segment 1039 * size, something has gone terribly wrong. 1040 */ 1041 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 1042 blk_dump_rq_flags(req, "request botched"); 1043 req->__data_len = blk_rq_cur_bytes(req); 1044 } 1045 1046 /* recalculate the number of segments */ 1047 req->nr_phys_segments = blk_recalc_rq_segments(req); 1048 } 1049 1050 return true; 1051 } 1052 EXPORT_SYMBOL_GPL(blk_update_request); 1053 1054 static inline void blk_account_io_done(struct request *req, u64 now) 1055 { 1056 trace_block_io_done(req); 1057 1058 /* 1059 * Account IO completion. flush_rq isn't accounted as a 1060 * normal IO on queueing nor completion. Accounting the 1061 * containing request is enough. 1062 */ 1063 if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { 1064 const int sgrp = op_stat_group(req_op(req)); 1065 1066 part_stat_lock(); 1067 update_io_ticks(req->part, jiffies, true); 1068 part_stat_inc(req->part, ios[sgrp]); 1069 part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); 1070 part_stat_local_dec(req->part, 1071 in_flight[op_is_write(req_op(req))]); 1072 part_stat_unlock(); 1073 } 1074 } 1075 1076 static inline bool blk_rq_passthrough_stats(struct request *req) 1077 { 1078 struct bio *bio = req->bio; 1079 1080 if (!blk_queue_passthrough_stat(req->q)) 1081 return false; 1082 1083 /* Requests without a bio do not transfer data. */ 1084 if (!bio) 1085 return false; 1086 1087 /* 1088 * Stats are accumulated in the bdev, so must have one attached to a 1089 * bio to track stats. Most drivers do not set the bdev for passthrough 1090 * requests, but nvme is one that will set it. 1091 */ 1092 if (!bio->bi_bdev) 1093 return false; 1094 1095 /* 1096 * We don't know what a passthrough command does, but we know the 1097 * payload size and data direction. Ensuring the size is aligned to the 1098 * block size filters out most commands with payloads that don't 1099 * represent sector access. 
1100 */ 1101 if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) 1102 return false; 1103 return true; 1104 } 1105 1106 static inline void blk_account_io_start(struct request *req) 1107 { 1108 trace_block_io_start(req); 1109 1110 if (!blk_queue_io_stat(req->q)) 1111 return; 1112 if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) 1113 return; 1114 1115 req->rq_flags |= RQF_IO_STAT; 1116 req->start_time_ns = blk_time_get_ns(); 1117 1118 /* 1119 * All non-passthrough requests are created from a bio with one 1120 * exception: when a flush command that is part of a flush sequence 1121 * generated by the state machine in blk-flush.c is cloned onto the 1122 * lower device by dm-multipath we can get here without a bio. 1123 */ 1124 if (req->bio) 1125 req->part = req->bio->bi_bdev; 1126 else 1127 req->part = req->q->disk->part0; 1128 1129 part_stat_lock(); 1130 update_io_ticks(req->part, jiffies, false); 1131 part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); 1132 part_stat_unlock(); 1133 } 1134 1135 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) 1136 { 1137 if (rq->rq_flags & RQF_STATS) 1138 blk_stat_add(rq, now); 1139 1140 blk_mq_sched_completed_request(rq, now); 1141 blk_account_io_done(rq, now); 1142 } 1143 1144 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 1145 { 1146 if (blk_mq_need_time_stamp(rq)) 1147 __blk_mq_end_request_acct(rq, blk_time_get_ns()); 1148 1149 blk_mq_finish_request(rq); 1150 1151 if (rq->end_io) { 1152 rq_qos_done(rq->q, rq); 1153 if (rq->end_io(rq, error) == RQ_END_IO_FREE) 1154 blk_mq_free_request(rq); 1155 } else { 1156 blk_mq_free_request(rq); 1157 } 1158 } 1159 EXPORT_SYMBOL(__blk_mq_end_request); 1160 1161 void blk_mq_end_request(struct request *rq, blk_status_t error) 1162 { 1163 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 1164 BUG(); 1165 __blk_mq_end_request(rq, error); 1166 } 1167 EXPORT_SYMBOL(blk_mq_end_request); 1168 1169 #define TAG_COMP_BATCH 32 1170 1171 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, 1172 int *tag_array, int nr_tags) 1173 { 1174 struct request_queue *q = hctx->queue; 1175 1176 blk_mq_sub_active_requests(hctx, nr_tags); 1177 1178 blk_mq_put_tags(hctx->tags, tag_array, nr_tags); 1179 percpu_ref_put_many(&q->q_usage_counter, nr_tags); 1180 } 1181 1182 void blk_mq_end_request_batch(struct io_comp_batch *iob) 1183 { 1184 int tags[TAG_COMP_BATCH], nr_tags = 0; 1185 struct blk_mq_hw_ctx *cur_hctx = NULL; 1186 struct request *rq; 1187 u64 now = 0; 1188 1189 if (iob->need_ts) 1190 now = blk_time_get_ns(); 1191 1192 while ((rq = rq_list_pop(&iob->req_list)) != NULL) { 1193 prefetch(rq->bio); 1194 prefetch(rq->rq_next); 1195 1196 blk_complete_request(rq); 1197 if (iob->need_ts) 1198 __blk_mq_end_request_acct(rq, now); 1199 1200 blk_mq_finish_request(rq); 1201 1202 rq_qos_done(rq->q, rq); 1203 1204 /* 1205 * If end_io handler returns NONE, then it still has 1206 * ownership of the request. 
1207 */ 1208 if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) 1209 continue; 1210 1211 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 1212 if (!req_ref_put_and_test(rq)) 1213 continue; 1214 1215 blk_crypto_free_request(rq); 1216 blk_pm_mark_last_busy(rq); 1217 1218 if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { 1219 if (cur_hctx) 1220 blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); 1221 nr_tags = 0; 1222 cur_hctx = rq->mq_hctx; 1223 } 1224 tags[nr_tags++] = rq->tag; 1225 } 1226 1227 if (nr_tags) 1228 blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); 1229 } 1230 EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); 1231 1232 static void blk_complete_reqs(struct llist_head *list) 1233 { 1234 struct llist_node *entry = llist_reverse_order(llist_del_all(list)); 1235 struct request *rq, *next; 1236 1237 llist_for_each_entry_safe(rq, next, entry, ipi_list) 1238 rq->q->mq_ops->complete(rq); 1239 } 1240 1241 static __latent_entropy void blk_done_softirq(void) 1242 { 1243 blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); 1244 } 1245 1246 static int blk_softirq_cpu_dead(unsigned int cpu) 1247 { 1248 blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); 1249 return 0; 1250 } 1251 1252 static void __blk_mq_complete_request_remote(void *data) 1253 { 1254 __raise_softirq_irqoff(BLOCK_SOFTIRQ); 1255 } 1256 1257 static inline bool blk_mq_complete_need_ipi(struct request *rq) 1258 { 1259 int cpu = raw_smp_processor_id(); 1260 1261 if (!IS_ENABLED(CONFIG_SMP) || 1262 !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) 1263 return false; 1264 /* 1265 * With force threaded interrupts enabled, raising softirq from an SMP 1266 * function call will always result in waking the ksoftirqd thread. 1267 * This is probably worse than completing the request on a different 1268 * cache domain. 1269 */ 1270 if (force_irqthreads()) 1271 return false; 1272 1273 /* same CPU or cache domain and capacity? Complete locally */ 1274 if (cpu == rq->mq_ctx->cpu || 1275 (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && 1276 cpus_share_cache(cpu, rq->mq_ctx->cpu) && 1277 cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) 1278 return false; 1279 1280 /* don't try to IPI to an offline CPU */ 1281 return cpu_online(rq->mq_ctx->cpu); 1282 } 1283 1284 static void blk_mq_complete_send_ipi(struct request *rq) 1285 { 1286 unsigned int cpu; 1287 1288 cpu = rq->mq_ctx->cpu; 1289 if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) 1290 smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); 1291 } 1292 1293 static void blk_mq_raise_softirq(struct request *rq) 1294 { 1295 struct llist_head *list; 1296 1297 preempt_disable(); 1298 list = this_cpu_ptr(&blk_cpu_done); 1299 if (llist_add(&rq->ipi_list, list)) 1300 raise_softirq(BLOCK_SOFTIRQ); 1301 preempt_enable(); 1302 } 1303 1304 bool blk_mq_complete_request_remote(struct request *rq) 1305 { 1306 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); 1307 1308 /* 1309 * For request which hctx has only one ctx mapping, 1310 * or a polled request, always complete locally, 1311 * it's pointless to redirect the completion. 
1312 */ 1313 if ((rq->mq_hctx->nr_ctx == 1 && 1314 rq->mq_ctx->cpu == raw_smp_processor_id()) || 1315 rq->cmd_flags & REQ_POLLED) 1316 return false; 1317 1318 if (blk_mq_complete_need_ipi(rq)) { 1319 blk_mq_complete_send_ipi(rq); 1320 return true; 1321 } 1322 1323 if (rq->q->nr_hw_queues == 1) { 1324 blk_mq_raise_softirq(rq); 1325 return true; 1326 } 1327 return false; 1328 } 1329 EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); 1330 1331 /** 1332 * blk_mq_complete_request - end I/O on a request 1333 * @rq: the request being processed 1334 * 1335 * Description: 1336 * Complete a request by scheduling the ->complete_rq operation. 1337 **/ 1338 void blk_mq_complete_request(struct request *rq) 1339 { 1340 if (!blk_mq_complete_request_remote(rq)) 1341 rq->q->mq_ops->complete(rq); 1342 } 1343 EXPORT_SYMBOL(blk_mq_complete_request); 1344 1345 /** 1346 * blk_mq_start_request - Start processing a request 1347 * @rq: Pointer to request to be started 1348 * 1349 * Function used by device drivers to notify the block layer that a request 1350 * is going to be processed now, so blk layer can do proper initializations 1351 * such as starting the timeout timer. 1352 */ 1353 void blk_mq_start_request(struct request *rq) 1354 { 1355 struct request_queue *q = rq->q; 1356 1357 trace_block_rq_issue(rq); 1358 1359 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && 1360 !blk_rq_is_passthrough(rq)) { 1361 rq->io_start_time_ns = blk_time_get_ns(); 1362 rq->stats_sectors = blk_rq_sectors(rq); 1363 rq->rq_flags |= RQF_STATS; 1364 rq_qos_issue(q, rq); 1365 } 1366 1367 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 1368 1369 blk_add_timer(rq); 1370 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); 1371 rq->mq_hctx->tags->rqs[rq->tag] = rq; 1372 1373 if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) 1374 blk_integrity_prepare(rq); 1375 1376 if (rq->bio && rq->bio->bi_opf & REQ_POLLED) 1377 WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); 1378 } 1379 EXPORT_SYMBOL(blk_mq_start_request); 1380 1381 /* 1382 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple 1383 * queues. This is important for md arrays to benefit from merging 1384 * requests. 1385 */ 1386 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) 1387 { 1388 if (plug->multiple_queues) 1389 return BLK_MAX_REQUEST_COUNT * 2; 1390 return BLK_MAX_REQUEST_COUNT; 1391 } 1392 1393 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 1394 { 1395 struct request *last = rq_list_peek(&plug->mq_list); 1396 1397 if (!plug->rq_count) { 1398 trace_block_plug(rq->q); 1399 } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || 1400 (!blk_queue_nomerges(rq->q) && 1401 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1402 blk_mq_flush_plug_list(plug, false); 1403 last = NULL; 1404 trace_block_plug(rq->q); 1405 } 1406 1407 if (!plug->multiple_queues && last && last->q != rq->q) 1408 plug->multiple_queues = true; 1409 /* 1410 * Any request allocated from sched tags can't be issued to 1411 * ->queue_rqs() directly 1412 */ 1413 if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) 1414 plug->has_elevator = true; 1415 rq_list_add_tail(&plug->mq_list, rq); 1416 plug->rq_count++; 1417 } 1418 1419 /** 1420 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution 1421 * @rq: request to insert 1422 * @at_head: insert request at head or tail of queue 1423 * 1424 * Description: 1425 * Insert a fully prepared request at the back of the I/O scheduler queue 1426 * for execution. 
Don't wait for completion. 1427 * 1428 * Note: 1429 * This function will invoke @done directly if the queue is dead. 1430 */ 1431 void blk_execute_rq_nowait(struct request *rq, bool at_head) 1432 { 1433 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1434 1435 WARN_ON(irqs_disabled()); 1436 WARN_ON(!blk_rq_is_passthrough(rq)); 1437 1438 blk_account_io_start(rq); 1439 1440 if (current->plug && !at_head) { 1441 blk_add_rq_to_plug(current->plug, rq); 1442 return; 1443 } 1444 1445 blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); 1446 blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 1447 } 1448 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 1449 1450 struct blk_rq_wait { 1451 struct completion done; 1452 blk_status_t ret; 1453 }; 1454 1455 static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) 1456 { 1457 struct blk_rq_wait *wait = rq->end_io_data; 1458 1459 wait->ret = ret; 1460 complete(&wait->done); 1461 return RQ_END_IO_NONE; 1462 } 1463 1464 bool blk_rq_is_poll(struct request *rq) 1465 { 1466 if (!rq->mq_hctx) 1467 return false; 1468 if (rq->mq_hctx->type != HCTX_TYPE_POLL) 1469 return false; 1470 return true; 1471 } 1472 EXPORT_SYMBOL_GPL(blk_rq_is_poll); 1473 1474 static void blk_rq_poll_completion(struct request *rq, struct completion *wait) 1475 { 1476 do { 1477 blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); 1478 cond_resched(); 1479 } while (!completion_done(wait)); 1480 } 1481 1482 /** 1483 * blk_execute_rq - insert a request into queue for execution 1484 * @rq: request to insert 1485 * @at_head: insert request at head or tail of queue 1486 * 1487 * Description: 1488 * Insert a fully prepared request at the back of the I/O scheduler queue 1489 * for execution and wait for completion. 1490 * Return: The blk_status_t result provided to blk_mq_end_request(). 1491 */ 1492 blk_status_t blk_execute_rq(struct request *rq, bool at_head) 1493 { 1494 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1495 struct blk_rq_wait wait = { 1496 .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), 1497 }; 1498 1499 WARN_ON(irqs_disabled()); 1500 WARN_ON(!blk_rq_is_passthrough(rq)); 1501 1502 rq->end_io_data = &wait; 1503 rq->end_io = blk_end_sync_rq; 1504 1505 blk_account_io_start(rq); 1506 blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); 1507 blk_mq_run_hw_queue(hctx, false); 1508 1509 if (blk_rq_is_poll(rq)) 1510 blk_rq_poll_completion(rq, &wait.done); 1511 else 1512 blk_wait_io(&wait.done); 1513 1514 return wait.ret; 1515 } 1516 EXPORT_SYMBOL(blk_execute_rq); 1517 1518 static void __blk_mq_requeue_request(struct request *rq) 1519 { 1520 struct request_queue *q = rq->q; 1521 1522 blk_mq_put_driver_tag(rq); 1523 1524 trace_block_rq_requeue(rq); 1525 rq_qos_requeue(q, rq); 1526 1527 if (blk_mq_request_started(rq)) { 1528 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 1529 rq->rq_flags &= ~RQF_TIMED_OUT; 1530 } 1531 } 1532 1533 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 1534 { 1535 struct request_queue *q = rq->q; 1536 unsigned long flags; 1537 1538 __blk_mq_requeue_request(rq); 1539 1540 /* this request will be re-inserted to io scheduler queue */ 1541 blk_mq_sched_requeue_request(rq); 1542 1543 spin_lock_irqsave(&q->requeue_lock, flags); 1544 list_add_tail(&rq->queuelist, &q->requeue_list); 1545 spin_unlock_irqrestore(&q->requeue_lock, flags); 1546 1547 if (kick_requeue_list) 1548 blk_mq_kick_requeue_list(q); 1549 } 1550 EXPORT_SYMBOL(blk_mq_requeue_request); 1551 1552 static void blk_mq_requeue_work(struct work_struct *work) 1553 { 1554 struct request_queue *q = 1555 container_of(work, struct request_queue, requeue_work.work); 1556 LIST_HEAD(rq_list); 1557 LIST_HEAD(flush_list); 1558 struct request *rq; 1559 1560 spin_lock_irq(&q->requeue_lock); 1561 list_splice_init(&q->requeue_list, &rq_list); 1562 list_splice_init(&q->flush_list, &flush_list); 1563 spin_unlock_irq(&q->requeue_lock); 1564 1565 while (!list_empty(&rq_list)) { 1566 rq = list_entry(rq_list.next, struct request, queuelist); 1567 list_del_init(&rq->queuelist); 1568 /* 1569 * If RQF_DONTPREP is set, the request has been started by the 1570 * driver already and might have driver-specific data allocated 1571 * already. Insert it into the hctx dispatch list to avoid 1572 * block layer merges for the request. 1573 */ 1574 if (rq->rq_flags & RQF_DONTPREP) 1575 blk_mq_request_bypass_insert(rq, 0); 1576 else 1577 blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); 1578 } 1579 1580 while (!list_empty(&flush_list)) { 1581 rq = list_entry(flush_list.next, struct request, queuelist); 1582 list_del_init(&rq->queuelist); 1583 blk_mq_insert_request(rq, 0); 1584 } 1585 1586 blk_mq_run_hw_queues(q, false); 1587 } 1588 1589 void blk_mq_kick_requeue_list(struct request_queue *q) 1590 { 1591 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); 1592 } 1593 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 1594 1595 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 1596 unsigned long msecs) 1597 { 1598 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 1599 msecs_to_jiffies(msecs)); 1600 } 1601 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 1602 1603 static bool blk_is_flush_data_rq(struct request *rq) 1604 { 1605 return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); 1606 } 1607 1608 static bool blk_mq_rq_inflight(struct request *rq, void *priv) 1609 { 1610 /* 1611 * If we find a request that isn't idle we know the queue is busy 1612 * as it's checked in the iter. 1613 * Return false to stop the iteration. 
1614 * 1615 * In case of queue quiesce, if one flush data request is completed, 1616 * don't count it as inflight given the flush sequence is suspended, 1617 * and the original flush data request is invisible to driver, just 1618 * like other pending requests because of quiesce 1619 */ 1620 if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && 1621 blk_is_flush_data_rq(rq) && 1622 blk_mq_request_completed(rq))) { 1623 bool *busy = priv; 1624 1625 *busy = true; 1626 return false; 1627 } 1628 1629 return true; 1630 } 1631 1632 bool blk_mq_queue_inflight(struct request_queue *q) 1633 { 1634 bool busy = false; 1635 1636 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); 1637 return busy; 1638 } 1639 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); 1640 1641 static void blk_mq_rq_timed_out(struct request *req) 1642 { 1643 req->rq_flags |= RQF_TIMED_OUT; 1644 if (req->q->mq_ops->timeout) { 1645 enum blk_eh_timer_return ret; 1646 1647 ret = req->q->mq_ops->timeout(req); 1648 if (ret == BLK_EH_DONE) 1649 return; 1650 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 1651 } 1652 1653 blk_add_timer(req); 1654 } 1655 1656 struct blk_expired_data { 1657 bool has_timedout_rq; 1658 unsigned long next; 1659 unsigned long timeout_start; 1660 }; 1661 1662 static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) 1663 { 1664 unsigned long deadline; 1665 1666 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 1667 return false; 1668 if (rq->rq_flags & RQF_TIMED_OUT) 1669 return false; 1670 1671 deadline = READ_ONCE(rq->deadline); 1672 if (time_after_eq(expired->timeout_start, deadline)) 1673 return true; 1674 1675 if (expired->next == 0) 1676 expired->next = deadline; 1677 else if (time_after(expired->next, deadline)) 1678 expired->next = deadline; 1679 return false; 1680 } 1681 1682 void blk_mq_put_rq_ref(struct request *rq) 1683 { 1684 if (is_flush_rq(rq)) { 1685 if (rq->end_io(rq, 0) == RQ_END_IO_FREE) 1686 blk_mq_free_request(rq); 1687 } else if (req_ref_put_and_test(rq)) { 1688 __blk_mq_free_request(rq); 1689 } 1690 } 1691 1692 static bool blk_mq_check_expired(struct request *rq, void *priv) 1693 { 1694 struct blk_expired_data *expired = priv; 1695 1696 /* 1697 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot 1698 * be reallocated underneath the timeout handler's processing, then 1699 * the expire check is reliable. If the request is not expired, then 1700 * it was completed and reallocated as a new request after returning 1701 * from blk_mq_check_expired(). 1702 */ 1703 if (blk_mq_req_expired(rq, expired)) { 1704 expired->has_timedout_rq = true; 1705 return false; 1706 } 1707 return true; 1708 } 1709 1710 static bool blk_mq_handle_expired(struct request *rq, void *priv) 1711 { 1712 struct blk_expired_data *expired = priv; 1713 1714 if (blk_mq_req_expired(rq, expired)) 1715 blk_mq_rq_timed_out(rq); 1716 return true; 1717 } 1718 1719 static void blk_mq_timeout_work(struct work_struct *work) 1720 { 1721 struct request_queue *q = 1722 container_of(work, struct request_queue, timeout_work); 1723 struct blk_expired_data expired = { 1724 .timeout_start = jiffies, 1725 }; 1726 struct blk_mq_hw_ctx *hctx; 1727 unsigned long i; 1728 1729 /* A deadlock might occur if a request is stuck requiring a 1730 * timeout at the same time a queue freeze is waiting 1731 * completion, since the timeout code would not be able to 1732 * acquire the queue reference here. 
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	/* check if there is any timed-out request */
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
	if (expired.has_timedout_rq) {
		/*
		 * Before walking tags, we must ensure any submit started
		 * before the current time has finished. Since the submit
		 * uses srcu or rcu, wait for a synchronization point to
		 * ensure all running submits have finished.
		 */
		blk_mq_wait_quiesce_done(q->tag_set);

		expired.next = 0;
		blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
	}

	if (expired.next != 0) {
		mod_timer(&q->timeout, expired.next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ?
start->index_hw[hctx->type] : 0; 1840 struct dispatch_rq_data data = { 1841 .hctx = hctx, 1842 .rq = NULL, 1843 }; 1844 1845 __sbitmap_for_each_set(&hctx->ctx_map, off, 1846 dispatch_rq_from_ctx, &data); 1847 1848 return data.rq; 1849 } 1850 1851 bool __blk_mq_alloc_driver_tag(struct request *rq) 1852 { 1853 struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; 1854 unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; 1855 int tag; 1856 1857 blk_mq_tag_busy(rq->mq_hctx); 1858 1859 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { 1860 bt = &rq->mq_hctx->tags->breserved_tags; 1861 tag_offset = 0; 1862 } else { 1863 if (!hctx_may_queue(rq->mq_hctx, bt)) 1864 return false; 1865 } 1866 1867 tag = __sbitmap_queue_get(bt); 1868 if (tag == BLK_MQ_NO_TAG) 1869 return false; 1870 1871 rq->tag = tag + tag_offset; 1872 blk_mq_inc_active_requests(rq->mq_hctx); 1873 return true; 1874 } 1875 1876 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1877 int flags, void *key) 1878 { 1879 struct blk_mq_hw_ctx *hctx; 1880 1881 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1882 1883 spin_lock(&hctx->dispatch_wait_lock); 1884 if (!list_empty(&wait->entry)) { 1885 struct sbitmap_queue *sbq; 1886 1887 list_del_init(&wait->entry); 1888 sbq = &hctx->tags->bitmap_tags; 1889 atomic_dec(&sbq->ws_active); 1890 } 1891 spin_unlock(&hctx->dispatch_wait_lock); 1892 1893 blk_mq_run_hw_queue(hctx, true); 1894 return 1; 1895 } 1896 1897 /* 1898 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1899 * the tag wakeups. For non-shared tags, we can simply mark us needing a 1900 * restart. For both cases, take care to check the condition again after 1901 * marking us as waiting. 1902 */ 1903 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1904 struct request *rq) 1905 { 1906 struct sbitmap_queue *sbq; 1907 struct wait_queue_head *wq; 1908 wait_queue_entry_t *wait; 1909 bool ret; 1910 1911 if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && 1912 !(blk_mq_is_shared_tags(hctx->flags))) { 1913 blk_mq_sched_mark_restart_hctx(hctx); 1914 1915 /* 1916 * It's possible that a tag was freed in the window between the 1917 * allocation failure and adding the hardware queue to the wait 1918 * queue. 1919 * 1920 * Don't clear RESTART here, someone else could have set it. 1921 * At most this will cost an extra queue run. 1922 */ 1923 return blk_mq_get_driver_tag(rq); 1924 } 1925 1926 wait = &hctx->dispatch_wait; 1927 if (!list_empty_careful(&wait->entry)) 1928 return false; 1929 1930 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) 1931 sbq = &hctx->tags->breserved_tags; 1932 else 1933 sbq = &hctx->tags->bitmap_tags; 1934 wq = &bt_wait_ptr(sbq, hctx)->wait; 1935 1936 spin_lock_irq(&wq->lock); 1937 spin_lock(&hctx->dispatch_wait_lock); 1938 if (!list_empty(&wait->entry)) { 1939 spin_unlock(&hctx->dispatch_wait_lock); 1940 spin_unlock_irq(&wq->lock); 1941 return false; 1942 } 1943 1944 atomic_inc(&sbq->ws_active); 1945 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 1946 __add_wait_queue(wq, wait); 1947 1948 /* 1949 * Add one explicit barrier since blk_mq_get_driver_tag() may 1950 * not imply barrier in case of failure. 1951 * 1952 * Order adding us to wait queue and allocating driver tag. 
	 *
	 * The pair is the one implied in sbitmap_queue_wake_up(), which
	 * orders clearing the sbitmap tag bits against waitqueue_active() in
	 * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless.
	 *
	 * Otherwise, reordering adding ourselves to the wait queue and
	 * getting the driver tag may cause __sbitmap_queue_wake_up() to
	 * wake up nothing because waitqueue_active() may not observe us on
	 * the wait queue.
	 */
	smp_mb();

	/*
	 * It's possible that a tag was freed in the window between the
	 * allocation failure and adding the hardware queue to the wait
	 * queue.
	 */
	ret = blk_mq_get_driver_tag(rq);
	if (!ret) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	/*
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * someone else gets the wakeup.
	 */
	list_del_init(&wait->entry);
	atomic_dec(&sbq->ws_active);
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
/*
 * Update dispatch_busy with an Exponentially Weighted Moving Average (EWMA):
 * - EWMA is a simple way to compute a running average value
 * - a weight (7/8 and 1/8) is applied so that it decreases exponentially
 * - a factor of 4 is used to avoid getting a too small (0) result; the exact
 *   factor doesn't matter because the EWMA decreases exponentially
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma;

	ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}

#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */

static void blk_mq_handle_dev_resource(struct request *rq,
				       struct list_head *list)
{
	list_add(&rq->queuelist, list);
	__blk_mq_requeue_request(rq);
}

enum prep_dispatch {
	PREP_DISPATCH_OK,
	PREP_DISPATCH_NO_TAG,
	PREP_DISPATCH_NO_BUDGET,
};

static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
						  bool need_budget)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	int budget_token = -1;

	if (need_budget) {
		budget_token = blk_mq_get_dispatch_budget(rq->q);
		if (budget_token < 0) {
			blk_mq_put_driver_tag(rq);
			return PREP_DISPATCH_NO_BUDGET;
		}
		blk_mq_set_rq_budget_token(rq, budget_token);
	}

	if (!blk_mq_get_driver_tag(rq)) {
		/*
		 * The initial allocation attempt failed, so we need to
		 * rerun the hardware queue when a tag is freed. The
		 * waitqueue takes care of that. If the queue is run
		 * before we add this entry back on the dispatch list,
		 * we'll re-run it below.
2051 */ 2052 if (!blk_mq_mark_tag_wait(hctx, rq)) { 2053 /* 2054 * All budgets not got from this function will be put 2055 * together during handling partial dispatch 2056 */ 2057 if (need_budget) 2058 blk_mq_put_dispatch_budget(rq->q, budget_token); 2059 return PREP_DISPATCH_NO_TAG; 2060 } 2061 } 2062 2063 return PREP_DISPATCH_OK; 2064 } 2065 2066 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ 2067 static void blk_mq_release_budgets(struct request_queue *q, 2068 struct list_head *list) 2069 { 2070 struct request *rq; 2071 2072 list_for_each_entry(rq, list, queuelist) { 2073 int budget_token = blk_mq_get_rq_budget_token(rq); 2074 2075 if (budget_token >= 0) 2076 blk_mq_put_dispatch_budget(q, budget_token); 2077 } 2078 } 2079 2080 /* 2081 * blk_mq_commit_rqs will notify driver using bd->last that there is no 2082 * more requests. (See comment in struct blk_mq_ops for commit_rqs for 2083 * details) 2084 * Attention, we should explicitly call this in unusual cases: 2085 * 1) did not queue everything initially scheduled to queue 2086 * 2) the last attempt to queue a request failed 2087 */ 2088 static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, 2089 bool from_schedule) 2090 { 2091 if (hctx->queue->mq_ops->commit_rqs && queued) { 2092 trace_block_unplug(hctx->queue, queued, !from_schedule); 2093 hctx->queue->mq_ops->commit_rqs(hctx); 2094 } 2095 } 2096 2097 /* 2098 * Returns true if we did some work AND can potentially do more. 2099 */ 2100 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, 2101 bool get_budget) 2102 { 2103 enum prep_dispatch prep; 2104 struct request_queue *q = hctx->queue; 2105 struct request *rq; 2106 int queued; 2107 blk_status_t ret = BLK_STS_OK; 2108 bool needs_resource = false; 2109 2110 if (list_empty(list)) 2111 return false; 2112 2113 /* 2114 * Now process all the entries, sending them to the driver. 2115 */ 2116 queued = 0; 2117 do { 2118 struct blk_mq_queue_data bd; 2119 2120 rq = list_first_entry(list, struct request, queuelist); 2121 2122 WARN_ON_ONCE(hctx != rq->mq_hctx); 2123 prep = blk_mq_prep_dispatch_rq(rq, get_budget); 2124 if (prep != PREP_DISPATCH_OK) 2125 break; 2126 2127 list_del_init(&rq->queuelist); 2128 2129 bd.rq = rq; 2130 bd.last = list_empty(list); 2131 2132 ret = q->mq_ops->queue_rq(hctx, &bd); 2133 switch (ret) { 2134 case BLK_STS_OK: 2135 queued++; 2136 break; 2137 case BLK_STS_RESOURCE: 2138 needs_resource = true; 2139 fallthrough; 2140 case BLK_STS_DEV_RESOURCE: 2141 blk_mq_handle_dev_resource(rq, list); 2142 goto out; 2143 default: 2144 blk_mq_end_request(rq, ret); 2145 } 2146 } while (!list_empty(list)); 2147 out: 2148 /* If we didn't flush the entire list, we could have told the driver 2149 * there was more coming, but that turned out to be a lie. 2150 */ 2151 if (!list_empty(list) || ret != BLK_STS_OK) 2152 blk_mq_commit_rqs(hctx, queued, false); 2153 2154 /* 2155 * Any items that need requeuing? Stuff them into hctx->dispatch, 2156 * that is where we will continue on next queue run. 2157 */ 2158 if (!list_empty(list)) { 2159 bool needs_restart; 2160 /* For non-shared tags, the RESTART check will suffice */ 2161 bool no_tag = prep == PREP_DISPATCH_NO_TAG && 2162 ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || 2163 blk_mq_is_shared_tags(hctx->flags)); 2164 2165 /* 2166 * If the caller allocated budgets, free the budgets of the 2167 * requests that have not yet been passed to the block driver. 
2168 */ 2169 if (!get_budget) 2170 blk_mq_release_budgets(q, list); 2171 2172 spin_lock(&hctx->lock); 2173 list_splice_tail_init(list, &hctx->dispatch); 2174 spin_unlock(&hctx->lock); 2175 2176 /* 2177 * Order adding requests to hctx->dispatch and checking 2178 * SCHED_RESTART flag. The pair of this smp_mb() is the one 2179 * in blk_mq_sched_restart(). Avoid restart code path to 2180 * miss the new added requests to hctx->dispatch, meantime 2181 * SCHED_RESTART is observed here. 2182 */ 2183 smp_mb(); 2184 2185 /* 2186 * If SCHED_RESTART was set by the caller of this function and 2187 * it is no longer set that means that it was cleared by another 2188 * thread and hence that a queue rerun is needed. 2189 * 2190 * If 'no_tag' is set, that means that we failed getting 2191 * a driver tag with an I/O scheduler attached. If our dispatch 2192 * waitqueue is no longer active, ensure that we run the queue 2193 * AFTER adding our entries back to the list. 2194 * 2195 * If no I/O scheduler has been configured it is possible that 2196 * the hardware queue got stopped and restarted before requests 2197 * were pushed back onto the dispatch list. Rerun the queue to 2198 * avoid starvation. Notes: 2199 * - blk_mq_run_hw_queue() checks whether or not a queue has 2200 * been stopped before rerunning a queue. 2201 * - Some but not all block drivers stop a queue before 2202 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 2203 * and dm-rq. 2204 * 2205 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 2206 * bit is set, run queue after a delay to avoid IO stalls 2207 * that could otherwise occur if the queue is idle. We'll do 2208 * similar if we couldn't get budget or couldn't lock a zone 2209 * and SCHED_RESTART is set. 2210 */ 2211 needs_restart = blk_mq_sched_needs_restart(hctx); 2212 if (prep == PREP_DISPATCH_NO_BUDGET) 2213 needs_resource = true; 2214 if (!needs_restart || 2215 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 2216 blk_mq_run_hw_queue(hctx, true); 2217 else if (needs_resource) 2218 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 2219 2220 blk_mq_update_dispatch_busy(hctx, true); 2221 return false; 2222 } 2223 2224 blk_mq_update_dispatch_busy(hctx, false); 2225 return true; 2226 } 2227 2228 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) 2229 { 2230 int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); 2231 2232 if (cpu >= nr_cpu_ids) 2233 cpu = cpumask_first(hctx->cpumask); 2234 return cpu; 2235 } 2236 2237 /* 2238 * ->next_cpu is always calculated from hctx->cpumask, so simply use 2239 * it for speeding up the check 2240 */ 2241 static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) 2242 { 2243 return hctx->next_cpu >= nr_cpu_ids; 2244 } 2245 2246 /* 2247 * It'd be great if the workqueue API had a way to pass 2248 * in a mask and had some smarts for more clever placement. 2249 * For now we just round-robin here, switching for every 2250 * BLK_MQ_CPU_WORK_BATCH queued items. 
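 *
 * For example, with hctx->cpumask = { 2, 5, 7 } the run_work stays on
 * CPU 2 for BLK_MQ_CPU_WORK_BATCH invocations, then moves to CPU 5,
 * then CPU 7, and wraps back to CPU 2; offline CPUs are skipped, and if
 * no mapped CPU is online the work goes to WORK_CPU_UNBOUND.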
2251 */ 2252 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 2253 { 2254 bool tried = false; 2255 int next_cpu = hctx->next_cpu; 2256 2257 /* Switch to unbound if no allowable CPUs in this hctx */ 2258 if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) 2259 return WORK_CPU_UNBOUND; 2260 2261 if (--hctx->next_cpu_batch <= 0) { 2262 select_cpu: 2263 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, 2264 cpu_online_mask); 2265 if (next_cpu >= nr_cpu_ids) 2266 next_cpu = blk_mq_first_mapped_cpu(hctx); 2267 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2268 } 2269 2270 /* 2271 * Do unbound schedule if we can't find a online CPU for this hctx, 2272 * and it should only happen in the path of handling CPU DEAD. 2273 */ 2274 if (!cpu_online(next_cpu)) { 2275 if (!tried) { 2276 tried = true; 2277 goto select_cpu; 2278 } 2279 2280 /* 2281 * Make sure to re-select CPU next time once after CPUs 2282 * in hctx->cpumask become online again. 2283 */ 2284 hctx->next_cpu = next_cpu; 2285 hctx->next_cpu_batch = 1; 2286 return WORK_CPU_UNBOUND; 2287 } 2288 2289 hctx->next_cpu = next_cpu; 2290 return next_cpu; 2291 } 2292 2293 /** 2294 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. 2295 * @hctx: Pointer to the hardware queue to run. 2296 * @msecs: Milliseconds of delay to wait before running the queue. 2297 * 2298 * Run a hardware queue asynchronously with a delay of @msecs. 2299 */ 2300 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 2301 { 2302 if (unlikely(blk_mq_hctx_stopped(hctx))) 2303 return; 2304 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 2305 msecs_to_jiffies(msecs)); 2306 } 2307 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 2308 2309 static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) 2310 { 2311 bool need_run; 2312 2313 /* 2314 * When queue is quiesced, we may be switching io scheduler, or 2315 * updating nr_hw_queues, or other things, and we can't run queue 2316 * any more, even blk_mq_hctx_has_pending() can't be called safely. 2317 * 2318 * And queue will be rerun in blk_mq_unquiesce_queue() if it is 2319 * quiesced. 2320 */ 2321 __blk_mq_run_dispatch_ops(hctx->queue, false, 2322 need_run = !blk_queue_quiesced(hctx->queue) && 2323 blk_mq_hctx_has_pending(hctx)); 2324 return need_run; 2325 } 2326 2327 /** 2328 * blk_mq_run_hw_queue - Start to run a hardware queue. 2329 * @hctx: Pointer to the hardware queue to run. 2330 * @async: If we want to run the queue asynchronously. 2331 * 2332 * Check if the request queue is not in a quiesced state and if there are 2333 * pending requests to be sent. If this is true, run the queue to send requests 2334 * to hardware. 2335 */ 2336 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 2337 { 2338 bool need_run; 2339 2340 /* 2341 * We can't run the queue inline with interrupts disabled. 2342 */ 2343 WARN_ON_ONCE(!async && in_interrupt()); 2344 2345 might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); 2346 2347 need_run = blk_mq_hw_queue_need_run(hctx); 2348 if (!need_run) { 2349 unsigned long flags; 2350 2351 /* 2352 * Synchronize with blk_mq_unquiesce_queue(), because we check 2353 * if hw queue is quiesced locklessly above, we need the use 2354 * ->queue_lock to make sure we see the up-to-date status to 2355 * not miss rerunning the hw queue. 
2356 */ 2357 spin_lock_irqsave(&hctx->queue->queue_lock, flags); 2358 need_run = blk_mq_hw_queue_need_run(hctx); 2359 spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); 2360 2361 if (!need_run) 2362 return; 2363 } 2364 2365 if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { 2366 blk_mq_delay_run_hw_queue(hctx, 0); 2367 return; 2368 } 2369 2370 blk_mq_run_dispatch_ops(hctx->queue, 2371 blk_mq_sched_dispatch_requests(hctx)); 2372 } 2373 EXPORT_SYMBOL(blk_mq_run_hw_queue); 2374 2375 /* 2376 * Return prefered queue to dispatch from (if any) for non-mq aware IO 2377 * scheduler. 2378 */ 2379 static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) 2380 { 2381 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 2382 /* 2383 * If the IO scheduler does not respect hardware queues when 2384 * dispatching, we just don't bother with multiple HW queues and 2385 * dispatch from hctx for the current CPU since running multiple queues 2386 * just causes lock contention inside the scheduler and pointless cache 2387 * bouncing. 2388 */ 2389 struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; 2390 2391 if (!blk_mq_hctx_stopped(hctx)) 2392 return hctx; 2393 return NULL; 2394 } 2395 2396 /** 2397 * blk_mq_run_hw_queues - Run all hardware queues in a request queue. 2398 * @q: Pointer to the request queue to run. 2399 * @async: If we want to run the queue asynchronously. 2400 */ 2401 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 2402 { 2403 struct blk_mq_hw_ctx *hctx, *sq_hctx; 2404 unsigned long i; 2405 2406 sq_hctx = NULL; 2407 if (blk_queue_sq_sched(q)) 2408 sq_hctx = blk_mq_get_sq_hctx(q); 2409 queue_for_each_hw_ctx(q, hctx, i) { 2410 if (blk_mq_hctx_stopped(hctx)) 2411 continue; 2412 /* 2413 * Dispatch from this hctx either if there's no hctx preferred 2414 * by IO scheduler or if it has requests that bypass the 2415 * scheduler. 2416 */ 2417 if (!sq_hctx || sq_hctx == hctx || 2418 !list_empty_careful(&hctx->dispatch)) 2419 blk_mq_run_hw_queue(hctx, async); 2420 } 2421 } 2422 EXPORT_SYMBOL(blk_mq_run_hw_queues); 2423 2424 /** 2425 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. 2426 * @q: Pointer to the request queue to run. 2427 * @msecs: Milliseconds of delay to wait before running the queues. 2428 */ 2429 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) 2430 { 2431 struct blk_mq_hw_ctx *hctx, *sq_hctx; 2432 unsigned long i; 2433 2434 sq_hctx = NULL; 2435 if (blk_queue_sq_sched(q)) 2436 sq_hctx = blk_mq_get_sq_hctx(q); 2437 queue_for_each_hw_ctx(q, hctx, i) { 2438 if (blk_mq_hctx_stopped(hctx)) 2439 continue; 2440 /* 2441 * If there is already a run_work pending, leave the 2442 * pending delay untouched. Otherwise, a hctx can stall 2443 * if another hctx is re-delaying the other's work 2444 * before the work executes. 2445 */ 2446 if (delayed_work_pending(&hctx->run_work)) 2447 continue; 2448 /* 2449 * Dispatch from this hctx either if there's no hctx preferred 2450 * by IO scheduler or if it has requests that bypass the 2451 * scheduler. 2452 */ 2453 if (!sq_hctx || sq_hctx == hctx || 2454 !list_empty_careful(&hctx->dispatch)) 2455 blk_mq_delay_run_hw_queue(hctx, msecs); 2456 } 2457 } 2458 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); 2459 2460 /* 2461 * This function is often used for pausing .queue_rq() by driver when 2462 * there isn't enough resource or some conditions aren't satisfied, and 2463 * BLK_STS_RESOURCE is usually returned. 
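 *
 * An illustrative ->queue_rq() sketch (out_of_driver_resources stands
 * in for a driver-specific check):
 *
 *	if (out_of_driver_resources) {
 *		blk_mq_stop_hw_queue(hctx);
 *		return BLK_STS_RESOURCE;
 *	}
 *
 * with the queue typically restarted from the driver's completion or
 * resource-replenish path, e.g. via blk_mq_start_stopped_hw_queues().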
2464 * 2465 * We do not guarantee that dispatch can be drained or blocked 2466 * after blk_mq_stop_hw_queue() returns. Please use 2467 * blk_mq_quiesce_queue() for that requirement. 2468 */ 2469 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 2470 { 2471 cancel_delayed_work(&hctx->run_work); 2472 2473 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 2474 } 2475 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 2476 2477 /* 2478 * This function is often used for pausing .queue_rq() by driver when 2479 * there isn't enough resource or some conditions aren't satisfied, and 2480 * BLK_STS_RESOURCE is usually returned. 2481 * 2482 * We do not guarantee that dispatch can be drained or blocked 2483 * after blk_mq_stop_hw_queues() returns. Please use 2484 * blk_mq_quiesce_queue() for that requirement. 2485 */ 2486 void blk_mq_stop_hw_queues(struct request_queue *q) 2487 { 2488 struct blk_mq_hw_ctx *hctx; 2489 unsigned long i; 2490 2491 queue_for_each_hw_ctx(q, hctx, i) 2492 blk_mq_stop_hw_queue(hctx); 2493 } 2494 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 2495 2496 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 2497 { 2498 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2499 2500 blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 2501 } 2502 EXPORT_SYMBOL(blk_mq_start_hw_queue); 2503 2504 void blk_mq_start_hw_queues(struct request_queue *q) 2505 { 2506 struct blk_mq_hw_ctx *hctx; 2507 unsigned long i; 2508 2509 queue_for_each_hw_ctx(q, hctx, i) 2510 blk_mq_start_hw_queue(hctx); 2511 } 2512 EXPORT_SYMBOL(blk_mq_start_hw_queues); 2513 2514 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 2515 { 2516 if (!blk_mq_hctx_stopped(hctx)) 2517 return; 2518 2519 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2520 /* 2521 * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the 2522 * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch 2523 * list in the subsequent routine. 2524 */ 2525 smp_mb__after_atomic(); 2526 blk_mq_run_hw_queue(hctx, async); 2527 } 2528 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 2529 2530 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 2531 { 2532 struct blk_mq_hw_ctx *hctx; 2533 unsigned long i; 2534 2535 queue_for_each_hw_ctx(q, hctx, i) 2536 blk_mq_start_stopped_hw_queue(hctx, async || 2537 (hctx->flags & BLK_MQ_F_BLOCKING)); 2538 } 2539 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 2540 2541 static void blk_mq_run_work_fn(struct work_struct *work) 2542 { 2543 struct blk_mq_hw_ctx *hctx = 2544 container_of(work, struct blk_mq_hw_ctx, run_work.work); 2545 2546 blk_mq_run_dispatch_ops(hctx->queue, 2547 blk_mq_sched_dispatch_requests(hctx)); 2548 } 2549 2550 /** 2551 * blk_mq_request_bypass_insert - Insert a request at dispatch list. 2552 * @rq: Pointer to request to be inserted. 2553 * @flags: BLK_MQ_INSERT_* 2554 * 2555 * Should only be used carefully, when the caller knows we want to 2556 * bypass a potential IO scheduler on the target device. 
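 *
 * Current users include passthrough and flush request insertion (see
 * blk_mq_insert_request()) and putting a request back on the dispatch
 * list after a failed direct issue.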
2557 */ 2558 static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) 2559 { 2560 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2561 2562 spin_lock(&hctx->lock); 2563 if (flags & BLK_MQ_INSERT_AT_HEAD) 2564 list_add(&rq->queuelist, &hctx->dispatch); 2565 else 2566 list_add_tail(&rq->queuelist, &hctx->dispatch); 2567 spin_unlock(&hctx->lock); 2568 } 2569 2570 static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, 2571 struct blk_mq_ctx *ctx, struct list_head *list, 2572 bool run_queue_async) 2573 { 2574 struct request *rq; 2575 enum hctx_type type = hctx->type; 2576 2577 /* 2578 * Try to issue requests directly if the hw queue isn't busy to save an 2579 * extra enqueue & dequeue to the sw queue. 2580 */ 2581 if (!hctx->dispatch_busy && !run_queue_async) { 2582 blk_mq_run_dispatch_ops(hctx->queue, 2583 blk_mq_try_issue_list_directly(hctx, list)); 2584 if (list_empty(list)) 2585 goto out; 2586 } 2587 2588 /* 2589 * preemption doesn't flush plug list, so it's possible ctx->cpu is 2590 * offline now 2591 */ 2592 list_for_each_entry(rq, list, queuelist) { 2593 BUG_ON(rq->mq_ctx != ctx); 2594 trace_block_rq_insert(rq); 2595 if (rq->cmd_flags & REQ_NOWAIT) 2596 run_queue_async = true; 2597 } 2598 2599 spin_lock(&ctx->lock); 2600 list_splice_tail_init(list, &ctx->rq_lists[type]); 2601 blk_mq_hctx_mark_pending(hctx, ctx); 2602 spin_unlock(&ctx->lock); 2603 out: 2604 blk_mq_run_hw_queue(hctx, run_queue_async); 2605 } 2606 2607 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) 2608 { 2609 struct request_queue *q = rq->q; 2610 struct blk_mq_ctx *ctx = rq->mq_ctx; 2611 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2612 2613 if (blk_rq_is_passthrough(rq)) { 2614 /* 2615 * Passthrough request have to be added to hctx->dispatch 2616 * directly. The device may be in a situation where it can't 2617 * handle FS request, and always returns BLK_STS_RESOURCE for 2618 * them, which gets them added to hctx->dispatch. 2619 * 2620 * If a passthrough request is required to unblock the queues, 2621 * and it is added to the scheduler queue, there is no chance to 2622 * dispatch it given we prioritize requests in hctx->dispatch. 2623 */ 2624 blk_mq_request_bypass_insert(rq, flags); 2625 } else if (req_op(rq) == REQ_OP_FLUSH) { 2626 /* 2627 * Firstly normal IO request is inserted to scheduler queue or 2628 * sw queue, meantime we add flush request to dispatch queue( 2629 * hctx->dispatch) directly and there is at most one in-flight 2630 * flush request for each hw queue, so it doesn't matter to add 2631 * flush request to tail or front of the dispatch queue. 2632 * 2633 * Secondly in case of NCQ, flush request belongs to non-NCQ 2634 * command, and queueing it will fail when there is any 2635 * in-flight normal IO request(NCQ command). When adding flush 2636 * rq to the front of hctx->dispatch, it is easier to introduce 2637 * extra time to flush rq's latency because of S_SCHED_RESTART 2638 * compared with adding to the tail of dispatch queue, then 2639 * chance of flush merge is increased, and less flush requests 2640 * will be issued to controller. It is observed that ~10% time 2641 * is saved in blktests block/004 on disk attached to AHCI/NCQ 2642 * drive when adding flush rq to the front of hctx->dispatch. 2643 * 2644 * Simply queue flush rq to the front of hctx->dispatch so that 2645 * intensive flush workloads can benefit in case of NCQ HW. 
2646 */ 2647 blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); 2648 } else if (q->elevator) { 2649 LIST_HEAD(list); 2650 2651 WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); 2652 2653 list_add(&rq->queuelist, &list); 2654 q->elevator->type->ops.insert_requests(hctx, &list, flags); 2655 } else { 2656 trace_block_rq_insert(rq); 2657 2658 spin_lock(&ctx->lock); 2659 if (flags & BLK_MQ_INSERT_AT_HEAD) 2660 list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); 2661 else 2662 list_add_tail(&rq->queuelist, 2663 &ctx->rq_lists[hctx->type]); 2664 blk_mq_hctx_mark_pending(hctx, ctx); 2665 spin_unlock(&ctx->lock); 2666 } 2667 } 2668 2669 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, 2670 unsigned int nr_segs) 2671 { 2672 int err; 2673 2674 if (bio->bi_opf & REQ_RAHEAD) 2675 rq->cmd_flags |= REQ_FAILFAST_MASK; 2676 2677 rq->bio = rq->biotail = bio; 2678 rq->__sector = bio->bi_iter.bi_sector; 2679 rq->__data_len = bio->bi_iter.bi_size; 2680 rq->phys_gap_bit = bio->bi_bvec_gap_bit; 2681 2682 rq->nr_phys_segments = nr_segs; 2683 if (bio_integrity(bio)) 2684 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, 2685 bio); 2686 2687 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ 2688 err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); 2689 WARN_ON_ONCE(err); 2690 2691 blk_account_io_start(rq); 2692 } 2693 2694 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 2695 struct request *rq, bool last) 2696 { 2697 struct request_queue *q = rq->q; 2698 struct blk_mq_queue_data bd = { 2699 .rq = rq, 2700 .last = last, 2701 }; 2702 blk_status_t ret; 2703 2704 /* 2705 * For OK queue, we are done. For error, caller may kill it. 2706 * Any other error (busy), just add it to our list as we 2707 * previously would have done. 2708 */ 2709 ret = q->mq_ops->queue_rq(hctx, &bd); 2710 switch (ret) { 2711 case BLK_STS_OK: 2712 blk_mq_update_dispatch_busy(hctx, false); 2713 break; 2714 case BLK_STS_RESOURCE: 2715 case BLK_STS_DEV_RESOURCE: 2716 blk_mq_update_dispatch_busy(hctx, true); 2717 __blk_mq_requeue_request(rq); 2718 break; 2719 default: 2720 blk_mq_update_dispatch_busy(hctx, false); 2721 break; 2722 } 2723 2724 return ret; 2725 } 2726 2727 static bool blk_mq_get_budget_and_tag(struct request *rq) 2728 { 2729 int budget_token; 2730 2731 budget_token = blk_mq_get_dispatch_budget(rq->q); 2732 if (budget_token < 0) 2733 return false; 2734 blk_mq_set_rq_budget_token(rq, budget_token); 2735 if (!blk_mq_get_driver_tag(rq)) { 2736 blk_mq_put_dispatch_budget(rq->q, budget_token); 2737 return false; 2738 } 2739 return true; 2740 } 2741 2742 /** 2743 * blk_mq_try_issue_directly - Try to send a request directly to device driver. 2744 * @hctx: Pointer of the associated hardware queue. 2745 * @rq: Pointer to request to be sent. 2746 * 2747 * If the device has enough resources to accept a new request now, send the 2748 * request directly to device driver. Else, insert at hctx->dispatch queue, so 2749 * we can try send it another time in the future. Requests inserted at this 2750 * queue have higher priority. 
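 *
 * Note that a stopped or quiesced queue, an RQF_USE_SCHED request, or a
 * failed budget/tag allocation falls back to blk_mq_insert_request()
 * rather than the dispatch list; only a BLK_STS_RESOURCE/DEV_RESOURCE
 * return from the driver ends up in hctx->dispatch here.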
2751 */ 2752 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 2753 struct request *rq) 2754 { 2755 blk_status_t ret; 2756 2757 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2758 blk_mq_insert_request(rq, 0); 2759 blk_mq_run_hw_queue(hctx, false); 2760 return; 2761 } 2762 2763 if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { 2764 blk_mq_insert_request(rq, 0); 2765 blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); 2766 return; 2767 } 2768 2769 ret = __blk_mq_issue_directly(hctx, rq, true); 2770 switch (ret) { 2771 case BLK_STS_OK: 2772 break; 2773 case BLK_STS_RESOURCE: 2774 case BLK_STS_DEV_RESOURCE: 2775 blk_mq_request_bypass_insert(rq, 0); 2776 blk_mq_run_hw_queue(hctx, false); 2777 break; 2778 default: 2779 blk_mq_end_request(rq, ret); 2780 break; 2781 } 2782 } 2783 2784 static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) 2785 { 2786 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2787 2788 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2789 blk_mq_insert_request(rq, 0); 2790 blk_mq_run_hw_queue(hctx, false); 2791 return BLK_STS_OK; 2792 } 2793 2794 if (!blk_mq_get_budget_and_tag(rq)) 2795 return BLK_STS_RESOURCE; 2796 return __blk_mq_issue_directly(hctx, rq, last); 2797 } 2798 2799 static void blk_mq_issue_direct(struct rq_list *rqs) 2800 { 2801 struct blk_mq_hw_ctx *hctx = NULL; 2802 struct request *rq; 2803 int queued = 0; 2804 blk_status_t ret = BLK_STS_OK; 2805 2806 while ((rq = rq_list_pop(rqs))) { 2807 bool last = rq_list_empty(rqs); 2808 2809 if (hctx != rq->mq_hctx) { 2810 if (hctx) { 2811 blk_mq_commit_rqs(hctx, queued, false); 2812 queued = 0; 2813 } 2814 hctx = rq->mq_hctx; 2815 } 2816 2817 ret = blk_mq_request_issue_directly(rq, last); 2818 switch (ret) { 2819 case BLK_STS_OK: 2820 queued++; 2821 break; 2822 case BLK_STS_RESOURCE: 2823 case BLK_STS_DEV_RESOURCE: 2824 blk_mq_request_bypass_insert(rq, 0); 2825 blk_mq_run_hw_queue(hctx, false); 2826 goto out; 2827 default: 2828 blk_mq_end_request(rq, ret); 2829 break; 2830 } 2831 } 2832 2833 out: 2834 if (ret != BLK_STS_OK) 2835 blk_mq_commit_rqs(hctx, queued, false); 2836 } 2837 2838 static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) 2839 { 2840 if (blk_queue_quiesced(q)) 2841 return; 2842 q->mq_ops->queue_rqs(rqs); 2843 } 2844 2845 static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, 2846 struct rq_list *queue_rqs) 2847 { 2848 struct request *rq = rq_list_pop(rqs); 2849 struct request_queue *this_q = rq->q; 2850 struct request **prev = &rqs->head; 2851 struct rq_list matched_rqs = {}; 2852 struct request *last = NULL; 2853 unsigned depth = 1; 2854 2855 rq_list_add_tail(&matched_rqs, rq); 2856 while ((rq = *prev)) { 2857 if (rq->q == this_q) { 2858 /* move rq from rqs to matched_rqs */ 2859 *prev = rq->rq_next; 2860 rq_list_add_tail(&matched_rqs, rq); 2861 depth++; 2862 } else { 2863 /* leave rq in rqs */ 2864 prev = &rq->rq_next; 2865 last = rq; 2866 } 2867 } 2868 2869 rqs->tail = last; 2870 *queue_rqs = matched_rqs; 2871 return depth; 2872 } 2873 2874 static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) 2875 { 2876 struct request_queue *q = rq_list_peek(rqs)->q; 2877 2878 trace_block_unplug(q, depth, true); 2879 2880 /* 2881 * Peek first request and see if we have a ->queue_rqs() hook. 2882 * If we do, we can dispatch the whole list in one go. 
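	 * Requests that ->queue_rqs() does not consume (for example
	 * because the driver ran out of resources) are issued one at a
	 * time below.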
2883 * We already know at this point that all requests belong to the 2884 * same queue, caller must ensure that's the case. 2885 */ 2886 if (q->mq_ops->queue_rqs) { 2887 blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); 2888 if (rq_list_empty(rqs)) 2889 return; 2890 } 2891 2892 blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); 2893 } 2894 2895 static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) 2896 { 2897 struct blk_mq_hw_ctx *this_hctx = NULL; 2898 struct blk_mq_ctx *this_ctx = NULL; 2899 struct rq_list requeue_list = {}; 2900 unsigned int depth = 0; 2901 bool is_passthrough = false; 2902 LIST_HEAD(list); 2903 2904 do { 2905 struct request *rq = rq_list_pop(rqs); 2906 2907 if (!this_hctx) { 2908 this_hctx = rq->mq_hctx; 2909 this_ctx = rq->mq_ctx; 2910 is_passthrough = blk_rq_is_passthrough(rq); 2911 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || 2912 is_passthrough != blk_rq_is_passthrough(rq)) { 2913 rq_list_add_tail(&requeue_list, rq); 2914 continue; 2915 } 2916 list_add_tail(&rq->queuelist, &list); 2917 depth++; 2918 } while (!rq_list_empty(rqs)); 2919 2920 *rqs = requeue_list; 2921 trace_block_unplug(this_hctx->queue, depth, !from_sched); 2922 2923 percpu_ref_get(&this_hctx->queue->q_usage_counter); 2924 /* passthrough requests should never be issued to the I/O scheduler */ 2925 if (is_passthrough) { 2926 spin_lock(&this_hctx->lock); 2927 list_splice_tail_init(&list, &this_hctx->dispatch); 2928 spin_unlock(&this_hctx->lock); 2929 blk_mq_run_hw_queue(this_hctx, from_sched); 2930 } else if (this_hctx->queue->elevator) { 2931 this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, 2932 &list, 0); 2933 blk_mq_run_hw_queue(this_hctx, from_sched); 2934 } else { 2935 blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); 2936 } 2937 percpu_ref_put(&this_hctx->queue->q_usage_counter); 2938 } 2939 2940 static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) 2941 { 2942 do { 2943 struct rq_list queue_rqs; 2944 unsigned depth; 2945 2946 depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); 2947 blk_mq_dispatch_queue_requests(&queue_rqs, depth); 2948 while (!rq_list_empty(&queue_rqs)) 2949 blk_mq_dispatch_list(&queue_rqs, false); 2950 } while (!rq_list_empty(rqs)); 2951 } 2952 2953 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2954 { 2955 unsigned int depth; 2956 2957 /* 2958 * We may have been called recursively midway through handling 2959 * plug->mq_list via a schedule() in the driver's queue_rq() callback. 2960 * To avoid mq_list changing under our feet, clear rq_count early and 2961 * bail out specifically if rq_count is 0 rather than checking 2962 * whether the mq_list is empty. 
2963 */ 2964 if (plug->rq_count == 0) 2965 return; 2966 depth = plug->rq_count; 2967 plug->rq_count = 0; 2968 2969 if (!plug->has_elevator && !from_schedule) { 2970 if (plug->multiple_queues) { 2971 blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); 2972 return; 2973 } 2974 2975 blk_mq_dispatch_queue_requests(&plug->mq_list, depth); 2976 if (rq_list_empty(&plug->mq_list)) 2977 return; 2978 } 2979 2980 do { 2981 blk_mq_dispatch_list(&plug->mq_list, from_schedule); 2982 } while (!rq_list_empty(&plug->mq_list)); 2983 } 2984 2985 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 2986 struct list_head *list) 2987 { 2988 int queued = 0; 2989 blk_status_t ret = BLK_STS_OK; 2990 2991 while (!list_empty(list)) { 2992 struct request *rq = list_first_entry(list, struct request, 2993 queuelist); 2994 2995 list_del_init(&rq->queuelist); 2996 ret = blk_mq_request_issue_directly(rq, list_empty(list)); 2997 switch (ret) { 2998 case BLK_STS_OK: 2999 queued++; 3000 break; 3001 case BLK_STS_RESOURCE: 3002 case BLK_STS_DEV_RESOURCE: 3003 blk_mq_request_bypass_insert(rq, 0); 3004 if (list_empty(list)) 3005 blk_mq_run_hw_queue(hctx, false); 3006 goto out; 3007 default: 3008 blk_mq_end_request(rq, ret); 3009 break; 3010 } 3011 } 3012 3013 out: 3014 if (ret != BLK_STS_OK) 3015 blk_mq_commit_rqs(hctx, queued, false); 3016 } 3017 3018 static bool blk_mq_attempt_bio_merge(struct request_queue *q, 3019 struct bio *bio, unsigned int nr_segs) 3020 { 3021 if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { 3022 if (blk_attempt_plug_merge(q, bio, nr_segs)) 3023 return true; 3024 if (blk_mq_sched_bio_merge(q, bio, nr_segs)) 3025 return true; 3026 } 3027 return false; 3028 } 3029 3030 static struct request *blk_mq_get_new_requests(struct request_queue *q, 3031 struct blk_plug *plug, 3032 struct bio *bio) 3033 { 3034 struct blk_mq_alloc_data data = { 3035 .q = q, 3036 .flags = 0, 3037 .shallow_depth = 0, 3038 .cmd_flags = bio->bi_opf, 3039 .rq_flags = 0, 3040 .nr_tags = 1, 3041 .cached_rqs = NULL, 3042 .ctx = NULL, 3043 .hctx = NULL 3044 }; 3045 struct request *rq; 3046 3047 rq_qos_throttle(q, bio); 3048 3049 if (plug) { 3050 data.nr_tags = plug->nr_ios; 3051 plug->nr_ios = 1; 3052 data.cached_rqs = &plug->cached_rqs; 3053 } 3054 3055 rq = __blk_mq_alloc_requests(&data); 3056 if (unlikely(!rq)) 3057 rq_qos_cleanup(q, bio); 3058 return rq; 3059 } 3060 3061 /* 3062 * Check if there is a suitable cached request and return it. 3063 */ 3064 static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, 3065 struct request_queue *q, blk_opf_t opf) 3066 { 3067 enum hctx_type type = blk_mq_get_hctx_type(opf); 3068 struct request *rq; 3069 3070 if (!plug) 3071 return NULL; 3072 rq = rq_list_peek(&plug->cached_rqs); 3073 if (!rq || rq->q != q) 3074 return NULL; 3075 if (type != rq->mq_hctx->type && 3076 (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) 3077 return NULL; 3078 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) 3079 return NULL; 3080 return rq; 3081 } 3082 3083 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, 3084 struct bio *bio) 3085 { 3086 if (rq_list_pop(&plug->cached_rqs) != rq) 3087 WARN_ON_ONCE(1); 3088 3089 /* 3090 * If any qos ->throttle() end up blocking, we will have flushed the 3091 * plug and hence killed the cached_rq list as well. Pop this entry 3092 * before we throttle. 
3093 */ 3094 rq_qos_throttle(rq->q, bio); 3095 3096 blk_mq_rq_time_init(rq, blk_time_get_ns()); 3097 rq->cmd_flags = bio->bi_opf; 3098 INIT_LIST_HEAD(&rq->queuelist); 3099 } 3100 3101 static bool bio_unaligned(const struct bio *bio, struct request_queue *q) 3102 { 3103 unsigned int bs_mask = queue_logical_block_size(q) - 1; 3104 3105 /* .bi_sector of any zero sized bio need to be initialized */ 3106 if ((bio->bi_iter.bi_size & bs_mask) || 3107 ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) 3108 return true; 3109 return false; 3110 } 3111 3112 /** 3113 * blk_mq_submit_bio - Create and send a request to block device. 3114 * @bio: Bio pointer. 3115 * 3116 * Builds up a request structure from @q and @bio and send to the device. The 3117 * request may not be queued directly to hardware if: 3118 * * This request can be merged with another one 3119 * * We want to place request at plug queue for possible future merging 3120 * * There is an IO scheduler active at this queue 3121 * 3122 * It will not queue the request if there is an error with the bio, or at the 3123 * request creation. 3124 */ 3125 void blk_mq_submit_bio(struct bio *bio) 3126 { 3127 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 3128 struct blk_plug *plug = current->plug; 3129 const int is_sync = op_is_sync(bio->bi_opf); 3130 struct blk_mq_hw_ctx *hctx; 3131 unsigned int nr_segs; 3132 struct request *rq; 3133 blk_status_t ret; 3134 3135 /* 3136 * If the plug has a cached request for this queue, try to use it. 3137 */ 3138 rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); 3139 3140 /* 3141 * A BIO that was released from a zone write plug has already been 3142 * through the preparation in this function, already holds a reference 3143 * on the queue usage counter, and is the only write BIO in-flight for 3144 * the target zone. Go straight to preparing a request for it. 3145 */ 3146 if (bio_zone_write_plugging(bio)) { 3147 nr_segs = bio->__bi_nr_segments; 3148 if (rq) 3149 blk_queue_exit(q); 3150 goto new_request; 3151 } 3152 3153 /* 3154 * The cached request already holds a q_usage_counter reference and we 3155 * don't have to acquire a new one if we use it. 3156 */ 3157 if (!rq) { 3158 if (unlikely(bio_queue_enter(bio))) 3159 return; 3160 } 3161 3162 /* 3163 * Device reconfiguration may change logical block size or reduce the 3164 * number of poll queues, so the checks for alignment and poll support 3165 * have to be done with queue usage counter held. 
3166 */ 3167 if (unlikely(bio_unaligned(bio, q))) { 3168 bio_io_error(bio); 3169 goto queue_exit; 3170 } 3171 3172 if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { 3173 bio->bi_status = BLK_STS_NOTSUPP; 3174 bio_endio(bio); 3175 goto queue_exit; 3176 } 3177 3178 bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 3179 if (!bio) 3180 goto queue_exit; 3181 3182 if (!bio_integrity_prep(bio)) 3183 goto queue_exit; 3184 3185 blk_mq_bio_issue_init(q, bio); 3186 if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) 3187 goto queue_exit; 3188 3189 if (bio_needs_zone_write_plugging(bio)) { 3190 if (blk_zone_plug_bio(bio, nr_segs)) 3191 goto queue_exit; 3192 } 3193 3194 new_request: 3195 if (rq) { 3196 blk_mq_use_cached_rq(rq, plug, bio); 3197 } else { 3198 rq = blk_mq_get_new_requests(q, plug, bio); 3199 if (unlikely(!rq)) { 3200 if (bio->bi_opf & REQ_NOWAIT) 3201 bio_wouldblock_error(bio); 3202 goto queue_exit; 3203 } 3204 } 3205 3206 trace_block_getrq(bio); 3207 3208 rq_qos_track(q, rq, bio); 3209 3210 blk_mq_bio_to_request(rq, bio, nr_segs); 3211 3212 ret = blk_crypto_rq_get_keyslot(rq); 3213 if (ret != BLK_STS_OK) { 3214 bio->bi_status = ret; 3215 bio_endio(bio); 3216 blk_mq_free_request(rq); 3217 return; 3218 } 3219 3220 if (bio_zone_write_plugging(bio)) 3221 blk_zone_write_plug_init_request(rq); 3222 3223 if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) 3224 return; 3225 3226 if (plug) { 3227 blk_add_rq_to_plug(plug, rq); 3228 return; 3229 } 3230 3231 hctx = rq->mq_hctx; 3232 if ((rq->rq_flags & RQF_USE_SCHED) || 3233 (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { 3234 blk_mq_insert_request(rq, 0); 3235 blk_mq_run_hw_queue(hctx, true); 3236 } else { 3237 blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); 3238 } 3239 return; 3240 3241 queue_exit: 3242 /* 3243 * Don't drop the queue reference if we were trying to use a cached 3244 * request and thus didn't acquire one. 3245 */ 3246 if (!rq) 3247 blk_queue_exit(q); 3248 } 3249 3250 #ifdef CONFIG_BLK_MQ_STACKING 3251 /** 3252 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 3253 * @rq: the request being queued 3254 */ 3255 blk_status_t blk_insert_cloned_request(struct request *rq) 3256 { 3257 struct request_queue *q = rq->q; 3258 unsigned int max_sectors = blk_queue_get_max_sectors(rq); 3259 unsigned int max_segments = blk_rq_get_max_segments(rq); 3260 blk_status_t ret; 3261 3262 if (blk_rq_sectors(rq) > max_sectors) { 3263 /* 3264 * SCSI device does not have a good way to return if 3265 * Write Same/Zero is actually supported. If a device rejects 3266 * a non-read/write command (discard, write same,etc.) the 3267 * low-level device driver will set the relevant queue limit to 3268 * 0 to prevent blk-lib from issuing more of the offending 3269 * operations. Commands queued prior to the queue limit being 3270 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O 3271 * errors being propagated to upper layers. 3272 */ 3273 if (max_sectors == 0) 3274 return BLK_STS_NOTSUPP; 3275 3276 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", 3277 __func__, blk_rq_sectors(rq), max_sectors); 3278 return BLK_STS_IOERR; 3279 } 3280 3281 /* 3282 * The queue settings related to segment counting may differ from the 3283 * original queue. 3284 */ 3285 rq->nr_phys_segments = blk_recalc_rq_segments(rq); 3286 if (rq->nr_phys_segments > max_segments) { 3287 printk(KERN_ERR "%s: over max segments limit. 
(%u > %u)\n", 3288 __func__, rq->nr_phys_segments, max_segments); 3289 return BLK_STS_IOERR; 3290 } 3291 3292 if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) 3293 return BLK_STS_IOERR; 3294 3295 ret = blk_crypto_rq_get_keyslot(rq); 3296 if (ret != BLK_STS_OK) 3297 return ret; 3298 3299 blk_account_io_start(rq); 3300 3301 /* 3302 * Since we have a scheduler attached on the top device, 3303 * bypass a potential scheduler on the bottom device for 3304 * insert. 3305 */ 3306 blk_mq_run_dispatch_ops(q, 3307 ret = blk_mq_request_issue_directly(rq, true)); 3308 if (ret) 3309 blk_account_io_done(rq, blk_time_get_ns()); 3310 return ret; 3311 } 3312 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 3313 3314 /** 3315 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request 3316 * @rq: the clone request to be cleaned up 3317 * 3318 * Description: 3319 * Free all bios in @rq for a cloned request. 3320 */ 3321 void blk_rq_unprep_clone(struct request *rq) 3322 { 3323 struct bio *bio; 3324 3325 while ((bio = rq->bio) != NULL) { 3326 rq->bio = bio->bi_next; 3327 3328 bio_put(bio); 3329 } 3330 } 3331 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); 3332 3333 /** 3334 * blk_rq_prep_clone - Helper function to setup clone request 3335 * @rq: the request to be setup 3336 * @rq_src: original request to be cloned 3337 * @bs: bio_set that bios for clone are allocated from 3338 * @gfp_mask: memory allocation mask for bio 3339 * @bio_ctr: setup function to be called for each clone bio. 3340 * Returns %0 for success, non %0 for failure. 3341 * @data: private data to be passed to @bio_ctr 3342 * 3343 * Description: 3344 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 3345 * Also, pages which the original bios are pointing to are not copied 3346 * and the cloned bios just point same pages. 3347 * So cloned bios must be completed before original bios, which means 3348 * the caller must complete @rq before @rq_src. 3349 */ 3350 int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 3351 struct bio_set *bs, gfp_t gfp_mask, 3352 int (*bio_ctr)(struct bio *, struct bio *, void *), 3353 void *data) 3354 { 3355 struct bio *bio_src; 3356 3357 if (!bs) 3358 bs = &fs_bio_set; 3359 3360 __rq_for_each_bio(bio_src, rq_src) { 3361 struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, 3362 gfp_mask, bs); 3363 if (!bio) 3364 goto free_and_out; 3365 3366 if (bio_ctr && bio_ctr(bio, bio_src, data)) { 3367 bio_put(bio); 3368 goto free_and_out; 3369 } 3370 3371 if (rq->bio) { 3372 rq->biotail->bi_next = bio; 3373 rq->biotail = bio; 3374 } else { 3375 rq->bio = rq->biotail = bio; 3376 } 3377 } 3378 3379 /* Copy attributes of the original request to the clone request. */ 3380 rq->__sector = blk_rq_pos(rq_src); 3381 rq->__data_len = blk_rq_bytes(rq_src); 3382 if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { 3383 rq->rq_flags |= RQF_SPECIAL_PAYLOAD; 3384 rq->special_vec = rq_src->special_vec; 3385 } 3386 rq->nr_phys_segments = rq_src->nr_phys_segments; 3387 rq->nr_integrity_segments = rq_src->nr_integrity_segments; 3388 rq->phys_gap_bit = rq_src->phys_gap_bit; 3389 3390 if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) 3391 goto free_and_out; 3392 3393 return 0; 3394 3395 free_and_out: 3396 blk_rq_unprep_clone(rq); 3397 3398 return -ENOMEM; 3399 } 3400 EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 3401 #endif /* CONFIG_BLK_MQ_STACKING */ 3402 3403 /* 3404 * Steal bios from a request and add them to a bio list. 
3405 * The request must not have been partially completed before. 3406 */ 3407 void blk_steal_bios(struct bio_list *list, struct request *rq) 3408 { 3409 if (rq->bio) { 3410 if (list->tail) 3411 list->tail->bi_next = rq->bio; 3412 else 3413 list->head = rq->bio; 3414 list->tail = rq->biotail; 3415 3416 rq->bio = NULL; 3417 rq->biotail = NULL; 3418 } 3419 3420 rq->__data_len = 0; 3421 } 3422 EXPORT_SYMBOL_GPL(blk_steal_bios); 3423 3424 static size_t order_to_size(unsigned int order) 3425 { 3426 return (size_t)PAGE_SIZE << order; 3427 } 3428 3429 /* called before freeing request pool in @tags */ 3430 static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, 3431 struct blk_mq_tags *tags) 3432 { 3433 struct page *page; 3434 3435 /* 3436 * There is no need to clear mapping if driver tags is not initialized 3437 * or the mapping belongs to the driver tags. 3438 */ 3439 if (!drv_tags || drv_tags == tags) 3440 return; 3441 3442 list_for_each_entry(page, &tags->page_list, lru) { 3443 unsigned long start = (unsigned long)page_address(page); 3444 unsigned long end = start + order_to_size(page->private); 3445 int i; 3446 3447 for (i = 0; i < drv_tags->nr_tags; i++) { 3448 struct request *rq = drv_tags->rqs[i]; 3449 unsigned long rq_addr = (unsigned long)rq; 3450 3451 if (rq_addr >= start && rq_addr < end) { 3452 WARN_ON_ONCE(req_ref_read(rq) != 0); 3453 cmpxchg(&drv_tags->rqs[i], rq, NULL); 3454 } 3455 } 3456 } 3457 } 3458 3459 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 3460 unsigned int hctx_idx) 3461 { 3462 struct blk_mq_tags *drv_tags; 3463 3464 if (list_empty(&tags->page_list)) 3465 return; 3466 3467 if (blk_mq_is_shared_tags(set->flags)) 3468 drv_tags = set->shared_tags; 3469 else 3470 drv_tags = set->tags[hctx_idx]; 3471 3472 if (tags->static_rqs && set->ops->exit_request) { 3473 int i; 3474 3475 for (i = 0; i < tags->nr_tags; i++) { 3476 struct request *rq = tags->static_rqs[i]; 3477 3478 if (!rq) 3479 continue; 3480 set->ops->exit_request(set, rq, hctx_idx); 3481 tags->static_rqs[i] = NULL; 3482 } 3483 } 3484 3485 blk_mq_clear_rq_mapping(drv_tags, tags); 3486 /* 3487 * Free request pages in SRCU callback, which is called from 3488 * blk_mq_free_tags(). 
3489 */ 3490 } 3491 3492 void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) 3493 { 3494 kfree(tags->rqs); 3495 tags->rqs = NULL; 3496 kfree(tags->static_rqs); 3497 tags->static_rqs = NULL; 3498 3499 blk_mq_free_tags(set, tags); 3500 } 3501 3502 static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, 3503 unsigned int hctx_idx) 3504 { 3505 int i; 3506 3507 for (i = 0; i < set->nr_maps; i++) { 3508 unsigned int start = set->map[i].queue_offset; 3509 unsigned int end = start + set->map[i].nr_queues; 3510 3511 if (hctx_idx >= start && hctx_idx < end) 3512 break; 3513 } 3514 3515 if (i >= set->nr_maps) 3516 i = HCTX_TYPE_DEFAULT; 3517 3518 return i; 3519 } 3520 3521 static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, 3522 unsigned int hctx_idx) 3523 { 3524 enum hctx_type type = hctx_idx_to_type(set, hctx_idx); 3525 3526 return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); 3527 } 3528 3529 static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 3530 unsigned int hctx_idx, 3531 unsigned int nr_tags, 3532 unsigned int reserved_tags) 3533 { 3534 int node = blk_mq_get_hctx_node(set, hctx_idx); 3535 struct blk_mq_tags *tags; 3536 3537 if (node == NUMA_NO_NODE) 3538 node = set->numa_node; 3539 3540 tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); 3541 if (!tags) 3542 return NULL; 3543 3544 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 3545 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 3546 node); 3547 if (!tags->rqs) 3548 goto err_free_tags; 3549 3550 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 3551 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 3552 node); 3553 if (!tags->static_rqs) 3554 goto err_free_rqs; 3555 3556 return tags; 3557 3558 err_free_rqs: 3559 kfree(tags->rqs); 3560 err_free_tags: 3561 blk_mq_free_tags(set, tags); 3562 return NULL; 3563 } 3564 3565 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 3566 unsigned int hctx_idx, int node) 3567 { 3568 int ret; 3569 3570 if (set->ops->init_request) { 3571 ret = set->ops->init_request(set, rq, hctx_idx, node); 3572 if (ret) 3573 return ret; 3574 } 3575 3576 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 3577 return 0; 3578 } 3579 3580 static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, 3581 struct blk_mq_tags *tags, 3582 unsigned int hctx_idx, unsigned int depth) 3583 { 3584 unsigned int i, j, entries_per_page, max_order = 4; 3585 int node = blk_mq_get_hctx_node(set, hctx_idx); 3586 size_t rq_size, left; 3587 3588 if (node == NUMA_NO_NODE) 3589 node = set->numa_node; 3590 3591 /* 3592 * rq_size is the size of the request plus driver payload, rounded 3593 * to the cacheline size 3594 */ 3595 rq_size = round_up(sizeof(struct request) + set->cmd_size, 3596 cache_line_size()); 3597 left = rq_size * depth; 3598 3599 for (i = 0; i < depth; ) { 3600 int this_order = max_order; 3601 struct page *page; 3602 int to_do; 3603 void *p; 3604 3605 while (this_order && left < order_to_size(this_order - 1)) 3606 this_order--; 3607 3608 do { 3609 page = alloc_pages_node(node, 3610 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 3611 this_order); 3612 if (page) 3613 break; 3614 if (!this_order--) 3615 break; 3616 if (order_to_size(this_order) < rq_size) 3617 break; 3618 } while (1); 3619 3620 if (!page) 3621 goto fail; 3622 3623 page->private = this_order; 3624 list_add_tail(&page->lru, &tags->page_list); 3625 3626 p = page_address(page); 3627 /* 3628 * Allow kmemleak to scan these pages as they contain pointers 3629 * to 
additional allocations like via ops->init_request(). 3630 */ 3631 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 3632 entries_per_page = order_to_size(this_order) / rq_size; 3633 to_do = min(entries_per_page, depth - i); 3634 left -= to_do * rq_size; 3635 for (j = 0; j < to_do; j++) { 3636 struct request *rq = p; 3637 3638 tags->static_rqs[i] = rq; 3639 if (blk_mq_init_request(set, rq, hctx_idx, node)) { 3640 tags->static_rqs[i] = NULL; 3641 goto fail; 3642 } 3643 3644 p += rq_size; 3645 i++; 3646 } 3647 } 3648 return 0; 3649 3650 fail: 3651 blk_mq_free_rqs(set, tags, hctx_idx); 3652 return -ENOMEM; 3653 } 3654 3655 struct rq_iter_data { 3656 struct blk_mq_hw_ctx *hctx; 3657 bool has_rq; 3658 }; 3659 3660 static bool blk_mq_has_request(struct request *rq, void *data) 3661 { 3662 struct rq_iter_data *iter_data = data; 3663 3664 if (rq->mq_hctx != iter_data->hctx) 3665 return true; 3666 iter_data->has_rq = true; 3667 return false; 3668 } 3669 3670 static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) 3671 { 3672 struct blk_mq_tags *tags = hctx->sched_tags ? 3673 hctx->sched_tags : hctx->tags; 3674 struct rq_iter_data data = { 3675 .hctx = hctx, 3676 }; 3677 int srcu_idx; 3678 3679 srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); 3680 blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); 3681 srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); 3682 3683 return data.has_rq; 3684 } 3685 3686 static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, 3687 unsigned int this_cpu) 3688 { 3689 enum hctx_type type = hctx->type; 3690 int cpu; 3691 3692 /* 3693 * hctx->cpumask has to rule out isolated CPUs, but userspace still 3694 * might submit IOs on these isolated CPUs, so use the queue map to 3695 * check if all CPUs mapped to this hctx are offline 3696 */ 3697 for_each_online_cpu(cpu) { 3698 struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, 3699 type, cpu); 3700 3701 if (h != hctx) 3702 continue; 3703 3704 /* this hctx has at least one online CPU */ 3705 if (this_cpu != cpu) 3706 return true; 3707 } 3708 3709 return false; 3710 } 3711 3712 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) 3713 { 3714 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 3715 struct blk_mq_hw_ctx, cpuhp_online); 3716 3717 if (blk_mq_hctx_has_online_cpu(hctx, cpu)) 3718 return 0; 3719 3720 /* 3721 * Prevent new request from being allocated on the current hctx. 3722 * 3723 * The smp_mb__after_atomic() Pairs with the implied barrier in 3724 * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is 3725 * seen once we return from the tag allocator. 3726 */ 3727 set_bit(BLK_MQ_S_INACTIVE, &hctx->state); 3728 smp_mb__after_atomic(); 3729 3730 /* 3731 * Try to grab a reference to the queue and wait for any outstanding 3732 * requests. If we could not grab a reference the queue has been 3733 * frozen and there are no requests. 3734 */ 3735 if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { 3736 while (blk_mq_hctx_has_requests(hctx)) 3737 msleep(5); 3738 percpu_ref_put(&hctx->queue->q_usage_counter); 3739 } 3740 3741 return 0; 3742 } 3743 3744 /* 3745 * Check if one CPU is mapped to the specified hctx 3746 * 3747 * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed 3748 * to be used for scheduling kworker only. 
For other usage, please call this 3749 * helper for checking if one CPU belongs to the specified hctx 3750 */ 3751 static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, 3752 const struct blk_mq_hw_ctx *hctx) 3753 { 3754 struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, 3755 hctx->type, cpu); 3756 3757 return mapped_hctx == hctx; 3758 } 3759 3760 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) 3761 { 3762 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 3763 struct blk_mq_hw_ctx, cpuhp_online); 3764 3765 if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) 3766 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); 3767 return 0; 3768 } 3769 3770 /* 3771 * 'cpu' is going away. splice any existing rq_list entries from this 3772 * software queue to the hw queue dispatch list, and ensure that it 3773 * gets run. 3774 */ 3775 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 3776 { 3777 struct blk_mq_hw_ctx *hctx; 3778 struct blk_mq_ctx *ctx; 3779 LIST_HEAD(tmp); 3780 enum hctx_type type; 3781 3782 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 3783 if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) 3784 return 0; 3785 3786 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 3787 type = hctx->type; 3788 3789 spin_lock(&ctx->lock); 3790 if (!list_empty(&ctx->rq_lists[type])) { 3791 list_splice_init(&ctx->rq_lists[type], &tmp); 3792 blk_mq_hctx_clear_pending(hctx, ctx); 3793 } 3794 spin_unlock(&ctx->lock); 3795 3796 if (list_empty(&tmp)) 3797 return 0; 3798 3799 spin_lock(&hctx->lock); 3800 list_splice_tail_init(&tmp, &hctx->dispatch); 3801 spin_unlock(&hctx->lock); 3802 3803 blk_mq_run_hw_queue(hctx, true); 3804 return 0; 3805 } 3806 3807 static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 3808 { 3809 lockdep_assert_held(&blk_mq_cpuhp_lock); 3810 3811 if (!(hctx->flags & BLK_MQ_F_STACKING) && 3812 !hlist_unhashed(&hctx->cpuhp_online)) { 3813 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 3814 &hctx->cpuhp_online); 3815 INIT_HLIST_NODE(&hctx->cpuhp_online); 3816 } 3817 3818 if (!hlist_unhashed(&hctx->cpuhp_dead)) { 3819 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 3820 &hctx->cpuhp_dead); 3821 INIT_HLIST_NODE(&hctx->cpuhp_dead); 3822 } 3823 } 3824 3825 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 3826 { 3827 mutex_lock(&blk_mq_cpuhp_lock); 3828 __blk_mq_remove_cpuhp(hctx); 3829 mutex_unlock(&blk_mq_cpuhp_lock); 3830 } 3831 3832 static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) 3833 { 3834 lockdep_assert_held(&blk_mq_cpuhp_lock); 3835 3836 if (!(hctx->flags & BLK_MQ_F_STACKING) && 3837 hlist_unhashed(&hctx->cpuhp_online)) 3838 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 3839 &hctx->cpuhp_online); 3840 3841 if (hlist_unhashed(&hctx->cpuhp_dead)) 3842 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, 3843 &hctx->cpuhp_dead); 3844 } 3845 3846 static void __blk_mq_remove_cpuhp_list(struct list_head *head) 3847 { 3848 struct blk_mq_hw_ctx *hctx; 3849 3850 lockdep_assert_held(&blk_mq_cpuhp_lock); 3851 3852 list_for_each_entry(hctx, head, hctx_list) 3853 __blk_mq_remove_cpuhp(hctx); 3854 } 3855 3856 /* 3857 * Unregister cpuhp callbacks from exited hw queues 3858 * 3859 * Safe to call if this `request_queue` is live 3860 */ 3861 static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) 3862 { 3863 LIST_HEAD(hctx_list); 3864 3865 spin_lock(&q->unused_hctx_lock); 3866 list_splice_init(&q->unused_hctx_list, &hctx_list); 3867 spin_unlock(&q->unused_hctx_lock); 3868 
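	/*
	 * Remove the cpuhp instances for the detached hctxs while holding
	 * only blk_mq_cpuhp_lock, then put them back on the unused list.
	 */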
3869 mutex_lock(&blk_mq_cpuhp_lock); 3870 __blk_mq_remove_cpuhp_list(&hctx_list); 3871 mutex_unlock(&blk_mq_cpuhp_lock); 3872 3873 spin_lock(&q->unused_hctx_lock); 3874 list_splice(&hctx_list, &q->unused_hctx_list); 3875 spin_unlock(&q->unused_hctx_lock); 3876 } 3877 3878 /* 3879 * Register cpuhp callbacks from all hw queues 3880 * 3881 * Safe to call if this `request_queue` is live 3882 */ 3883 static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) 3884 { 3885 struct blk_mq_hw_ctx *hctx; 3886 unsigned long i; 3887 3888 mutex_lock(&blk_mq_cpuhp_lock); 3889 queue_for_each_hw_ctx(q, hctx, i) 3890 __blk_mq_add_cpuhp(hctx); 3891 mutex_unlock(&blk_mq_cpuhp_lock); 3892 } 3893 3894 /* 3895 * Before freeing hw queue, clearing the flush request reference in 3896 * tags->rqs[] for avoiding potential UAF. 3897 */ 3898 static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, 3899 unsigned int queue_depth, struct request *flush_rq) 3900 { 3901 int i; 3902 3903 /* The hw queue may not be mapped yet */ 3904 if (!tags) 3905 return; 3906 3907 WARN_ON_ONCE(req_ref_read(flush_rq) != 0); 3908 3909 for (i = 0; i < queue_depth; i++) 3910 cmpxchg(&tags->rqs[i], flush_rq, NULL); 3911 } 3912 3913 static void blk_free_flush_queue_callback(struct rcu_head *head) 3914 { 3915 struct blk_flush_queue *fq = 3916 container_of(head, struct blk_flush_queue, rcu_head); 3917 3918 blk_free_flush_queue(fq); 3919 } 3920 3921 /* hctx->ctxs will be freed in queue's release handler */ 3922 static void blk_mq_exit_hctx(struct request_queue *q, 3923 struct blk_mq_tag_set *set, 3924 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 3925 { 3926 struct request *flush_rq = hctx->fq->flush_rq; 3927 3928 if (blk_mq_hw_queue_mapped(hctx)) 3929 blk_mq_tag_idle(hctx); 3930 3931 if (blk_queue_init_done(q)) 3932 blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], 3933 set->queue_depth, flush_rq); 3934 if (set->ops->exit_request) 3935 set->ops->exit_request(set, flush_rq, hctx_idx); 3936 3937 if (set->ops->exit_hctx) 3938 set->ops->exit_hctx(hctx, hctx_idx); 3939 3940 call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, 3941 blk_free_flush_queue_callback); 3942 hctx->fq = NULL; 3943 3944 xa_erase(&q->hctx_table, hctx_idx); 3945 3946 spin_lock(&q->unused_hctx_lock); 3947 list_add(&hctx->hctx_list, &q->unused_hctx_list); 3948 spin_unlock(&q->unused_hctx_lock); 3949 } 3950 3951 static void blk_mq_exit_hw_queues(struct request_queue *q, 3952 struct blk_mq_tag_set *set, int nr_queue) 3953 { 3954 struct blk_mq_hw_ctx *hctx; 3955 unsigned long i; 3956 3957 queue_for_each_hw_ctx(q, hctx, i) { 3958 if (i == nr_queue) 3959 break; 3960 blk_mq_remove_cpuhp(hctx); 3961 blk_mq_exit_hctx(q, set, hctx, i); 3962 } 3963 } 3964 3965 static int blk_mq_init_hctx(struct request_queue *q, 3966 struct blk_mq_tag_set *set, 3967 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 3968 { 3969 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 3970 3971 hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); 3972 if (!hctx->fq) 3973 goto fail; 3974 3975 hctx->queue_num = hctx_idx; 3976 3977 hctx->tags = set->tags[hctx_idx]; 3978 3979 if (set->ops->init_hctx && 3980 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 3981 goto fail_free_fq; 3982 3983 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, 3984 hctx->numa_node)) 3985 goto exit_hctx; 3986 3987 if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) 3988 goto exit_flush_rq; 3989 3990 return 0; 3991 3992 exit_flush_rq: 3993 if (set->ops->exit_request) 3994 
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 3995 exit_hctx: 3996 if (set->ops->exit_hctx) 3997 set->ops->exit_hctx(hctx, hctx_idx); 3998 fail_free_fq: 3999 blk_free_flush_queue(hctx->fq); 4000 hctx->fq = NULL; 4001 fail: 4002 return -1; 4003 } 4004 4005 static struct blk_mq_hw_ctx * 4006 blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, 4007 int node) 4008 { 4009 struct blk_mq_hw_ctx *hctx; 4010 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 4011 4012 hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); 4013 if (!hctx) 4014 goto fail_alloc_hctx; 4015 4016 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) 4017 goto free_hctx; 4018 4019 atomic_set(&hctx->nr_active, 0); 4020 if (node == NUMA_NO_NODE) 4021 node = set->numa_node; 4022 hctx->numa_node = node; 4023 4024 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 4025 spin_lock_init(&hctx->lock); 4026 INIT_LIST_HEAD(&hctx->dispatch); 4027 INIT_HLIST_NODE(&hctx->cpuhp_dead); 4028 INIT_HLIST_NODE(&hctx->cpuhp_online); 4029 hctx->queue = q; 4030 hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; 4031 4032 INIT_LIST_HEAD(&hctx->hctx_list); 4033 4034 /* 4035 * Allocate space for all possible cpus to avoid allocation at 4036 * runtime 4037 */ 4038 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 4039 gfp, node); 4040 if (!hctx->ctxs) 4041 goto free_cpumask; 4042 4043 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 4044 gfp, node, false, false)) 4045 goto free_ctxs; 4046 hctx->nr_ctx = 0; 4047 4048 spin_lock_init(&hctx->dispatch_wait_lock); 4049 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 4050 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 4051 4052 blk_mq_hctx_kobj_init(hctx); 4053 4054 return hctx; 4055 4056 free_ctxs: 4057 kfree(hctx->ctxs); 4058 free_cpumask: 4059 free_cpumask_var(hctx->cpumask); 4060 free_hctx: 4061 kfree(hctx); 4062 fail_alloc_hctx: 4063 return NULL; 4064 } 4065 4066 static void blk_mq_init_cpu_queues(struct request_queue *q, 4067 unsigned int nr_hw_queues) 4068 { 4069 struct blk_mq_tag_set *set = q->tag_set; 4070 unsigned int i, j; 4071 4072 for_each_possible_cpu(i) { 4073 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 4074 struct blk_mq_hw_ctx *hctx; 4075 int k; 4076 4077 __ctx->cpu = i; 4078 spin_lock_init(&__ctx->lock); 4079 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) 4080 INIT_LIST_HEAD(&__ctx->rq_lists[k]); 4081 4082 __ctx->queue = q; 4083 4084 /* 4085 * Set local node, IFF we have more than one hw queue. 
If 4086 * not, we remain on the home node of the device 4087 */ 4088 for (j = 0; j < set->nr_maps; j++) { 4089 hctx = blk_mq_map_queue_type(q, j, i); 4090 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 4091 hctx->numa_node = cpu_to_node(i); 4092 } 4093 } 4094 } 4095 4096 struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 4097 unsigned int hctx_idx, 4098 unsigned int depth) 4099 { 4100 struct blk_mq_tags *tags; 4101 int ret; 4102 4103 tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); 4104 if (!tags) 4105 return NULL; 4106 4107 ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); 4108 if (ret) { 4109 blk_mq_free_rq_map(set, tags); 4110 return NULL; 4111 } 4112 4113 return tags; 4114 } 4115 4116 static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 4117 int hctx_idx) 4118 { 4119 if (blk_mq_is_shared_tags(set->flags)) { 4120 set->tags[hctx_idx] = set->shared_tags; 4121 4122 return true; 4123 } 4124 4125 set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, 4126 set->queue_depth); 4127 4128 return set->tags[hctx_idx]; 4129 } 4130 4131 void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 4132 struct blk_mq_tags *tags, 4133 unsigned int hctx_idx) 4134 { 4135 if (tags) { 4136 blk_mq_free_rqs(set, tags, hctx_idx); 4137 blk_mq_free_rq_map(set, tags); 4138 } 4139 } 4140 4141 static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 4142 unsigned int hctx_idx) 4143 { 4144 if (!blk_mq_is_shared_tags(set->flags)) 4145 blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); 4146 4147 set->tags[hctx_idx] = NULL; 4148 } 4149 4150 static void blk_mq_map_swqueue(struct request_queue *q) 4151 { 4152 unsigned int j, hctx_idx; 4153 unsigned long i; 4154 struct blk_mq_hw_ctx *hctx; 4155 struct blk_mq_ctx *ctx; 4156 struct blk_mq_tag_set *set = q->tag_set; 4157 4158 queue_for_each_hw_ctx(q, hctx, i) { 4159 cpumask_clear(hctx->cpumask); 4160 hctx->nr_ctx = 0; 4161 hctx->dispatch_from = NULL; 4162 } 4163 4164 /* 4165 * Map software to hardware queues. 4166 * 4167 * If the cpu isn't present, the cpu is mapped to first hctx. 4168 */ 4169 for_each_possible_cpu(i) { 4170 4171 ctx = per_cpu_ptr(q->queue_ctx, i); 4172 for (j = 0; j < set->nr_maps; j++) { 4173 if (!set->map[j].nr_queues) { 4174 ctx->hctxs[j] = blk_mq_map_queue_type(q, 4175 HCTX_TYPE_DEFAULT, i); 4176 continue; 4177 } 4178 hctx_idx = set->map[j].mq_map[i]; 4179 /* unmapped hw queue can be remapped after CPU topo changed */ 4180 if (!set->tags[hctx_idx] && 4181 !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { 4182 /* 4183 * If tags initialization fail for some hctx, 4184 * that hctx won't be brought online. In this 4185 * case, remap the current ctx to hctx[0] which 4186 * is guaranteed to always have tags allocated 4187 */ 4188 set->map[j].mq_map[i] = 0; 4189 } 4190 4191 hctx = blk_mq_map_queue_type(q, j, i); 4192 ctx->hctxs[j] = hctx; 4193 /* 4194 * If the CPU is already set in the mask, then we've 4195 * mapped this one already. This can happen if 4196 * devices share queues across queue maps. 4197 */ 4198 if (cpumask_test_cpu(i, hctx->cpumask)) 4199 continue; 4200 4201 cpumask_set_cpu(i, hctx->cpumask); 4202 hctx->type = j; 4203 ctx->index_hw[hctx->type] = hctx->nr_ctx; 4204 hctx->ctxs[hctx->nr_ctx++] = ctx; 4205 4206 /* 4207 * If the nr_ctx type overflows, we have exceeded the 4208 * amount of sw queues we can support. 
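			 * This shows up here as hctx->nr_ctx wrapping back to
			 * zero after the increment above.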
4209 */ 4210 BUG_ON(!hctx->nr_ctx); 4211 } 4212 4213 for (; j < HCTX_MAX_TYPES; j++) 4214 ctx->hctxs[j] = blk_mq_map_queue_type(q, 4215 HCTX_TYPE_DEFAULT, i); 4216 } 4217 4218 queue_for_each_hw_ctx(q, hctx, i) { 4219 int cpu; 4220 4221 /* 4222 * If no software queues are mapped to this hardware queue, 4223 * disable it and free the request entries. 4224 */ 4225 if (!hctx->nr_ctx) { 4226 /* Never unmap queue 0. We need it as a 4227 * fallback in case of a new remap fails 4228 * allocation 4229 */ 4230 if (i) 4231 __blk_mq_free_map_and_rqs(set, i); 4232 4233 hctx->tags = NULL; 4234 continue; 4235 } 4236 4237 hctx->tags = set->tags[i]; 4238 WARN_ON(!hctx->tags); 4239 4240 /* 4241 * Set the map size to the number of mapped software queues. 4242 * This is more accurate and more efficient than looping 4243 * over all possibly mapped software queues. 4244 */ 4245 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 4246 4247 /* 4248 * Rule out isolated CPUs from hctx->cpumask to avoid 4249 * running block kworker on isolated CPUs 4250 */ 4251 for_each_cpu(cpu, hctx->cpumask) { 4252 if (cpu_is_isolated(cpu)) 4253 cpumask_clear_cpu(cpu, hctx->cpumask); 4254 } 4255 4256 /* 4257 * Initialize batch roundrobin counts 4258 */ 4259 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); 4260 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 4261 } 4262 } 4263 4264 /* 4265 * Caller needs to ensure that we're either frozen/quiesced, or that 4266 * the queue isn't live yet. 4267 */ 4268 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 4269 { 4270 struct blk_mq_hw_ctx *hctx; 4271 unsigned long i; 4272 4273 queue_for_each_hw_ctx(q, hctx, i) { 4274 if (shared) { 4275 hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 4276 } else { 4277 blk_mq_tag_idle(hctx); 4278 hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 4279 } 4280 } 4281 } 4282 4283 static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, 4284 bool shared) 4285 { 4286 struct request_queue *q; 4287 unsigned int memflags; 4288 4289 lockdep_assert_held(&set->tag_list_lock); 4290 4291 list_for_each_entry(q, &set->tag_list, tag_set_list) { 4292 memflags = blk_mq_freeze_queue(q); 4293 queue_set_hctx_shared(q, shared); 4294 blk_mq_unfreeze_queue(q, memflags); 4295 } 4296 } 4297 4298 static void blk_mq_del_queue_tag_set(struct request_queue *q) 4299 { 4300 struct blk_mq_tag_set *set = q->tag_set; 4301 4302 mutex_lock(&set->tag_list_lock); 4303 list_del(&q->tag_set_list); 4304 if (list_is_singular(&set->tag_list)) { 4305 /* just transitioned to unshared */ 4306 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 4307 /* update existing queue */ 4308 blk_mq_update_tag_set_shared(set, false); 4309 } 4310 mutex_unlock(&set->tag_list_lock); 4311 INIT_LIST_HEAD(&q->tag_set_list); 4312 } 4313 4314 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 4315 struct request_queue *q) 4316 { 4317 mutex_lock(&set->tag_list_lock); 4318 4319 /* 4320 * Check to see if we're transitioning to shared (from 1 to 2 queues). 
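	 * When the second queue is added, every queue already on the tag list
	 * is frozen and flagged BLK_MQ_F_TAG_QUEUE_SHARED before the new
	 * queue is linked in.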
4321 */ 4322 if (!list_empty(&set->tag_list) && 4323 !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 4324 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 4325 /* update existing queue */ 4326 blk_mq_update_tag_set_shared(set, true); 4327 } 4328 if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) 4329 queue_set_hctx_shared(q, true); 4330 list_add_tail(&q->tag_set_list, &set->tag_list); 4331 4332 mutex_unlock(&set->tag_list_lock); 4333 } 4334 4335 /* All allocations will be freed in release handler of q->mq_kobj */ 4336 static int blk_mq_alloc_ctxs(struct request_queue *q) 4337 { 4338 struct blk_mq_ctxs *ctxs; 4339 int cpu; 4340 4341 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 4342 if (!ctxs) 4343 return -ENOMEM; 4344 4345 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 4346 if (!ctxs->queue_ctx) 4347 goto fail; 4348 4349 for_each_possible_cpu(cpu) { 4350 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 4351 ctx->ctxs = ctxs; 4352 } 4353 4354 q->mq_kobj = &ctxs->kobj; 4355 q->queue_ctx = ctxs->queue_ctx; 4356 4357 return 0; 4358 fail: 4359 kfree(ctxs); 4360 return -ENOMEM; 4361 } 4362 4363 /* 4364 * It is the actual release handler for mq, but we do it from 4365 * request queue's release handler for avoiding use-after-free 4366 * and headache because q->mq_kobj shouldn't have been introduced, 4367 * but we can't group ctx/kctx kobj without it. 4368 */ 4369 void blk_mq_release(struct request_queue *q) 4370 { 4371 struct blk_mq_hw_ctx *hctx, *next; 4372 unsigned long i; 4373 4374 queue_for_each_hw_ctx(q, hctx, i) 4375 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); 4376 4377 /* all hctx are in .unused_hctx_list now */ 4378 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { 4379 list_del_init(&hctx->hctx_list); 4380 kobject_put(&hctx->kobj); 4381 } 4382 4383 xa_destroy(&q->hctx_table); 4384 4385 /* 4386 * release .mq_kobj and sw queue's kobject now because 4387 * both share lifetime with request queue. 4388 */ 4389 blk_mq_sysfs_deinit(q); 4390 } 4391 4392 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, 4393 struct queue_limits *lim, void *queuedata) 4394 { 4395 struct queue_limits default_lim = { }; 4396 struct request_queue *q; 4397 int ret; 4398 4399 if (!lim) 4400 lim = &default_lim; 4401 lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 4402 if (set->nr_maps > HCTX_TYPE_POLL) 4403 lim->features |= BLK_FEAT_POLL; 4404 4405 q = blk_alloc_queue(lim, set->numa_node); 4406 if (IS_ERR(q)) 4407 return q; 4408 q->queuedata = queuedata; 4409 ret = blk_mq_init_allocated_queue(set, q); 4410 if (ret) { 4411 blk_put_queue(q); 4412 return ERR_PTR(ret); 4413 } 4414 return q; 4415 } 4416 EXPORT_SYMBOL(blk_mq_alloc_queue); 4417 4418 /** 4419 * blk_mq_destroy_queue - shutdown a request queue 4420 * @q: request queue to shutdown 4421 * 4422 * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future 4423 * requests will be failed with -ENODEV. The caller is responsible for dropping 4424 * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). 
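 * The queue is marked dying and fully drained before its hardware contexts
 * are torn down.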
4425 * 4426 * Context: can sleep 4427 */ 4428 void blk_mq_destroy_queue(struct request_queue *q) 4429 { 4430 WARN_ON_ONCE(!queue_is_mq(q)); 4431 WARN_ON_ONCE(blk_queue_registered(q)); 4432 4433 might_sleep(); 4434 4435 blk_queue_flag_set(QUEUE_FLAG_DYING, q); 4436 blk_queue_start_drain(q); 4437 blk_mq_freeze_queue_wait(q); 4438 4439 blk_sync_queue(q); 4440 blk_mq_cancel_work_sync(q); 4441 blk_mq_exit_queue(q); 4442 } 4443 EXPORT_SYMBOL(blk_mq_destroy_queue); 4444 4445 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, 4446 struct queue_limits *lim, void *queuedata, 4447 struct lock_class_key *lkclass) 4448 { 4449 struct request_queue *q; 4450 struct gendisk *disk; 4451 4452 q = blk_mq_alloc_queue(set, lim, queuedata); 4453 if (IS_ERR(q)) 4454 return ERR_CAST(q); 4455 4456 disk = __alloc_disk_node(q, set->numa_node, lkclass); 4457 if (!disk) { 4458 blk_mq_destroy_queue(q); 4459 blk_put_queue(q); 4460 return ERR_PTR(-ENOMEM); 4461 } 4462 set_bit(GD_OWNS_QUEUE, &disk->state); 4463 return disk; 4464 } 4465 EXPORT_SYMBOL(__blk_mq_alloc_disk); 4466 4467 struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, 4468 struct lock_class_key *lkclass) 4469 { 4470 struct gendisk *disk; 4471 4472 if (!blk_get_queue(q)) 4473 return NULL; 4474 disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); 4475 if (!disk) 4476 blk_put_queue(q); 4477 return disk; 4478 } 4479 EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); 4480 4481 /* 4482 * Only hctx removed from cpuhp list can be reused 4483 */ 4484 static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) 4485 { 4486 return hlist_unhashed(&hctx->cpuhp_online) && 4487 hlist_unhashed(&hctx->cpuhp_dead); 4488 } 4489 4490 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 4491 struct blk_mq_tag_set *set, struct request_queue *q, 4492 int hctx_idx, int node) 4493 { 4494 struct blk_mq_hw_ctx *hctx = NULL, *tmp; 4495 4496 /* reuse dead hctx first */ 4497 spin_lock(&q->unused_hctx_lock); 4498 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { 4499 if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { 4500 hctx = tmp; 4501 break; 4502 } 4503 } 4504 if (hctx) 4505 list_del_init(&hctx->hctx_list); 4506 spin_unlock(&q->unused_hctx_lock); 4507 4508 if (!hctx) 4509 hctx = blk_mq_alloc_hctx(q, set, node); 4510 if (!hctx) 4511 goto fail; 4512 4513 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) 4514 goto free_hctx; 4515 4516 return hctx; 4517 4518 free_hctx: 4519 kobject_put(&hctx->kobj); 4520 fail: 4521 return NULL; 4522 } 4523 4524 static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4525 struct request_queue *q) 4526 { 4527 struct blk_mq_hw_ctx *hctx; 4528 unsigned long i, j; 4529 4530 for (i = 0; i < set->nr_hw_queues; i++) { 4531 int old_node; 4532 int node = blk_mq_get_hctx_node(set, i); 4533 struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); 4534 4535 if (old_hctx) { 4536 old_node = old_hctx->numa_node; 4537 blk_mq_exit_hctx(q, set, old_hctx, i); 4538 } 4539 4540 if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { 4541 if (!old_hctx) 4542 break; 4543 pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", 4544 node, old_node); 4545 hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); 4546 WARN_ON_ONCE(!hctx); 4547 } 4548 } 4549 /* 4550 * Increasing nr_hw_queues fails. Free the newly allocated 4551 * hctxs and keep the previous q->nr_hw_queues. 
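	 * Either way, the xa_for_each_start() below tears down any hctx left
	 * at index j and above: on success j is the new queue count, on
	 * failure it is the previous one.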
4552 */ 4553 if (i != set->nr_hw_queues) { 4554 j = q->nr_hw_queues; 4555 } else { 4556 j = i; 4557 q->nr_hw_queues = set->nr_hw_queues; 4558 } 4559 4560 xa_for_each_start(&q->hctx_table, j, hctx, j) 4561 blk_mq_exit_hctx(q, set, hctx, j); 4562 } 4563 4564 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4565 struct request_queue *q) 4566 { 4567 __blk_mq_realloc_hw_ctxs(set, q); 4568 4569 /* unregister cpuhp callbacks for exited hctxs */ 4570 blk_mq_remove_hw_queues_cpuhp(q); 4571 4572 /* register cpuhp for new initialized hctxs */ 4573 blk_mq_add_hw_queues_cpuhp(q); 4574 } 4575 4576 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 4577 struct request_queue *q) 4578 { 4579 /* mark the queue as mq asap */ 4580 q->mq_ops = set->ops; 4581 4582 /* 4583 * ->tag_set has to be setup before initialize hctx, which cpuphp 4584 * handler needs it for checking queue mapping 4585 */ 4586 q->tag_set = set; 4587 4588 if (blk_mq_alloc_ctxs(q)) 4589 goto err_exit; 4590 4591 /* init q->mq_kobj and sw queues' kobjects */ 4592 blk_mq_sysfs_init(q); 4593 4594 INIT_LIST_HEAD(&q->unused_hctx_list); 4595 spin_lock_init(&q->unused_hctx_lock); 4596 4597 xa_init(&q->hctx_table); 4598 4599 blk_mq_realloc_hw_ctxs(set, q); 4600 if (!q->nr_hw_queues) 4601 goto err_hctxs; 4602 4603 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 4604 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 4605 4606 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 4607 4608 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 4609 INIT_LIST_HEAD(&q->flush_list); 4610 INIT_LIST_HEAD(&q->requeue_list); 4611 spin_lock_init(&q->requeue_lock); 4612 4613 q->nr_requests = set->queue_depth; 4614 4615 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 4616 blk_mq_map_swqueue(q); 4617 blk_mq_add_queue_tag_set(set, q); 4618 return 0; 4619 4620 err_hctxs: 4621 blk_mq_release(q); 4622 err_exit: 4623 q->mq_ops = NULL; 4624 return -ENOMEM; 4625 } 4626 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 4627 4628 /* tags can _not_ be used after returning from blk_mq_exit_queue */ 4629 void blk_mq_exit_queue(struct request_queue *q) 4630 { 4631 struct blk_mq_tag_set *set = q->tag_set; 4632 4633 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ 4634 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 4635 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ 4636 blk_mq_del_queue_tag_set(q); 4637 } 4638 4639 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 4640 { 4641 int i; 4642 4643 if (blk_mq_is_shared_tags(set->flags)) { 4644 set->shared_tags = blk_mq_alloc_map_and_rqs(set, 4645 BLK_MQ_NO_HCTX_IDX, 4646 set->queue_depth); 4647 if (!set->shared_tags) 4648 return -ENOMEM; 4649 } 4650 4651 for (i = 0; i < set->nr_hw_queues; i++) { 4652 if (!__blk_mq_alloc_map_and_rqs(set, i)) 4653 goto out_unwind; 4654 cond_resched(); 4655 } 4656 4657 return 0; 4658 4659 out_unwind: 4660 while (--i >= 0) 4661 __blk_mq_free_map_and_rqs(set, i); 4662 4663 if (blk_mq_is_shared_tags(set->flags)) { 4664 blk_mq_free_map_and_rqs(set, set->shared_tags, 4665 BLK_MQ_NO_HCTX_IDX); 4666 } 4667 4668 return -ENOMEM; 4669 } 4670 4671 /* 4672 * Allocate the request maps associated with this tag_set. Note that this 4673 * may reduce the depth asked for, if memory is tight. set->queue_depth 4674 * will be updated to reflect the allocated depth. 
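 * On allocation failure the depth is halved and retried, giving up once it
 * would drop below set->reserved_tags + BLK_MQ_TAG_MIN.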
4675 */ 4676 static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) 4677 { 4678 unsigned int depth; 4679 int err; 4680 4681 depth = set->queue_depth; 4682 do { 4683 err = __blk_mq_alloc_rq_maps(set); 4684 if (!err) 4685 break; 4686 4687 set->queue_depth >>= 1; 4688 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 4689 err = -ENOMEM; 4690 break; 4691 } 4692 } while (set->queue_depth); 4693 4694 if (!set->queue_depth || err) { 4695 pr_err("blk-mq: failed to allocate request map\n"); 4696 return -ENOMEM; 4697 } 4698 4699 if (depth != set->queue_depth) 4700 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 4701 depth, set->queue_depth); 4702 4703 return 0; 4704 } 4705 4706 static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) 4707 { 4708 /* 4709 * blk_mq_map_queues() and multiple .map_queues() implementations 4710 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the 4711 * number of hardware queues. 4712 */ 4713 if (set->nr_maps == 1) 4714 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; 4715 4716 if (set->ops->map_queues) { 4717 int i; 4718 4719 /* 4720 * transport .map_queues is usually done in the following 4721 * way: 4722 * 4723 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 4724 * mask = get_cpu_mask(queue) 4725 * for_each_cpu(cpu, mask) 4726 * set->map[x].mq_map[cpu] = queue; 4727 * } 4728 * 4729 * When we need to remap, the table has to be cleared for 4730 * killing stale mapping since one CPU may not be mapped 4731 * to any hw queue. 4732 */ 4733 for (i = 0; i < set->nr_maps; i++) 4734 blk_mq_clear_mq_map(&set->map[i]); 4735 4736 set->ops->map_queues(set); 4737 } else { 4738 BUG_ON(set->nr_maps > 1); 4739 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4740 } 4741 } 4742 4743 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, 4744 int new_nr_hw_queues) 4745 { 4746 struct blk_mq_tags **new_tags; 4747 int i; 4748 4749 if (set->nr_hw_queues >= new_nr_hw_queues) 4750 goto done; 4751 4752 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), 4753 GFP_KERNEL, set->numa_node); 4754 if (!new_tags) 4755 return -ENOMEM; 4756 4757 if (set->tags) 4758 memcpy(new_tags, set->tags, set->nr_hw_queues * 4759 sizeof(*set->tags)); 4760 kfree(set->tags); 4761 set->tags = new_tags; 4762 4763 for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { 4764 if (!__blk_mq_alloc_map_and_rqs(set, i)) { 4765 while (--i >= set->nr_hw_queues) 4766 __blk_mq_free_map_and_rqs(set, i); 4767 return -ENOMEM; 4768 } 4769 cond_resched(); 4770 } 4771 4772 done: 4773 set->nr_hw_queues = new_nr_hw_queues; 4774 return 0; 4775 } 4776 4777 /* 4778 * Alloc a tag set to be associated with one or more request queues. 4779 * May fail with EINVAL for various error conditions. May adjust the 4780 * requested depth down, if it's too large. In that case, the set 4781 * value will be stored in set->queue_depth. 
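 * Depths larger than BLK_MQ_MAX_DEPTH are clamped, and kdump kernels are
 * limited to 64 tags to keep memory usage down.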
4782 */ 4783 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 4784 { 4785 int i, ret; 4786 4787 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 4788 4789 if (!set->nr_hw_queues) 4790 return -EINVAL; 4791 if (!set->queue_depth) 4792 return -EINVAL; 4793 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 4794 return -EINVAL; 4795 4796 if (!set->ops->queue_rq) 4797 return -EINVAL; 4798 4799 if (!set->ops->get_budget ^ !set->ops->put_budget) 4800 return -EINVAL; 4801 4802 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 4803 pr_info("blk-mq: reduced tag depth to %u\n", 4804 BLK_MQ_MAX_DEPTH); 4805 set->queue_depth = BLK_MQ_MAX_DEPTH; 4806 } 4807 4808 if (!set->nr_maps) 4809 set->nr_maps = 1; 4810 else if (set->nr_maps > HCTX_MAX_TYPES) 4811 return -EINVAL; 4812 4813 /* 4814 * If a crashdump is active, then we are potentially in a very 4815 * memory constrained environment. Limit us to 64 tags to prevent 4816 * using too much memory. 4817 */ 4818 if (is_kdump_kernel()) 4819 set->queue_depth = min(64U, set->queue_depth); 4820 4821 /* 4822 * There is no use for more h/w queues than cpus if we just have 4823 * a single map 4824 */ 4825 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 4826 set->nr_hw_queues = nr_cpu_ids; 4827 4828 if (set->flags & BLK_MQ_F_BLOCKING) { 4829 set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); 4830 if (!set->srcu) 4831 return -ENOMEM; 4832 ret = init_srcu_struct(set->srcu); 4833 if (ret) 4834 goto out_free_srcu; 4835 } 4836 ret = init_srcu_struct(&set->tags_srcu); 4837 if (ret) 4838 goto out_cleanup_srcu; 4839 4840 init_rwsem(&set->update_nr_hwq_lock); 4841 4842 ret = -ENOMEM; 4843 set->tags = kcalloc_node(set->nr_hw_queues, 4844 sizeof(struct blk_mq_tags *), GFP_KERNEL, 4845 set->numa_node); 4846 if (!set->tags) 4847 goto out_cleanup_tags_srcu; 4848 4849 for (i = 0; i < set->nr_maps; i++) { 4850 set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 4851 sizeof(set->map[i].mq_map[0]), 4852 GFP_KERNEL, set->numa_node); 4853 if (!set->map[i].mq_map) 4854 goto out_free_mq_map; 4855 set->map[i].nr_queues = set->nr_hw_queues; 4856 } 4857 4858 blk_mq_update_queue_map(set); 4859 4860 ret = blk_mq_alloc_set_map_and_rqs(set); 4861 if (ret) 4862 goto out_free_mq_map; 4863 4864 mutex_init(&set->tag_list_lock); 4865 INIT_LIST_HEAD(&set->tag_list); 4866 4867 return 0; 4868 4869 out_free_mq_map: 4870 for (i = 0; i < set->nr_maps; i++) { 4871 kfree(set->map[i].mq_map); 4872 set->map[i].mq_map = NULL; 4873 } 4874 kfree(set->tags); 4875 set->tags = NULL; 4876 out_cleanup_tags_srcu: 4877 cleanup_srcu_struct(&set->tags_srcu); 4878 out_cleanup_srcu: 4879 if (set->flags & BLK_MQ_F_BLOCKING) 4880 cleanup_srcu_struct(set->srcu); 4881 out_free_srcu: 4882 if (set->flags & BLK_MQ_F_BLOCKING) 4883 kfree(set->srcu); 4884 return ret; 4885 } 4886 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 4887 4888 /* allocate and initialize a tagset for a simple single-queue device */ 4889 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, 4890 const struct blk_mq_ops *ops, unsigned int queue_depth, 4891 unsigned int set_flags) 4892 { 4893 memset(set, 0, sizeof(*set)); 4894 set->ops = ops; 4895 set->nr_hw_queues = 1; 4896 set->nr_maps = 1; 4897 set->queue_depth = queue_depth; 4898 set->numa_node = NUMA_NO_NODE; 4899 set->flags = set_flags; 4900 return blk_mq_alloc_tag_set(set); 4901 } 4902 EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); 4903 4904 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 4905 { 4906 int i, j; 4907 4908 for (i = 0; i < set->nr_hw_queues; i++) 4909 
__blk_mq_free_map_and_rqs(set, i); 4910 4911 if (blk_mq_is_shared_tags(set->flags)) { 4912 blk_mq_free_map_and_rqs(set, set->shared_tags, 4913 BLK_MQ_NO_HCTX_IDX); 4914 } 4915 4916 for (j = 0; j < set->nr_maps; j++) { 4917 kfree(set->map[j].mq_map); 4918 set->map[j].mq_map = NULL; 4919 } 4920 4921 kfree(set->tags); 4922 set->tags = NULL; 4923 4924 srcu_barrier(&set->tags_srcu); 4925 cleanup_srcu_struct(&set->tags_srcu); 4926 if (set->flags & BLK_MQ_F_BLOCKING) { 4927 cleanup_srcu_struct(set->srcu); 4928 kfree(set->srcu); 4929 } 4930 } 4931 EXPORT_SYMBOL(blk_mq_free_tag_set); 4932 4933 struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, 4934 struct elevator_tags *et, 4935 unsigned int nr) 4936 { 4937 struct blk_mq_tag_set *set = q->tag_set; 4938 struct elevator_tags *old_et = NULL; 4939 struct blk_mq_hw_ctx *hctx; 4940 unsigned long i; 4941 4942 blk_mq_quiesce_queue(q); 4943 4944 if (blk_mq_is_shared_tags(set->flags)) { 4945 /* 4946 * Shared tags, for sched tags, we allocate max initially hence 4947 * tags can't grow, see blk_mq_alloc_sched_tags(). 4948 */ 4949 if (q->elevator) 4950 blk_mq_tag_update_sched_shared_tags(q, nr); 4951 else 4952 blk_mq_tag_resize_shared_tags(set, nr); 4953 } else if (!q->elevator) { 4954 /* 4955 * Non-shared hardware tags, nr is already checked from 4956 * queue_requests_store() and tags can't grow. 4957 */ 4958 queue_for_each_hw_ctx(q, hctx, i) { 4959 if (!hctx->tags) 4960 continue; 4961 sbitmap_queue_resize(&hctx->tags->bitmap_tags, 4962 nr - hctx->tags->nr_reserved_tags); 4963 } 4964 } else if (nr <= q->elevator->et->nr_requests) { 4965 /* Non-shared sched tags, and tags don't grow. */ 4966 queue_for_each_hw_ctx(q, hctx, i) { 4967 if (!hctx->sched_tags) 4968 continue; 4969 sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, 4970 nr - hctx->sched_tags->nr_reserved_tags); 4971 } 4972 } else { 4973 /* Non-shared sched tags, and tags grow */ 4974 queue_for_each_hw_ctx(q, hctx, i) 4975 hctx->sched_tags = et->tags[i]; 4976 old_et = q->elevator->et; 4977 q->elevator->et = et; 4978 } 4979 4980 q->nr_requests = nr; 4981 if (q->elevator && q->elevator->type->ops.depth_updated) 4982 q->elevator->type->ops.depth_updated(q); 4983 4984 blk_mq_unquiesce_queue(q); 4985 return old_et; 4986 } 4987 4988 /* 4989 * Switch back to the elevator type stored in the xarray. 4990 */ 4991 static void blk_mq_elv_switch_back(struct request_queue *q, 4992 struct xarray *elv_tbl, struct xarray *et_tbl) 4993 { 4994 struct elevator_type *e = xa_load(elv_tbl, q->id); 4995 struct elevator_tags *t = xa_load(et_tbl, q->id); 4996 4997 /* The elv_update_nr_hw_queues unfreezes the queue. */ 4998 elv_update_nr_hw_queues(q, e, t); 4999 5000 /* Drop the reference acquired in blk_mq_elv_switch_none. */ 5001 if (e) 5002 elevator_put(e); 5003 } 5004 5005 /* 5006 * Stores elevator type in xarray and set current elevator to none. It uses 5007 * q->id as an index to store the elevator type into the xarray. 5008 */ 5009 static int blk_mq_elv_switch_none(struct request_queue *q, 5010 struct xarray *elv_tbl) 5011 { 5012 int ret = 0; 5013 5014 lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); 5015 5016 /* 5017 * Accessing q->elevator without holding q->elevator_lock is safe here 5018 * because we're called from nr_hw_queue update which is protected by 5019 * set->update_nr_hwq_lock in the writer context. So, scheduler update/ 5020 * switch code (which acquires the same lock in the reader context) 5021 * can't run concurrently. 
5022 */ 5023 if (q->elevator) { 5024 5025 ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); 5026 if (WARN_ON_ONCE(ret)) 5027 return ret; 5028 5029 /* 5030 * Before we switch elevator to 'none', take a reference to 5031 * the elevator module so that while nr_hw_queue update is 5032 * running, no one can remove elevator module. We'd put the 5033 * reference to elevator module later when we switch back 5034 * elevator. 5035 */ 5036 __elevator_get(q->elevator->type); 5037 5038 elevator_set_none(q); 5039 } 5040 return ret; 5041 } 5042 5043 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 5044 int nr_hw_queues) 5045 { 5046 struct request_queue *q; 5047 int prev_nr_hw_queues = set->nr_hw_queues; 5048 unsigned int memflags; 5049 int i; 5050 struct xarray elv_tbl, et_tbl; 5051 bool queues_frozen = false; 5052 5053 lockdep_assert_held(&set->tag_list_lock); 5054 5055 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) 5056 nr_hw_queues = nr_cpu_ids; 5057 if (nr_hw_queues < 1) 5058 return; 5059 if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) 5060 return; 5061 5062 memflags = memalloc_noio_save(); 5063 5064 xa_init(&et_tbl); 5065 if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) 5066 goto out_memalloc_restore; 5067 5068 xa_init(&elv_tbl); 5069 5070 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5071 blk_mq_debugfs_unregister_hctxs(q); 5072 blk_mq_sysfs_unregister_hctxs(q); 5073 } 5074 5075 /* 5076 * Switch IO scheduler to 'none', cleaning up the data associated 5077 * with the previous scheduler. We will switch back once we are done 5078 * updating the new sw to hw queue mappings. 5079 */ 5080 list_for_each_entry(q, &set->tag_list, tag_set_list) 5081 if (blk_mq_elv_switch_none(q, &elv_tbl)) 5082 goto switch_back; 5083 5084 list_for_each_entry(q, &set->tag_list, tag_set_list) 5085 blk_mq_freeze_queue_nomemsave(q); 5086 queues_frozen = true; 5087 if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) 5088 goto switch_back; 5089 5090 fallback: 5091 blk_mq_update_queue_map(set); 5092 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5093 __blk_mq_realloc_hw_ctxs(set, q); 5094 5095 if (q->nr_hw_queues != set->nr_hw_queues) { 5096 int i = prev_nr_hw_queues; 5097 5098 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 5099 nr_hw_queues, prev_nr_hw_queues); 5100 for (; i < set->nr_hw_queues; i++) 5101 __blk_mq_free_map_and_rqs(set, i); 5102 5103 set->nr_hw_queues = prev_nr_hw_queues; 5104 goto fallback; 5105 } 5106 blk_mq_map_swqueue(q); 5107 } 5108 switch_back: 5109 /* The blk_mq_elv_switch_back unfreezes queue for us. */ 5110 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5111 /* switch_back expects queue to be frozen */ 5112 if (!queues_frozen) 5113 blk_mq_freeze_queue_nomemsave(q); 5114 blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); 5115 } 5116 5117 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5118 blk_mq_sysfs_register_hctxs(q); 5119 blk_mq_debugfs_register_hctxs(q); 5120 5121 blk_mq_remove_hw_queues_cpuhp(q); 5122 blk_mq_add_hw_queues_cpuhp(q); 5123 } 5124 5125 xa_destroy(&elv_tbl); 5126 xa_destroy(&et_tbl); 5127 out_memalloc_restore: 5128 memalloc_noio_restore(memflags); 5129 5130 /* Free the excess tags when nr_hw_queues shrink. 
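	 * Indices from the new nr_hw_queues up to the previous value are no
	 * longer reachable after the update above.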
*/ 5131 for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) 5132 __blk_mq_free_map_and_rqs(set, i); 5133 } 5134 5135 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 5136 { 5137 down_write(&set->update_nr_hwq_lock); 5138 mutex_lock(&set->tag_list_lock); 5139 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 5140 mutex_unlock(&set->tag_list_lock); 5141 up_write(&set->update_nr_hwq_lock); 5142 } 5143 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 5144 5145 static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 5146 struct io_comp_batch *iob, unsigned int flags) 5147 { 5148 long state = get_current_state(); 5149 int ret; 5150 5151 do { 5152 ret = q->mq_ops->poll(hctx, iob); 5153 if (ret > 0) { 5154 __set_current_state(TASK_RUNNING); 5155 return ret; 5156 } 5157 5158 if (signal_pending_state(state, current)) 5159 __set_current_state(TASK_RUNNING); 5160 if (task_is_running(current)) 5161 return 1; 5162 5163 if (ret < 0 || (flags & BLK_POLL_ONESHOT)) 5164 break; 5165 cpu_relax(); 5166 } while (!need_resched()); 5167 5168 __set_current_state(TASK_RUNNING); 5169 return 0; 5170 } 5171 5172 int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, 5173 struct io_comp_batch *iob, unsigned int flags) 5174 { 5175 if (!blk_mq_can_poll(q)) 5176 return 0; 5177 return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); 5178 } 5179 5180 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, 5181 unsigned int poll_flags) 5182 { 5183 struct request_queue *q = rq->q; 5184 int ret; 5185 5186 if (!blk_rq_is_poll(rq)) 5187 return 0; 5188 if (!percpu_ref_tryget(&q->q_usage_counter)) 5189 return 0; 5190 5191 ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); 5192 blk_queue_exit(q); 5193 5194 return ret; 5195 } 5196 EXPORT_SYMBOL_GPL(blk_rq_poll); 5197 5198 unsigned int blk_mq_rq_cpu(struct request *rq) 5199 { 5200 return rq->mq_ctx->cpu; 5201 } 5202 EXPORT_SYMBOL(blk_mq_rq_cpu); 5203 5204 void blk_mq_cancel_work_sync(struct request_queue *q) 5205 { 5206 struct blk_mq_hw_ctx *hctx; 5207 unsigned long i; 5208 5209 cancel_delayed_work_sync(&q->requeue_work); 5210 5211 queue_for_each_hw_ctx(q, hctx, i) 5212 cancel_delayed_work_sync(&hctx->run_work); 5213 } 5214 5215 static int __init blk_mq_init(void) 5216 { 5217 int i; 5218 5219 for_each_possible_cpu(i) 5220 init_llist_head(&per_cpu(blk_cpu_done, i)); 5221 for_each_possible_cpu(i) 5222 INIT_CSD(&per_cpu(blk_cpu_csd, i), 5223 __blk_mq_complete_request_remote, NULL); 5224 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); 5225 5226 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, 5227 "block/softirq:dead", NULL, 5228 blk_softirq_cpu_dead); 5229 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 5230 blk_mq_hctx_notify_dead); 5231 cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", 5232 blk_mq_hctx_notify_online, 5233 blk_mq_hctx_notify_offline); 5234 return 0; 5235 } 5236 subsys_initcall(blk_mq_init); 5237
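/*
 * Illustrative usage sketch (not compiled as part of this file): a minimal
 * outline of how a simple single-queue driver typically consumes the tag set
 * and disk allocation helpers above. All mydrv_* names are hypothetical and
 * error handling is trimmed; the fragments assume a driver probe/remove path.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		// hand the request to the hardware, then complete it
 *		blk_mq_end_request(rq, BLK_STS_OK);
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *	};
 *
 *	// probe: one hw queue, depth 128, then attach a gendisk
 *	ret = blk_mq_alloc_sq_tag_set(&mydrv->tag_set, &mydrv_mq_ops, 128, 0);
 *	if (ret)
 *		return ret;
 *	disk = blk_mq_alloc_disk(&mydrv->tag_set, NULL, mydrv);
 *	if (IS_ERR(disk)) {
 *		blk_mq_free_tag_set(&mydrv->tag_set);
 *		return PTR_ERR(disk);
 *	}
 *
 *	// remove: tear down in the reverse order
 *	del_gendisk(disk);
 *	put_disk(disk);
 *	blk_mq_free_tag_set(&mydrv->tag_set);
 */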