// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>

#include <trace/events/block.h>

#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
static DEFINE_MUTEX(blk_mq_cpuhp_lock);

static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
		blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
		struct list_head *list);
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
		struct io_comp_batch *iob, unsigned int flags);

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
	struct mq_inflight *mi = priv;

	if (rq->rq_flags & RQF_IO_STAT &&
	    (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
				   &mi);
	inflight[READ] = mi.inflight[READ];
	inflight[WRITE] = mi.inflight[WRITE];
}

#ifdef CONFIG_LOCKDEP
static bool blk_freeze_set_owner(struct request_queue *q,
				 struct task_struct *owner)
{
	if (!owner)
		return false;

	if (!q->mq_freeze_depth) {
		q->mq_freeze_owner = owner;
		q->mq_freeze_owner_depth = 1;
		q->mq_freeze_disk_dead = !q->disk ||
			test_bit(GD_DEAD, &q->disk->state) ||
			!blk_queue_registered(q);
		q->mq_freeze_queue_dying = blk_queue_dying(q);
		return true;
	}

	if (owner == q->mq_freeze_owner)
		q->mq_freeze_owner_depth += 1;
	return false;
}

/* verify the last unfreeze in owner context */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
	if (q->mq_freeze_owner != current)
		return false;
	if (--q->mq_freeze_owner_depth == 0) {
		q->mq_freeze_owner = NULL;
		return true;
	}
	return false;
}

#else

static bool blk_freeze_set_owner(struct request_queue *q,
				 struct task_struct *owner)
{
	return false;
}

static bool blk_unfreeze_check_owner(struct request_queue *q)
{
	return false;
}
#endif

bool __blk_freeze_queue_start(struct request_queue *q,
			      struct task_struct *owner)
{
	bool freeze;

	mutex_lock(&q->mq_freeze_lock);
	freeze = blk_freeze_set_owner(q, owner);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}

	return freeze;
}

void blk_freeze_queue_start(struct request_queue *q)
{
	if (__blk_freeze_queue_start(q, current))
		blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
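/*
 * Illustrative sketch only, not a documented API contract: a caller that
 * needs all outstanding I/O drained before changing queue state can combine
 * the helpers above roughly as follows (blk_mq_freeze_queue_nomemsave()
 * below wraps the first two steps):
 *
 *	blk_freeze_queue_start(q);
 *	blk_mq_freeze_queue_wait(q);
 *	... update queue state while no new request can enter ...
 *	blk_mq_unfreeze_queue_nomemrestore(q);
 */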
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
				  percpu_ref_is_zero(&q->q_usage_counter),
				  timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);

bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	bool unfreeze;

	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	unfreeze = blk_unfreeze_check_owner(q);
	mutex_unlock(&q->mq_freeze_lock);

	return unfreeze;
}

void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
{
	if (__blk_mq_unfreeze_queue(q, false))
		blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);

/*
 * non_owner variant of blk_freeze_queue_start
 *
 * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
 * by the same task. This is fragile and should not be used if at all
 * possible.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
	__blk_freeze_queue_start(q, NULL);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);

/* non_owner variant of blk_mq_unfreeze_queue */
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set. This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
	if (set->flags & BLK_MQ_F_BLOCKING)
		synchronize_srcu(set->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
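/*
 * Illustrative sketch only: drivers typically bracket work that must not
 * race with ->queue_rq() like this:
 *
 *	blk_mq_quiesce_queue(q);
 *	... no new dispatch can start here ...
 *	blk_mq_unquiesce_queue(q);
 *
 * Unlike a queue freeze, quiescing does not wait for already-issued
 * requests to complete; it only guarantees that no ->queue_rq() call is
 * still running once blk_mq_quiesce_queue() has returned.
 */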
/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback from being invoked. Once this function has returned, no
 * dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);
	/* nothing to wait for non-mq queues */
	if (queue_is_mq(q))
		blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers the queue to the state it was in before it was
 * quiesced by blk_mq_quiesce_queue().
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;
	bool run_queue = false;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
		;
	} else if (!--q->quiesce_depth) {
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
		run_queue = true;
	}
	spin_unlock_irqrestore(&q->queue_lock, flags);

	/* dispatch requests which are inserted during quiescing */
	if (run_queue)
		blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_quiesce_queue_nowait(q);
	}
	mutex_unlock(&set->tag_list_lock);

	blk_mq_wait_quiesce_done(set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);

void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_unquiesce_queue(q);
	}
	mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = BLK_MQ_NO_TAG;
	rq->internal_tag = BLK_MQ_NO_TAG;
	rq->start_time_ns = blk_time_get_ns();
	blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	if (blk_queue_rq_alloc_time(rq->q))
		rq->alloc_time_ns = alloc_time_ns;
	else
		rq->alloc_time_ns = 0;
#endif
}

static inline void blk_mq_bio_issue_init(struct request_queue *q,
					 struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags))
		bio->issue_time_ns = blk_time_get_ns();
#endif
}
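/*
 * Initialize a request taken from tags->static_rqs[] for a freshly
 * allocated tag.  The caller holds a queue usage reference; elevator
 * state is only set up when the request is managed by an I/O scheduler.
 */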
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		struct blk_mq_tags *tags, unsigned int tag)
{
	struct blk_mq_ctx *ctx = data->ctx;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	struct request_queue *q = data->q;
	struct request *rq = tags->static_rqs[tag];

	rq->q = q;
	rq->mq_ctx = ctx;
	rq->mq_hctx = hctx;
	rq->cmd_flags = data->cmd_flags;

	if (data->flags & BLK_MQ_REQ_PM)
		data->rq_flags |= RQF_PM;
	rq->rq_flags = data->rq_flags;

	if (data->rq_flags & RQF_SCHED_TAGS) {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	} else {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	}
	rq->timeout = 0;

	rq->part = NULL;
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
	rq->nr_integrity_segments = 0;
	rq->end_io = NULL;
	rq->end_io_data = NULL;

	blk_crypto_rq_set_defaults(rq);
	INIT_LIST_HEAD(&rq->queuelist);
	/* tag was already set */
	WRITE_ONCE(rq->deadline, 0);
	req_ref_set(rq, 1);

	if (rq->rq_flags & RQF_USE_SCHED) {
		struct elevator_queue *e = data->q->elevator;

		INIT_HLIST_NODE(&rq->hash);
		RB_CLEAR_NODE(&rq->rb_node);

		if (e->type->ops.prepare_request)
			e->type->ops.prepare_request(rq);
	}

	return rq;
}

static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
{
	unsigned int tag, tag_offset;
	struct blk_mq_tags *tags;
	struct request *rq;
	unsigned long tag_mask;
	int i, nr = 0;

	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
	if (unlikely(!tag_mask))
		return NULL;

	tags = blk_mq_tags_from_data(data);
	for (i = 0; tag_mask; i++) {
		if (!(tag_mask & (1UL << i)))
			continue;
		tag = tag_offset + i;
		prefetch(tags->static_rqs[tag]);
		tag_mask &= ~(1UL << i);
		rq = blk_mq_rq_ctx_init(data, tags, tag);
		rq_list_add_head(data->cached_rqs, rq);
		nr++;
	}
	if (!(data->rq_flags & RQF_SCHED_TAGS))
		blk_mq_add_active_requests(data->hctx, nr);
	/* caller already holds a reference, add for remainder */
	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
	data->nr_tags -= nr;

	return rq_list_pop(data->cached_rqs);
}
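/*
 * Slow path for request allocation: pick a software and hardware context,
 * apply any elevator depth limit, try a batched tag allocation first and
 * fall back to a single tag, retrying if the hctx went inactive while the
 * allocation slept.
 */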
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
	struct request_queue *q = data->q;
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int tag;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = blk_time_get_ns();

	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

retry:
	data->ctx = blk_mq_get_ctx(q);
	data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);

	if (q->elevator) {
		/*
		 * All requests use scheduler tags when an I/O scheduler is
		 * enabled for the queue.
		 */
		data->rq_flags |= RQF_SCHED_TAGS;

		/*
		 * Flush/passthrough requests are special and go directly to
		 * the dispatch list.
		 */
		if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
		    !blk_op_is_passthrough(data->cmd_flags)) {
			struct elevator_mq_ops *ops = &q->elevator->type->ops;

			WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);

			data->rq_flags |= RQF_USE_SCHED;
			if (ops->limit_depth)
				ops->limit_depth(data->cmd_flags, data);
		}
	} else {
		blk_mq_tag_busy(data->hctx);
	}

	if (data->flags & BLK_MQ_REQ_RESERVED)
		data->rq_flags |= RQF_RESV;

	/*
	 * Try batched alloc if we want more than 1 tag.
	 */
	if (data->nr_tags > 1) {
		rq = __blk_mq_alloc_requests_batch(data);
		if (rq) {
			blk_mq_rq_time_init(rq, alloc_time_ns);
			return rq;
		}
		data->nr_tags = 1;
	}

	/*
	 * Waiting allocations only fail because of an inactive hctx.  In that
	 * case just retry the hctx assignment and tag allocation as CPU
	 * hotplug should have migrated us to an online CPU by now.
	 */
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
			return NULL;
		/*
		 * Give up the CPU and sleep for a random short time to
		 * ensure that threads using a realtime scheduling class
		 * are migrated off the CPU, and thus off the hctx that
		 * is going away.
		 */
		msleep(3);
		goto retry;
	}

	if (!(data->rq_flags & RQF_SCHED_TAGS))
		blk_mq_inc_active_requests(data->hctx);
	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
	blk_mq_rq_time_init(rq, alloc_time_ns);
	return rq;
}

static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
					    struct blk_plug *plug,
					    blk_opf_t opf,
					    blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q = q,
		.flags = flags,
		.shallow_depth = 0,
		.cmd_flags = opf,
		.rq_flags = 0,
		.nr_tags = plug->nr_ios,
		.cached_rqs = &plug->cached_rqs,
		.ctx = NULL,
		.hctx = NULL
	};
	struct request *rq;

	if (blk_queue_enter(q, flags))
		return NULL;

	plug->nr_ios = 1;

	rq = __blk_mq_alloc_requests(&data);
	if (unlikely(!rq))
		blk_queue_exit(q);
	return rq;
}

static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
						   blk_opf_t opf,
						   blk_mq_req_flags_t flags)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug)
		return NULL;

	if (rq_list_empty(&plug->cached_rqs)) {
		if (plug->nr_ios == 1)
			return NULL;
		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
		if (!rq)
			return NULL;
	} else {
		rq = rq_list_peek(&plug->cached_rqs);
		if (!rq || rq->q != q)
			return NULL;

		if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
			return NULL;
		if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
			return NULL;

		rq_list_pop(&plug->cached_rqs);
		blk_mq_rq_time_init(rq, blk_time_get_ns());
	}

	rq->cmd_flags = opf;
	INIT_LIST_HEAD(&rq->queuelist);
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags)
{
	struct request *rq;

	rq = blk_mq_alloc_cached_request(q, opf, flags);
	if (!rq) {
		struct blk_mq_alloc_data data = {
			.q = q,
			.flags = flags,
			.shallow_depth = 0,
			.cmd_flags = opf,
			.rq_flags = 0,
			.nr_tags = 1,
			.cached_rqs = NULL,
			.ctx = NULL,
			.hctx = NULL
		};
		int ret;

		ret = blk_queue_enter(q, flags);
		if (ret)
			return ERR_PTR(ret);

		rq = __blk_mq_alloc_requests(&data);
		if (!rq)
			goto out_queue_exit;
	}
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
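/*
 * Illustrative sketch only: a passthrough submitter would normally pair
 * blk_mq_alloc_request() with blk_execute_rq() and blk_mq_free_request(),
 * e.g.
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	... attach the driver payload to rq ...
 *	status = blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 */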
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data data = {
		.q = q,
		.flags = flags,
		.shallow_depth = 0,
		.cmd_flags = opf,
		.rq_flags = 0,
		.nr_tags = 1,
		.cached_rqs = NULL,
		.ctx = NULL,
		.hctx = NULL
	};
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int cpu;
	unsigned int tag;
	int ret;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = blk_time_get_ns();

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context. No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
	    WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	ret = -EXDEV;
	data.hctx = xa_load(&q->hctx_table, hctx_idx);
	if (!blk_mq_hw_queue_mapped(data.hctx))
		goto out_queue_exit;
	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
	if (cpu >= nr_cpu_ids)
		goto out_queue_exit;
	data.ctx = __blk_mq_get_ctx(q, cpu);

	if (q->elevator)
		data.rq_flags |= RQF_SCHED_TAGS;
	else
		blk_mq_tag_busy(data.hctx);

	if (flags & BLK_MQ_REQ_RESERVED)
		data.rq_flags |= RQF_RESV;

	ret = -EWOULDBLOCK;
	tag = blk_mq_get_tag(&data);
	if (tag == BLK_MQ_NO_TAG)
		goto out_queue_exit;
	if (!(data.rq_flags & RQF_SCHED_TAGS))
		blk_mq_inc_active_requests(data.hctx);
	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
	blk_mq_rq_time_init(rq, alloc_time_ns);
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;

out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void blk_mq_finish_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_zone_finish_request(rq);

	if (rq->rq_flags & RQF_USE_SCHED) {
		q->elevator->type->ops.finish_request(rq);
		/*
		 * For postflush request that may need to be
		 * completed twice, we should clear this flag
		 * to avoid double finish_request() on the rq.
773 */ 774 rq->rq_flags &= ~RQF_USE_SCHED; 775 } 776 } 777 778 static void __blk_mq_free_request(struct request *rq) 779 { 780 struct request_queue *q = rq->q; 781 struct blk_mq_ctx *ctx = rq->mq_ctx; 782 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 783 const int sched_tag = rq->internal_tag; 784 785 blk_crypto_free_request(rq); 786 blk_pm_mark_last_busy(rq); 787 rq->mq_hctx = NULL; 788 789 if (rq->tag != BLK_MQ_NO_TAG) { 790 blk_mq_dec_active_requests(hctx); 791 blk_mq_put_tag(hctx->tags, ctx, rq->tag); 792 } 793 if (sched_tag != BLK_MQ_NO_TAG) 794 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); 795 blk_mq_sched_restart(hctx); 796 blk_queue_exit(q); 797 } 798 799 void blk_mq_free_request(struct request *rq) 800 { 801 struct request_queue *q = rq->q; 802 803 blk_mq_finish_request(rq); 804 805 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 806 laptop_io_completion(q->disk->bdi); 807 808 rq_qos_done(q, rq); 809 810 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 811 if (req_ref_put_and_test(rq)) 812 __blk_mq_free_request(rq); 813 } 814 EXPORT_SYMBOL_GPL(blk_mq_free_request); 815 816 void blk_mq_free_plug_rqs(struct blk_plug *plug) 817 { 818 struct request *rq; 819 820 while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) 821 blk_mq_free_request(rq); 822 } 823 824 void blk_dump_rq_flags(struct request *rq, char *msg) 825 { 826 printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, 827 rq->q->disk ? rq->q->disk->disk_name : "?", 828 (__force unsigned long long) rq->cmd_flags); 829 830 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 831 (unsigned long long)blk_rq_pos(rq), 832 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 833 printk(KERN_INFO " bio %p, biotail %p, len %u\n", 834 rq->bio, rq->biotail, blk_rq_bytes(rq)); 835 } 836 EXPORT_SYMBOL(blk_dump_rq_flags); 837 838 static void blk_account_io_completion(struct request *req, unsigned int bytes) 839 { 840 if (req->rq_flags & RQF_IO_STAT) { 841 const int sgrp = op_stat_group(req_op(req)); 842 843 part_stat_lock(); 844 part_stat_add(req->part, sectors[sgrp], bytes >> 9); 845 part_stat_unlock(); 846 } 847 } 848 849 static void blk_print_req_error(struct request *req, blk_status_t status) 850 { 851 printk_ratelimited(KERN_ERR 852 "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " 853 "phys_seg %u prio class %u\n", 854 blk_status_to_str(status), 855 req->q->disk ? req->q->disk->disk_name : "?", 856 blk_rq_pos(req), (__force u32)req_op(req), 857 blk_op_str(req_op(req)), 858 (__force u32)(req->cmd_flags & ~REQ_OP_MASK), 859 req->nr_phys_segments, 860 IOPRIO_PRIO_CLASS(req_get_ioprio(req))); 861 } 862 863 /* 864 * Fully end IO on a request. Does not support partial completions, or 865 * errors. 866 */ 867 static void blk_complete_request(struct request *req) 868 { 869 const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; 870 int total_bytes = blk_rq_bytes(req); 871 struct bio *bio = req->bio; 872 873 trace_block_rq_complete(req, BLK_STS_OK, total_bytes); 874 875 if (!bio) 876 return; 877 878 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) 879 blk_integrity_complete(req, total_bytes); 880 881 /* 882 * Upper layers may call blk_crypto_evict_key() anytime after the last 883 * bio_endio(). Therefore, the keyslot must be released before that. 
884 */ 885 blk_crypto_rq_put_keyslot(req); 886 887 blk_account_io_completion(req, total_bytes); 888 889 do { 890 struct bio *next = bio->bi_next; 891 892 /* Completion has already been traced */ 893 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 894 895 if (blk_req_bio_is_zone_append(req, bio)) 896 blk_zone_append_update_request_bio(req, bio); 897 898 if (!is_flush) 899 bio_endio(bio); 900 bio = next; 901 } while (bio); 902 903 /* 904 * Reset counters so that the request stacking driver 905 * can find how many bytes remain in the request 906 * later. 907 */ 908 if (!req->end_io) { 909 req->bio = NULL; 910 req->__data_len = 0; 911 } 912 } 913 914 /** 915 * blk_update_request - Complete multiple bytes without completing the request 916 * @req: the request being processed 917 * @error: block status code 918 * @nr_bytes: number of bytes to complete for @req 919 * 920 * Description: 921 * Ends I/O on a number of bytes attached to @req, but doesn't complete 922 * the request structure even if @req doesn't have leftover. 923 * If @req has leftover, sets it up for the next range of segments. 924 * 925 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 926 * %false return from this function. 927 * 928 * Note: 929 * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function 930 * except in the consistency check at the end of this function. 931 * 932 * Return: 933 * %false - this request doesn't have any more data 934 * %true - this request has more data 935 **/ 936 bool blk_update_request(struct request *req, blk_status_t error, 937 unsigned int nr_bytes) 938 { 939 bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; 940 bool quiet = req->rq_flags & RQF_QUIET; 941 int total_bytes; 942 943 trace_block_rq_complete(req, error, nr_bytes); 944 945 if (!req->bio) 946 return false; 947 948 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && 949 error == BLK_STS_OK) 950 blk_integrity_complete(req, nr_bytes); 951 952 /* 953 * Upper layers may call blk_crypto_evict_key() anytime after the last 954 * bio_endio(). Therefore, the keyslot must be released before that. 955 */ 956 if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) 957 __blk_crypto_rq_put_keyslot(req); 958 959 if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && 960 !test_bit(GD_DEAD, &req->q->disk->state)) { 961 blk_print_req_error(req, error); 962 trace_block_rq_error(req, error, nr_bytes); 963 } 964 965 blk_account_io_completion(req, nr_bytes); 966 967 total_bytes = 0; 968 while (req->bio) { 969 struct bio *bio = req->bio; 970 unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); 971 972 if (unlikely(error)) 973 bio->bi_status = error; 974 975 if (bio_bytes == bio->bi_iter.bi_size) { 976 req->bio = bio->bi_next; 977 } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { 978 /* 979 * Partial zone append completions cannot be supported 980 * as the BIO fragments may end up not being written 981 * sequentially. 
982 */ 983 bio->bi_status = BLK_STS_IOERR; 984 } 985 986 /* Completion has already been traced */ 987 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 988 if (unlikely(quiet)) 989 bio_set_flag(bio, BIO_QUIET); 990 991 bio_advance(bio, bio_bytes); 992 993 /* Don't actually finish bio if it's part of flush sequence */ 994 if (!bio->bi_iter.bi_size) { 995 if (blk_req_bio_is_zone_append(req, bio)) 996 blk_zone_append_update_request_bio(req, bio); 997 if (!is_flush) 998 bio_endio(bio); 999 } 1000 1001 total_bytes += bio_bytes; 1002 nr_bytes -= bio_bytes; 1003 1004 if (!nr_bytes) 1005 break; 1006 } 1007 1008 /* 1009 * completely done 1010 */ 1011 if (!req->bio) { 1012 /* 1013 * Reset counters so that the request stacking driver 1014 * can find how many bytes remain in the request 1015 * later. 1016 */ 1017 req->__data_len = 0; 1018 return false; 1019 } 1020 1021 req->__data_len -= total_bytes; 1022 1023 /* update sector only for requests with clear definition of sector */ 1024 if (!blk_rq_is_passthrough(req)) 1025 req->__sector += total_bytes >> 9; 1026 1027 /* mixed attributes always follow the first bio */ 1028 if (req->rq_flags & RQF_MIXED_MERGE) { 1029 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1030 req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; 1031 } 1032 1033 if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { 1034 /* 1035 * If total number of sectors is less than the first segment 1036 * size, something has gone terribly wrong. 1037 */ 1038 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 1039 blk_dump_rq_flags(req, "request botched"); 1040 req->__data_len = blk_rq_cur_bytes(req); 1041 } 1042 1043 /* recalculate the number of segments */ 1044 req->nr_phys_segments = blk_recalc_rq_segments(req); 1045 } 1046 1047 return true; 1048 } 1049 EXPORT_SYMBOL_GPL(blk_update_request); 1050 1051 static inline void blk_account_io_done(struct request *req, u64 now) 1052 { 1053 trace_block_io_done(req); 1054 1055 /* 1056 * Account IO completion. flush_rq isn't accounted as a 1057 * normal IO on queueing nor completion. Accounting the 1058 * containing request is enough. 1059 */ 1060 if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { 1061 const int sgrp = op_stat_group(req_op(req)); 1062 1063 part_stat_lock(); 1064 update_io_ticks(req->part, jiffies, true); 1065 part_stat_inc(req->part, ios[sgrp]); 1066 part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); 1067 part_stat_local_dec(req->part, 1068 in_flight[op_is_write(req_op(req))]); 1069 part_stat_unlock(); 1070 } 1071 } 1072 1073 static inline bool blk_rq_passthrough_stats(struct request *req) 1074 { 1075 struct bio *bio = req->bio; 1076 1077 if (!blk_queue_passthrough_stat(req->q)) 1078 return false; 1079 1080 /* Requests without a bio do not transfer data. */ 1081 if (!bio) 1082 return false; 1083 1084 /* 1085 * Stats are accumulated in the bdev, so must have one attached to a 1086 * bio to track stats. Most drivers do not set the bdev for passthrough 1087 * requests, but nvme is one that will set it. 1088 */ 1089 if (!bio->bi_bdev) 1090 return false; 1091 1092 /* 1093 * We don't know what a passthrough command does, but we know the 1094 * payload size and data direction. Ensuring the size is aligned to the 1095 * block size filters out most commands with payloads that don't 1096 * represent sector access. 
1097 */ 1098 if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) 1099 return false; 1100 return true; 1101 } 1102 1103 static inline void blk_account_io_start(struct request *req) 1104 { 1105 trace_block_io_start(req); 1106 1107 if (!blk_queue_io_stat(req->q)) 1108 return; 1109 if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) 1110 return; 1111 1112 req->rq_flags |= RQF_IO_STAT; 1113 req->start_time_ns = blk_time_get_ns(); 1114 1115 /* 1116 * All non-passthrough requests are created from a bio with one 1117 * exception: when a flush command that is part of a flush sequence 1118 * generated by the state machine in blk-flush.c is cloned onto the 1119 * lower device by dm-multipath we can get here without a bio. 1120 */ 1121 if (req->bio) 1122 req->part = req->bio->bi_bdev; 1123 else 1124 req->part = req->q->disk->part0; 1125 1126 part_stat_lock(); 1127 update_io_ticks(req->part, jiffies, false); 1128 part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); 1129 part_stat_unlock(); 1130 } 1131 1132 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) 1133 { 1134 if (rq->rq_flags & RQF_STATS) 1135 blk_stat_add(rq, now); 1136 1137 blk_mq_sched_completed_request(rq, now); 1138 blk_account_io_done(rq, now); 1139 } 1140 1141 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 1142 { 1143 if (blk_mq_need_time_stamp(rq)) 1144 __blk_mq_end_request_acct(rq, blk_time_get_ns()); 1145 1146 blk_mq_finish_request(rq); 1147 1148 if (rq->end_io) { 1149 rq_qos_done(rq->q, rq); 1150 if (rq->end_io(rq, error) == RQ_END_IO_FREE) 1151 blk_mq_free_request(rq); 1152 } else { 1153 blk_mq_free_request(rq); 1154 } 1155 } 1156 EXPORT_SYMBOL(__blk_mq_end_request); 1157 1158 void blk_mq_end_request(struct request *rq, blk_status_t error) 1159 { 1160 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 1161 BUG(); 1162 __blk_mq_end_request(rq, error); 1163 } 1164 EXPORT_SYMBOL(blk_mq_end_request); 1165 1166 #define TAG_COMP_BATCH 32 1167 1168 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, 1169 int *tag_array, int nr_tags) 1170 { 1171 struct request_queue *q = hctx->queue; 1172 1173 blk_mq_sub_active_requests(hctx, nr_tags); 1174 1175 blk_mq_put_tags(hctx->tags, tag_array, nr_tags); 1176 percpu_ref_put_many(&q->q_usage_counter, nr_tags); 1177 } 1178 1179 void blk_mq_end_request_batch(struct io_comp_batch *iob) 1180 { 1181 int tags[TAG_COMP_BATCH], nr_tags = 0; 1182 struct blk_mq_hw_ctx *cur_hctx = NULL; 1183 struct request *rq; 1184 u64 now = 0; 1185 1186 if (iob->need_ts) 1187 now = blk_time_get_ns(); 1188 1189 while ((rq = rq_list_pop(&iob->req_list)) != NULL) { 1190 prefetch(rq->bio); 1191 prefetch(rq->rq_next); 1192 1193 blk_complete_request(rq); 1194 if (iob->need_ts) 1195 __blk_mq_end_request_acct(rq, now); 1196 1197 blk_mq_finish_request(rq); 1198 1199 rq_qos_done(rq->q, rq); 1200 1201 /* 1202 * If end_io handler returns NONE, then it still has 1203 * ownership of the request. 
1204 */ 1205 if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) 1206 continue; 1207 1208 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 1209 if (!req_ref_put_and_test(rq)) 1210 continue; 1211 1212 blk_crypto_free_request(rq); 1213 blk_pm_mark_last_busy(rq); 1214 1215 if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { 1216 if (cur_hctx) 1217 blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); 1218 nr_tags = 0; 1219 cur_hctx = rq->mq_hctx; 1220 } 1221 tags[nr_tags++] = rq->tag; 1222 } 1223 1224 if (nr_tags) 1225 blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); 1226 } 1227 EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); 1228 1229 static void blk_complete_reqs(struct llist_head *list) 1230 { 1231 struct llist_node *entry = llist_reverse_order(llist_del_all(list)); 1232 struct request *rq, *next; 1233 1234 llist_for_each_entry_safe(rq, next, entry, ipi_list) 1235 rq->q->mq_ops->complete(rq); 1236 } 1237 1238 static __latent_entropy void blk_done_softirq(void) 1239 { 1240 blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); 1241 } 1242 1243 static int blk_softirq_cpu_dead(unsigned int cpu) 1244 { 1245 blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); 1246 return 0; 1247 } 1248 1249 static void __blk_mq_complete_request_remote(void *data) 1250 { 1251 __raise_softirq_irqoff(BLOCK_SOFTIRQ); 1252 } 1253 1254 static inline bool blk_mq_complete_need_ipi(struct request *rq) 1255 { 1256 int cpu = raw_smp_processor_id(); 1257 1258 if (!IS_ENABLED(CONFIG_SMP) || 1259 !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) 1260 return false; 1261 /* 1262 * With force threaded interrupts enabled, raising softirq from an SMP 1263 * function call will always result in waking the ksoftirqd thread. 1264 * This is probably worse than completing the request on a different 1265 * cache domain. 1266 */ 1267 if (force_irqthreads()) 1268 return false; 1269 1270 /* same CPU or cache domain and capacity? Complete locally */ 1271 if (cpu == rq->mq_ctx->cpu || 1272 (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && 1273 cpus_share_cache(cpu, rq->mq_ctx->cpu) && 1274 cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) 1275 return false; 1276 1277 /* don't try to IPI to an offline CPU */ 1278 return cpu_online(rq->mq_ctx->cpu); 1279 } 1280 1281 static void blk_mq_complete_send_ipi(struct request *rq) 1282 { 1283 unsigned int cpu; 1284 1285 cpu = rq->mq_ctx->cpu; 1286 if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) 1287 smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); 1288 } 1289 1290 static void blk_mq_raise_softirq(struct request *rq) 1291 { 1292 struct llist_head *list; 1293 1294 preempt_disable(); 1295 list = this_cpu_ptr(&blk_cpu_done); 1296 if (llist_add(&rq->ipi_list, list)) 1297 raise_softirq(BLOCK_SOFTIRQ); 1298 preempt_enable(); 1299 } 1300 1301 bool blk_mq_complete_request_remote(struct request *rq) 1302 { 1303 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); 1304 1305 /* 1306 * For request which hctx has only one ctx mapping, 1307 * or a polled request, always complete locally, 1308 * it's pointless to redirect the completion. 
1309 */ 1310 if ((rq->mq_hctx->nr_ctx == 1 && 1311 rq->mq_ctx->cpu == raw_smp_processor_id()) || 1312 rq->cmd_flags & REQ_POLLED) 1313 return false; 1314 1315 if (blk_mq_complete_need_ipi(rq)) { 1316 blk_mq_complete_send_ipi(rq); 1317 return true; 1318 } 1319 1320 if (rq->q->nr_hw_queues == 1) { 1321 blk_mq_raise_softirq(rq); 1322 return true; 1323 } 1324 return false; 1325 } 1326 EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); 1327 1328 /** 1329 * blk_mq_complete_request - end I/O on a request 1330 * @rq: the request being processed 1331 * 1332 * Description: 1333 * Complete a request by scheduling the ->complete_rq operation. 1334 **/ 1335 void blk_mq_complete_request(struct request *rq) 1336 { 1337 if (!blk_mq_complete_request_remote(rq)) 1338 rq->q->mq_ops->complete(rq); 1339 } 1340 EXPORT_SYMBOL(blk_mq_complete_request); 1341 1342 /** 1343 * blk_mq_start_request - Start processing a request 1344 * @rq: Pointer to request to be started 1345 * 1346 * Function used by device drivers to notify the block layer that a request 1347 * is going to be processed now, so blk layer can do proper initializations 1348 * such as starting the timeout timer. 1349 */ 1350 void blk_mq_start_request(struct request *rq) 1351 { 1352 struct request_queue *q = rq->q; 1353 1354 trace_block_rq_issue(rq); 1355 1356 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && 1357 !blk_rq_is_passthrough(rq)) { 1358 rq->io_start_time_ns = blk_time_get_ns(); 1359 rq->stats_sectors = blk_rq_sectors(rq); 1360 rq->rq_flags |= RQF_STATS; 1361 rq_qos_issue(q, rq); 1362 } 1363 1364 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 1365 1366 blk_add_timer(rq); 1367 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); 1368 rq->mq_hctx->tags->rqs[rq->tag] = rq; 1369 1370 if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) 1371 blk_integrity_prepare(rq); 1372 1373 if (rq->bio && rq->bio->bi_opf & REQ_POLLED) 1374 WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); 1375 } 1376 EXPORT_SYMBOL(blk_mq_start_request); 1377 1378 /* 1379 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple 1380 * queues. This is important for md arrays to benefit from merging 1381 * requests. 1382 */ 1383 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) 1384 { 1385 if (plug->multiple_queues) 1386 return BLK_MAX_REQUEST_COUNT * 2; 1387 return BLK_MAX_REQUEST_COUNT; 1388 } 1389 1390 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 1391 { 1392 struct request *last = rq_list_peek(&plug->mq_list); 1393 1394 if (!plug->rq_count) { 1395 trace_block_plug(rq->q); 1396 } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || 1397 (!blk_queue_nomerges(rq->q) && 1398 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1399 blk_mq_flush_plug_list(plug, false); 1400 last = NULL; 1401 trace_block_plug(rq->q); 1402 } 1403 1404 if (!plug->multiple_queues && last && last->q != rq->q) 1405 plug->multiple_queues = true; 1406 /* 1407 * Any request allocated from sched tags can't be issued to 1408 * ->queue_rqs() directly 1409 */ 1410 if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) 1411 plug->has_elevator = true; 1412 rq_list_add_tail(&plug->mq_list, rq); 1413 plug->rq_count++; 1414 } 1415 1416 /** 1417 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution 1418 * @rq: request to insert 1419 * @at_head: insert request at head or tail of queue 1420 * 1421 * Description: 1422 * Insert a fully prepared request at the back of the I/O scheduler queue 1423 * for execution. 
Don't wait for completion. 1424 * 1425 * Note: 1426 * This function will invoke @done directly if the queue is dead. 1427 */ 1428 void blk_execute_rq_nowait(struct request *rq, bool at_head) 1429 { 1430 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1431 1432 WARN_ON(irqs_disabled()); 1433 WARN_ON(!blk_rq_is_passthrough(rq)); 1434 1435 blk_account_io_start(rq); 1436 1437 if (current->plug && !at_head) { 1438 blk_add_rq_to_plug(current->plug, rq); 1439 return; 1440 } 1441 1442 blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); 1443 blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 1444 } 1445 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 1446 1447 struct blk_rq_wait { 1448 struct completion done; 1449 blk_status_t ret; 1450 }; 1451 1452 static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) 1453 { 1454 struct blk_rq_wait *wait = rq->end_io_data; 1455 1456 wait->ret = ret; 1457 complete(&wait->done); 1458 return RQ_END_IO_NONE; 1459 } 1460 1461 bool blk_rq_is_poll(struct request *rq) 1462 { 1463 if (!rq->mq_hctx) 1464 return false; 1465 if (rq->mq_hctx->type != HCTX_TYPE_POLL) 1466 return false; 1467 return true; 1468 } 1469 EXPORT_SYMBOL_GPL(blk_rq_is_poll); 1470 1471 static void blk_rq_poll_completion(struct request *rq, struct completion *wait) 1472 { 1473 do { 1474 blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); 1475 cond_resched(); 1476 } while (!completion_done(wait)); 1477 } 1478 1479 /** 1480 * blk_execute_rq - insert a request into queue for execution 1481 * @rq: request to insert 1482 * @at_head: insert request at head or tail of queue 1483 * 1484 * Description: 1485 * Insert a fully prepared request at the back of the I/O scheduler queue 1486 * for execution and wait for completion. 1487 * Return: The blk_status_t result provided to blk_mq_end_request(). 1488 */ 1489 blk_status_t blk_execute_rq(struct request *rq, bool at_head) 1490 { 1491 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1492 struct blk_rq_wait wait = { 1493 .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), 1494 }; 1495 1496 WARN_ON(irqs_disabled()); 1497 WARN_ON(!blk_rq_is_passthrough(rq)); 1498 1499 rq->end_io_data = &wait; 1500 rq->end_io = blk_end_sync_rq; 1501 1502 blk_account_io_start(rq); 1503 blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); 1504 blk_mq_run_hw_queue(hctx, false); 1505 1506 if (blk_rq_is_poll(rq)) 1507 blk_rq_poll_completion(rq, &wait.done); 1508 else 1509 blk_wait_io(&wait.done); 1510 1511 return wait.ret; 1512 } 1513 EXPORT_SYMBOL(blk_execute_rq); 1514 1515 static void __blk_mq_requeue_request(struct request *rq) 1516 { 1517 struct request_queue *q = rq->q; 1518 1519 blk_mq_put_driver_tag(rq); 1520 1521 trace_block_rq_requeue(rq); 1522 rq_qos_requeue(q, rq); 1523 1524 if (blk_mq_request_started(rq)) { 1525 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 1526 rq->rq_flags &= ~RQF_TIMED_OUT; 1527 } 1528 } 1529 1530 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 1531 { 1532 struct request_queue *q = rq->q; 1533 unsigned long flags; 1534 1535 __blk_mq_requeue_request(rq); 1536 1537 /* this request will be re-inserted to io scheduler queue */ 1538 blk_mq_sched_requeue_request(rq); 1539 1540 spin_lock_irqsave(&q->requeue_lock, flags); 1541 list_add_tail(&rq->queuelist, &q->requeue_list); 1542 spin_unlock_irqrestore(&q->requeue_lock, flags); 1543 1544 if (kick_requeue_list) 1545 blk_mq_kick_requeue_list(q); 1546 } 1547 EXPORT_SYMBOL(blk_mq_requeue_request); 1548 1549 static void blk_mq_requeue_work(struct work_struct *work) 1550 { 1551 struct request_queue *q = 1552 container_of(work, struct request_queue, requeue_work.work); 1553 LIST_HEAD(rq_list); 1554 LIST_HEAD(flush_list); 1555 struct request *rq; 1556 1557 spin_lock_irq(&q->requeue_lock); 1558 list_splice_init(&q->requeue_list, &rq_list); 1559 list_splice_init(&q->flush_list, &flush_list); 1560 spin_unlock_irq(&q->requeue_lock); 1561 1562 while (!list_empty(&rq_list)) { 1563 rq = list_entry(rq_list.next, struct request, queuelist); 1564 list_del_init(&rq->queuelist); 1565 /* 1566 * If RQF_DONTPREP is set, the request has been started by the 1567 * driver already and might have driver-specific data allocated 1568 * already. Insert it into the hctx dispatch list to avoid 1569 * block layer merges for the request. 1570 */ 1571 if (rq->rq_flags & RQF_DONTPREP) 1572 blk_mq_request_bypass_insert(rq, 0); 1573 else 1574 blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); 1575 } 1576 1577 while (!list_empty(&flush_list)) { 1578 rq = list_entry(flush_list.next, struct request, queuelist); 1579 list_del_init(&rq->queuelist); 1580 blk_mq_insert_request(rq, 0); 1581 } 1582 1583 blk_mq_run_hw_queues(q, false); 1584 } 1585 1586 void blk_mq_kick_requeue_list(struct request_queue *q) 1587 { 1588 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); 1589 } 1590 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 1591 1592 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 1593 unsigned long msecs) 1594 { 1595 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 1596 msecs_to_jiffies(msecs)); 1597 } 1598 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 1599 1600 static bool blk_is_flush_data_rq(struct request *rq) 1601 { 1602 return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); 1603 } 1604 1605 static bool blk_mq_rq_inflight(struct request *rq, void *priv) 1606 { 1607 /* 1608 * If we find a request that isn't idle we know the queue is busy 1609 * as it's checked in the iter. 1610 * Return false to stop the iteration. 
1611 * 1612 * In case of queue quiesce, if one flush data request is completed, 1613 * don't count it as inflight given the flush sequence is suspended, 1614 * and the original flush data request is invisible to driver, just 1615 * like other pending requests because of quiesce 1616 */ 1617 if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && 1618 blk_is_flush_data_rq(rq) && 1619 blk_mq_request_completed(rq))) { 1620 bool *busy = priv; 1621 1622 *busy = true; 1623 return false; 1624 } 1625 1626 return true; 1627 } 1628 1629 bool blk_mq_queue_inflight(struct request_queue *q) 1630 { 1631 bool busy = false; 1632 1633 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); 1634 return busy; 1635 } 1636 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); 1637 1638 static void blk_mq_rq_timed_out(struct request *req) 1639 { 1640 req->rq_flags |= RQF_TIMED_OUT; 1641 if (req->q->mq_ops->timeout) { 1642 enum blk_eh_timer_return ret; 1643 1644 ret = req->q->mq_ops->timeout(req); 1645 if (ret == BLK_EH_DONE) 1646 return; 1647 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 1648 } 1649 1650 blk_add_timer(req); 1651 } 1652 1653 struct blk_expired_data { 1654 bool has_timedout_rq; 1655 unsigned long next; 1656 unsigned long timeout_start; 1657 }; 1658 1659 static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) 1660 { 1661 unsigned long deadline; 1662 1663 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 1664 return false; 1665 if (rq->rq_flags & RQF_TIMED_OUT) 1666 return false; 1667 1668 deadline = READ_ONCE(rq->deadline); 1669 if (time_after_eq(expired->timeout_start, deadline)) 1670 return true; 1671 1672 if (expired->next == 0) 1673 expired->next = deadline; 1674 else if (time_after(expired->next, deadline)) 1675 expired->next = deadline; 1676 return false; 1677 } 1678 1679 void blk_mq_put_rq_ref(struct request *rq) 1680 { 1681 if (is_flush_rq(rq)) { 1682 if (rq->end_io(rq, 0) == RQ_END_IO_FREE) 1683 blk_mq_free_request(rq); 1684 } else if (req_ref_put_and_test(rq)) { 1685 __blk_mq_free_request(rq); 1686 } 1687 } 1688 1689 static bool blk_mq_check_expired(struct request *rq, void *priv) 1690 { 1691 struct blk_expired_data *expired = priv; 1692 1693 /* 1694 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot 1695 * be reallocated underneath the timeout handler's processing, then 1696 * the expire check is reliable. If the request is not expired, then 1697 * it was completed and reallocated as a new request after returning 1698 * from blk_mq_check_expired(). 1699 */ 1700 if (blk_mq_req_expired(rq, expired)) { 1701 expired->has_timedout_rq = true; 1702 return false; 1703 } 1704 return true; 1705 } 1706 1707 static bool blk_mq_handle_expired(struct request *rq, void *priv) 1708 { 1709 struct blk_expired_data *expired = priv; 1710 1711 if (blk_mq_req_expired(rq, expired)) 1712 blk_mq_rq_timed_out(rq); 1713 return true; 1714 } 1715 1716 static void blk_mq_timeout_work(struct work_struct *work) 1717 { 1718 struct request_queue *q = 1719 container_of(work, struct request_queue, timeout_work); 1720 struct blk_expired_data expired = { 1721 .timeout_start = jiffies, 1722 }; 1723 struct blk_mq_hw_ctx *hctx; 1724 unsigned long i; 1725 1726 /* A deadlock might occur if a request is stuck requiring a 1727 * timeout at the same time a queue freeze is waiting 1728 * completion, since the timeout code would not be able to 1729 * acquire the queue reference here. 
1730 * 1731 * That's why we don't use blk_queue_enter here; instead, we use 1732 * percpu_ref_tryget directly, because we need to be able to 1733 * obtain a reference even in the short window between the queue 1734 * starting to freeze, by dropping the first reference in 1735 * blk_freeze_queue_start, and the moment the last request is 1736 * consumed, marked by the instant q_usage_counter reaches 1737 * zero. 1738 */ 1739 if (!percpu_ref_tryget(&q->q_usage_counter)) 1740 return; 1741 1742 /* check if there is any timed-out request */ 1743 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); 1744 if (expired.has_timedout_rq) { 1745 /* 1746 * Before walking tags, we must ensure any submit started 1747 * before the current time has finished. Since the submit 1748 * uses srcu or rcu, wait for a synchronization point to 1749 * ensure all running submits have finished 1750 */ 1751 blk_mq_wait_quiesce_done(q->tag_set); 1752 1753 expired.next = 0; 1754 blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); 1755 } 1756 1757 if (expired.next != 0) { 1758 mod_timer(&q->timeout, expired.next); 1759 } else { 1760 /* 1761 * Request timeouts are handled as a forward rolling timer. If 1762 * we end up here it means that no requests are pending and 1763 * also that no request has been pending for a while. Mark 1764 * each hctx as idle. 1765 */ 1766 queue_for_each_hw_ctx(q, hctx, i) { 1767 /* the hctx may be unmapped, so check it here */ 1768 if (blk_mq_hw_queue_mapped(hctx)) 1769 blk_mq_tag_idle(hctx); 1770 } 1771 } 1772 blk_queue_exit(q); 1773 } 1774 1775 struct flush_busy_ctx_data { 1776 struct blk_mq_hw_ctx *hctx; 1777 struct list_head *list; 1778 }; 1779 1780 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 1781 { 1782 struct flush_busy_ctx_data *flush_data = data; 1783 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 1784 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 1785 enum hctx_type type = hctx->type; 1786 1787 spin_lock(&ctx->lock); 1788 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); 1789 sbitmap_clear_bit(sb, bitnr); 1790 spin_unlock(&ctx->lock); 1791 return true; 1792 } 1793 1794 /* 1795 * Process software queues that have been marked busy, splicing them 1796 * to the for-dispatch 1797 */ 1798 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 1799 { 1800 struct flush_busy_ctx_data data = { 1801 .hctx = hctx, 1802 .list = list, 1803 }; 1804 1805 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 1806 } 1807 1808 struct dispatch_rq_data { 1809 struct blk_mq_hw_ctx *hctx; 1810 struct request *rq; 1811 }; 1812 1813 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, 1814 void *data) 1815 { 1816 struct dispatch_rq_data *dispatch_data = data; 1817 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 1818 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 1819 enum hctx_type type = hctx->type; 1820 1821 spin_lock(&ctx->lock); 1822 if (!list_empty(&ctx->rq_lists[type])) { 1823 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); 1824 list_del_init(&dispatch_data->rq->queuelist); 1825 if (list_empty(&ctx->rq_lists[type])) 1826 sbitmap_clear_bit(sb, bitnr); 1827 } 1828 spin_unlock(&ctx->lock); 1829 1830 return !dispatch_data->rq; 1831 } 1832 1833 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1834 struct blk_mq_ctx *start) 1835 { 1836 unsigned off = start ? 
start->index_hw[hctx->type] : 0; 1837 struct dispatch_rq_data data = { 1838 .hctx = hctx, 1839 .rq = NULL, 1840 }; 1841 1842 __sbitmap_for_each_set(&hctx->ctx_map, off, 1843 dispatch_rq_from_ctx, &data); 1844 1845 return data.rq; 1846 } 1847 1848 bool __blk_mq_alloc_driver_tag(struct request *rq) 1849 { 1850 struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; 1851 unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; 1852 int tag; 1853 1854 blk_mq_tag_busy(rq->mq_hctx); 1855 1856 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { 1857 bt = &rq->mq_hctx->tags->breserved_tags; 1858 tag_offset = 0; 1859 } else { 1860 if (!hctx_may_queue(rq->mq_hctx, bt)) 1861 return false; 1862 } 1863 1864 tag = __sbitmap_queue_get(bt); 1865 if (tag == BLK_MQ_NO_TAG) 1866 return false; 1867 1868 rq->tag = tag + tag_offset; 1869 blk_mq_inc_active_requests(rq->mq_hctx); 1870 return true; 1871 } 1872 1873 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1874 int flags, void *key) 1875 { 1876 struct blk_mq_hw_ctx *hctx; 1877 1878 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1879 1880 spin_lock(&hctx->dispatch_wait_lock); 1881 if (!list_empty(&wait->entry)) { 1882 struct sbitmap_queue *sbq; 1883 1884 list_del_init(&wait->entry); 1885 sbq = &hctx->tags->bitmap_tags; 1886 atomic_dec(&sbq->ws_active); 1887 } 1888 spin_unlock(&hctx->dispatch_wait_lock); 1889 1890 blk_mq_run_hw_queue(hctx, true); 1891 return 1; 1892 } 1893 1894 /* 1895 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1896 * the tag wakeups. For non-shared tags, we can simply mark us needing a 1897 * restart. For both cases, take care to check the condition again after 1898 * marking us as waiting. 1899 */ 1900 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1901 struct request *rq) 1902 { 1903 struct sbitmap_queue *sbq; 1904 struct wait_queue_head *wq; 1905 wait_queue_entry_t *wait; 1906 bool ret; 1907 1908 if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && 1909 !(blk_mq_is_shared_tags(hctx->flags))) { 1910 blk_mq_sched_mark_restart_hctx(hctx); 1911 1912 /* 1913 * It's possible that a tag was freed in the window between the 1914 * allocation failure and adding the hardware queue to the wait 1915 * queue. 1916 * 1917 * Don't clear RESTART here, someone else could have set it. 1918 * At most this will cost an extra queue run. 1919 */ 1920 return blk_mq_get_driver_tag(rq); 1921 } 1922 1923 wait = &hctx->dispatch_wait; 1924 if (!list_empty_careful(&wait->entry)) 1925 return false; 1926 1927 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) 1928 sbq = &hctx->tags->breserved_tags; 1929 else 1930 sbq = &hctx->tags->bitmap_tags; 1931 wq = &bt_wait_ptr(sbq, hctx)->wait; 1932 1933 spin_lock_irq(&wq->lock); 1934 spin_lock(&hctx->dispatch_wait_lock); 1935 if (!list_empty(&wait->entry)) { 1936 spin_unlock(&hctx->dispatch_wait_lock); 1937 spin_unlock_irq(&wq->lock); 1938 return false; 1939 } 1940 1941 atomic_inc(&sbq->ws_active); 1942 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 1943 __add_wait_queue(wq, wait); 1944 1945 /* 1946 * Add one explicit barrier since blk_mq_get_driver_tag() may 1947 * not imply barrier in case of failure. 1948 * 1949 * Order adding us to wait queue and allocating driver tag. 
1950 * 1951 * The pair is the one implied in sbitmap_queue_wake_up() which 1952 * orders clearing sbitmap tag bits and waitqueue_active() in 1953 * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless 1954 * 1955 * Otherwise, re-order of adding wait queue and getting driver tag 1956 * may cause __sbitmap_queue_wake_up() to wake up nothing because 1957 * the waitqueue_active() may not observe us in wait queue. 1958 */ 1959 smp_mb(); 1960 1961 /* 1962 * It's possible that a tag was freed in the window between the 1963 * allocation failure and adding the hardware queue to the wait 1964 * queue. 1965 */ 1966 ret = blk_mq_get_driver_tag(rq); 1967 if (!ret) { 1968 spin_unlock(&hctx->dispatch_wait_lock); 1969 spin_unlock_irq(&wq->lock); 1970 return false; 1971 } 1972 1973 /* 1974 * We got a tag, remove ourselves from the wait queue to ensure 1975 * someone else gets the wakeup. 1976 */ 1977 list_del_init(&wait->entry); 1978 atomic_dec(&sbq->ws_active); 1979 spin_unlock(&hctx->dispatch_wait_lock); 1980 spin_unlock_irq(&wq->lock); 1981 1982 return true; 1983 } 1984 1985 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 1986 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 1987 /* 1988 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): 1989 * - EWMA is one simple way to compute running average value 1990 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially 1991 * - take 4 as factor for avoiding to get too small(0) result, and this 1992 * factor doesn't matter because EWMA decreases exponentially 1993 */ 1994 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) 1995 { 1996 unsigned int ewma; 1997 1998 ewma = hctx->dispatch_busy; 1999 2000 if (!ewma && !busy) 2001 return; 2002 2003 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; 2004 if (busy) 2005 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; 2006 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; 2007 2008 hctx->dispatch_busy = ewma; 2009 } 2010 2011 #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 2012 2013 static void blk_mq_handle_dev_resource(struct request *rq, 2014 struct list_head *list) 2015 { 2016 list_add(&rq->queuelist, list); 2017 __blk_mq_requeue_request(rq); 2018 } 2019 2020 enum prep_dispatch { 2021 PREP_DISPATCH_OK, 2022 PREP_DISPATCH_NO_TAG, 2023 PREP_DISPATCH_NO_BUDGET, 2024 }; 2025 2026 static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, 2027 bool need_budget) 2028 { 2029 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2030 int budget_token = -1; 2031 2032 if (need_budget) { 2033 budget_token = blk_mq_get_dispatch_budget(rq->q); 2034 if (budget_token < 0) { 2035 blk_mq_put_driver_tag(rq); 2036 return PREP_DISPATCH_NO_BUDGET; 2037 } 2038 blk_mq_set_rq_budget_token(rq, budget_token); 2039 } 2040 2041 if (!blk_mq_get_driver_tag(rq)) { 2042 /* 2043 * The initial allocation attempt failed, so we need to 2044 * rerun the hardware queue when a tag is freed. The 2045 * waitqueue takes care of that. If the queue is run 2046 * before we add this entry back on the dispatch list, 2047 * we'll re-run it below. 
2048 */ 2049 if (!blk_mq_mark_tag_wait(hctx, rq)) { 2050 /* 2051 * All budgets not got from this function will be put 2052 * together during handling partial dispatch 2053 */ 2054 if (need_budget) 2055 blk_mq_put_dispatch_budget(rq->q, budget_token); 2056 return PREP_DISPATCH_NO_TAG; 2057 } 2058 } 2059 2060 return PREP_DISPATCH_OK; 2061 } 2062 2063 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ 2064 static void blk_mq_release_budgets(struct request_queue *q, 2065 struct list_head *list) 2066 { 2067 struct request *rq; 2068 2069 list_for_each_entry(rq, list, queuelist) { 2070 int budget_token = blk_mq_get_rq_budget_token(rq); 2071 2072 if (budget_token >= 0) 2073 blk_mq_put_dispatch_budget(q, budget_token); 2074 } 2075 } 2076 2077 /* 2078 * blk_mq_commit_rqs will notify driver using bd->last that there is no 2079 * more requests. (See comment in struct blk_mq_ops for commit_rqs for 2080 * details) 2081 * Attention, we should explicitly call this in unusual cases: 2082 * 1) did not queue everything initially scheduled to queue 2083 * 2) the last attempt to queue a request failed 2084 */ 2085 static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, 2086 bool from_schedule) 2087 { 2088 if (hctx->queue->mq_ops->commit_rqs && queued) { 2089 trace_block_unplug(hctx->queue, queued, !from_schedule); 2090 hctx->queue->mq_ops->commit_rqs(hctx); 2091 } 2092 } 2093 2094 /* 2095 * Returns true if we did some work AND can potentially do more. 2096 */ 2097 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, 2098 bool get_budget) 2099 { 2100 enum prep_dispatch prep; 2101 struct request_queue *q = hctx->queue; 2102 struct request *rq; 2103 int queued; 2104 blk_status_t ret = BLK_STS_OK; 2105 bool needs_resource = false; 2106 2107 if (list_empty(list)) 2108 return false; 2109 2110 /* 2111 * Now process all the entries, sending them to the driver. 2112 */ 2113 queued = 0; 2114 do { 2115 struct blk_mq_queue_data bd; 2116 2117 rq = list_first_entry(list, struct request, queuelist); 2118 2119 WARN_ON_ONCE(hctx != rq->mq_hctx); 2120 prep = blk_mq_prep_dispatch_rq(rq, get_budget); 2121 if (prep != PREP_DISPATCH_OK) 2122 break; 2123 2124 list_del_init(&rq->queuelist); 2125 2126 bd.rq = rq; 2127 bd.last = list_empty(list); 2128 2129 ret = q->mq_ops->queue_rq(hctx, &bd); 2130 switch (ret) { 2131 case BLK_STS_OK: 2132 queued++; 2133 break; 2134 case BLK_STS_RESOURCE: 2135 needs_resource = true; 2136 fallthrough; 2137 case BLK_STS_DEV_RESOURCE: 2138 blk_mq_handle_dev_resource(rq, list); 2139 goto out; 2140 default: 2141 blk_mq_end_request(rq, ret); 2142 } 2143 } while (!list_empty(list)); 2144 out: 2145 /* If we didn't flush the entire list, we could have told the driver 2146 * there was more coming, but that turned out to be a lie. 2147 */ 2148 if (!list_empty(list) || ret != BLK_STS_OK) 2149 blk_mq_commit_rqs(hctx, queued, false); 2150 2151 /* 2152 * Any items that need requeuing? Stuff them into hctx->dispatch, 2153 * that is where we will continue on next queue run. 2154 */ 2155 if (!list_empty(list)) { 2156 bool needs_restart; 2157 /* For non-shared tags, the RESTART check will suffice */ 2158 bool no_tag = prep == PREP_DISPATCH_NO_TAG && 2159 ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || 2160 blk_mq_is_shared_tags(hctx->flags)); 2161 2162 /* 2163 * If the caller allocated budgets, free the budgets of the 2164 * requests that have not yet been passed to the block driver. 
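*
* (Editor's note: get_budget says whether this function acquired the
* dispatch budgets itself; when it is false the caller did, so the
* budgets still attached to the leftover requests are dropped here.)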
2165 */ 2166 if (!get_budget) 2167 blk_mq_release_budgets(q, list); 2168 2169 spin_lock(&hctx->lock); 2170 list_splice_tail_init(list, &hctx->dispatch); 2171 spin_unlock(&hctx->lock); 2172 2173 /* 2174 * Order adding requests to hctx->dispatch and checking 2175 * SCHED_RESTART flag. The pair of this smp_mb() is the one 2176 * in blk_mq_sched_restart(). Avoid restart code path to 2177 * miss the new added requests to hctx->dispatch, meantime 2178 * SCHED_RESTART is observed here. 2179 */ 2180 smp_mb(); 2181 2182 /* 2183 * If SCHED_RESTART was set by the caller of this function and 2184 * it is no longer set that means that it was cleared by another 2185 * thread and hence that a queue rerun is needed. 2186 * 2187 * If 'no_tag' is set, that means that we failed getting 2188 * a driver tag with an I/O scheduler attached. If our dispatch 2189 * waitqueue is no longer active, ensure that we run the queue 2190 * AFTER adding our entries back to the list. 2191 * 2192 * If no I/O scheduler has been configured it is possible that 2193 * the hardware queue got stopped and restarted before requests 2194 * were pushed back onto the dispatch list. Rerun the queue to 2195 * avoid starvation. Notes: 2196 * - blk_mq_run_hw_queue() checks whether or not a queue has 2197 * been stopped before rerunning a queue. 2198 * - Some but not all block drivers stop a queue before 2199 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 2200 * and dm-rq. 2201 * 2202 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 2203 * bit is set, run queue after a delay to avoid IO stalls 2204 * that could otherwise occur if the queue is idle. We'll do 2205 * similar if we couldn't get budget or couldn't lock a zone 2206 * and SCHED_RESTART is set. 2207 */ 2208 needs_restart = blk_mq_sched_needs_restart(hctx); 2209 if (prep == PREP_DISPATCH_NO_BUDGET) 2210 needs_resource = true; 2211 if (!needs_restart || 2212 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 2213 blk_mq_run_hw_queue(hctx, true); 2214 else if (needs_resource) 2215 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 2216 2217 blk_mq_update_dispatch_busy(hctx, true); 2218 return false; 2219 } 2220 2221 blk_mq_update_dispatch_busy(hctx, false); 2222 return true; 2223 } 2224 2225 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) 2226 { 2227 int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); 2228 2229 if (cpu >= nr_cpu_ids) 2230 cpu = cpumask_first(hctx->cpumask); 2231 return cpu; 2232 } 2233 2234 /* 2235 * ->next_cpu is always calculated from hctx->cpumask, so simply use 2236 * it for speeding up the check 2237 */ 2238 static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) 2239 { 2240 return hctx->next_cpu >= nr_cpu_ids; 2241 } 2242 2243 /* 2244 * It'd be great if the workqueue API had a way to pass 2245 * in a mask and had some smarts for more clever placement. 2246 * For now we just round-robin here, switching for every 2247 * BLK_MQ_CPU_WORK_BATCH queued items. 
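*
* Worked example (editorial illustration): with hctx->cpumask
* containing CPUs {2, 5, 7} and hctx->next_cpu == 2, the first
* BLK_MQ_CPU_WORK_BATCH runs stay on CPU 2; once the batch count is
* used up the next online CPU in the mask (5) is selected and the
* batch counter is reset, wrapping back to the first mapped online
* CPU when the end of the mask is reached.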
2248 */ 2249 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 2250 { 2251 bool tried = false; 2252 int next_cpu = hctx->next_cpu; 2253 2254 /* Switch to unbound if no allowable CPUs in this hctx */ 2255 if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) 2256 return WORK_CPU_UNBOUND; 2257 2258 if (--hctx->next_cpu_batch <= 0) { 2259 select_cpu: 2260 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, 2261 cpu_online_mask); 2262 if (next_cpu >= nr_cpu_ids) 2263 next_cpu = blk_mq_first_mapped_cpu(hctx); 2264 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2265 } 2266 2267 /* 2268 * Do unbound schedule if we can't find a online CPU for this hctx, 2269 * and it should only happen in the path of handling CPU DEAD. 2270 */ 2271 if (!cpu_online(next_cpu)) { 2272 if (!tried) { 2273 tried = true; 2274 goto select_cpu; 2275 } 2276 2277 /* 2278 * Make sure to re-select CPU next time once after CPUs 2279 * in hctx->cpumask become online again. 2280 */ 2281 hctx->next_cpu = next_cpu; 2282 hctx->next_cpu_batch = 1; 2283 return WORK_CPU_UNBOUND; 2284 } 2285 2286 hctx->next_cpu = next_cpu; 2287 return next_cpu; 2288 } 2289 2290 /** 2291 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. 2292 * @hctx: Pointer to the hardware queue to run. 2293 * @msecs: Milliseconds of delay to wait before running the queue. 2294 * 2295 * Run a hardware queue asynchronously with a delay of @msecs. 2296 */ 2297 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 2298 { 2299 if (unlikely(blk_mq_hctx_stopped(hctx))) 2300 return; 2301 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 2302 msecs_to_jiffies(msecs)); 2303 } 2304 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 2305 2306 static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) 2307 { 2308 bool need_run; 2309 2310 /* 2311 * When queue is quiesced, we may be switching io scheduler, or 2312 * updating nr_hw_queues, or other things, and we can't run queue 2313 * any more, even blk_mq_hctx_has_pending() can't be called safely. 2314 * 2315 * And queue will be rerun in blk_mq_unquiesce_queue() if it is 2316 * quiesced. 2317 */ 2318 __blk_mq_run_dispatch_ops(hctx->queue, false, 2319 need_run = !blk_queue_quiesced(hctx->queue) && 2320 blk_mq_hctx_has_pending(hctx)); 2321 return need_run; 2322 } 2323 2324 /** 2325 * blk_mq_run_hw_queue - Start to run a hardware queue. 2326 * @hctx: Pointer to the hardware queue to run. 2327 * @async: If we want to run the queue asynchronously. 2328 * 2329 * Check if the request queue is not in a quiesced state and if there are 2330 * pending requests to be sent. If this is true, run the queue to send requests 2331 * to hardware. 2332 */ 2333 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 2334 { 2335 bool need_run; 2336 2337 /* 2338 * We can't run the queue inline with interrupts disabled. 2339 */ 2340 WARN_ON_ONCE(!async && in_interrupt()); 2341 2342 might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); 2343 2344 need_run = blk_mq_hw_queue_need_run(hctx); 2345 if (!need_run) { 2346 unsigned long flags; 2347 2348 /* 2349 * Synchronize with blk_mq_unquiesce_queue(), because we check 2350 * if hw queue is quiesced locklessly above, we need the use 2351 * ->queue_lock to make sure we see the up-to-date status to 2352 * not miss rerunning the hw queue. 
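*
* (Editor's note: blk_mq_unquiesce_queue() clears the quiesced flag
* under the same ->queue_lock before rerunning the queues, so once we
* recheck under that lock we either observe the flag already cleared
* and run the queue ourselves, or the later rerun from the unquiesce
* path picks up the pending work.)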
2353 */ 2354 spin_lock_irqsave(&hctx->queue->queue_lock, flags); 2355 need_run = blk_mq_hw_queue_need_run(hctx); 2356 spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); 2357 2358 if (!need_run) 2359 return; 2360 } 2361 2362 if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { 2363 blk_mq_delay_run_hw_queue(hctx, 0); 2364 return; 2365 } 2366 2367 blk_mq_run_dispatch_ops(hctx->queue, 2368 blk_mq_sched_dispatch_requests(hctx)); 2369 } 2370 EXPORT_SYMBOL(blk_mq_run_hw_queue); 2371 2372 /* 2373 * Return prefered queue to dispatch from (if any) for non-mq aware IO 2374 * scheduler. 2375 */ 2376 static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) 2377 { 2378 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 2379 /* 2380 * If the IO scheduler does not respect hardware queues when 2381 * dispatching, we just don't bother with multiple HW queues and 2382 * dispatch from hctx for the current CPU since running multiple queues 2383 * just causes lock contention inside the scheduler and pointless cache 2384 * bouncing. 2385 */ 2386 struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; 2387 2388 if (!blk_mq_hctx_stopped(hctx)) 2389 return hctx; 2390 return NULL; 2391 } 2392 2393 /** 2394 * blk_mq_run_hw_queues - Run all hardware queues in a request queue. 2395 * @q: Pointer to the request queue to run. 2396 * @async: If we want to run the queue asynchronously. 2397 */ 2398 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 2399 { 2400 struct blk_mq_hw_ctx *hctx, *sq_hctx; 2401 unsigned long i; 2402 2403 sq_hctx = NULL; 2404 if (blk_queue_sq_sched(q)) 2405 sq_hctx = blk_mq_get_sq_hctx(q); 2406 queue_for_each_hw_ctx(q, hctx, i) { 2407 if (blk_mq_hctx_stopped(hctx)) 2408 continue; 2409 /* 2410 * Dispatch from this hctx either if there's no hctx preferred 2411 * by IO scheduler or if it has requests that bypass the 2412 * scheduler. 2413 */ 2414 if (!sq_hctx || sq_hctx == hctx || 2415 !list_empty_careful(&hctx->dispatch)) 2416 blk_mq_run_hw_queue(hctx, async); 2417 } 2418 } 2419 EXPORT_SYMBOL(blk_mq_run_hw_queues); 2420 2421 /** 2422 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. 2423 * @q: Pointer to the request queue to run. 2424 * @msecs: Milliseconds of delay to wait before running the queues. 2425 */ 2426 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) 2427 { 2428 struct blk_mq_hw_ctx *hctx, *sq_hctx; 2429 unsigned long i; 2430 2431 sq_hctx = NULL; 2432 if (blk_queue_sq_sched(q)) 2433 sq_hctx = blk_mq_get_sq_hctx(q); 2434 queue_for_each_hw_ctx(q, hctx, i) { 2435 if (blk_mq_hctx_stopped(hctx)) 2436 continue; 2437 /* 2438 * If there is already a run_work pending, leave the 2439 * pending delay untouched. Otherwise, a hctx can stall 2440 * if another hctx is re-delaying the other's work 2441 * before the work executes. 2442 */ 2443 if (delayed_work_pending(&hctx->run_work)) 2444 continue; 2445 /* 2446 * Dispatch from this hctx either if there's no hctx preferred 2447 * by IO scheduler or if it has requests that bypass the 2448 * scheduler. 2449 */ 2450 if (!sq_hctx || sq_hctx == hctx || 2451 !list_empty_careful(&hctx->dispatch)) 2452 blk_mq_delay_run_hw_queue(hctx, msecs); 2453 } 2454 } 2455 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); 2456 2457 /* 2458 * This function is often used for pausing .queue_rq() by driver when 2459 * there isn't enough resource or some conditions aren't satisfied, and 2460 * BLK_STS_RESOURCE is usually returned. 
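*
* Typical driver pattern (editor's sketch, not from the original
* comment): ->queue_rq() notices the device is out of internal
* resources, calls blk_mq_stop_hw_queue(hctx) and returns
* BLK_STS_RESOURCE; once resources become available again the driver
* restarts dispatch with blk_mq_start_stopped_hw_queues().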
2461 * 2462 * We do not guarantee that dispatch can be drained or blocked 2463 * after blk_mq_stop_hw_queue() returns. Please use 2464 * blk_mq_quiesce_queue() for that requirement. 2465 */ 2466 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 2467 { 2468 cancel_delayed_work(&hctx->run_work); 2469 2470 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 2471 } 2472 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 2473 2474 /* 2475 * This function is often used for pausing .queue_rq() by driver when 2476 * there isn't enough resource or some conditions aren't satisfied, and 2477 * BLK_STS_RESOURCE is usually returned. 2478 * 2479 * We do not guarantee that dispatch can be drained or blocked 2480 * after blk_mq_stop_hw_queues() returns. Please use 2481 * blk_mq_quiesce_queue() for that requirement. 2482 */ 2483 void blk_mq_stop_hw_queues(struct request_queue *q) 2484 { 2485 struct blk_mq_hw_ctx *hctx; 2486 unsigned long i; 2487 2488 queue_for_each_hw_ctx(q, hctx, i) 2489 blk_mq_stop_hw_queue(hctx); 2490 } 2491 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 2492 2493 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 2494 { 2495 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2496 2497 blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); 2498 } 2499 EXPORT_SYMBOL(blk_mq_start_hw_queue); 2500 2501 void blk_mq_start_hw_queues(struct request_queue *q) 2502 { 2503 struct blk_mq_hw_ctx *hctx; 2504 unsigned long i; 2505 2506 queue_for_each_hw_ctx(q, hctx, i) 2507 blk_mq_start_hw_queue(hctx); 2508 } 2509 EXPORT_SYMBOL(blk_mq_start_hw_queues); 2510 2511 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 2512 { 2513 if (!blk_mq_hctx_stopped(hctx)) 2514 return; 2515 2516 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2517 /* 2518 * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the 2519 * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch 2520 * list in the subsequent routine. 2521 */ 2522 smp_mb__after_atomic(); 2523 blk_mq_run_hw_queue(hctx, async); 2524 } 2525 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 2526 2527 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 2528 { 2529 struct blk_mq_hw_ctx *hctx; 2530 unsigned long i; 2531 2532 queue_for_each_hw_ctx(q, hctx, i) 2533 blk_mq_start_stopped_hw_queue(hctx, async || 2534 (hctx->flags & BLK_MQ_F_BLOCKING)); 2535 } 2536 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 2537 2538 static void blk_mq_run_work_fn(struct work_struct *work) 2539 { 2540 struct blk_mq_hw_ctx *hctx = 2541 container_of(work, struct blk_mq_hw_ctx, run_work.work); 2542 2543 blk_mq_run_dispatch_ops(hctx->queue, 2544 blk_mq_sched_dispatch_requests(hctx)); 2545 } 2546 2547 /** 2548 * blk_mq_request_bypass_insert - Insert a request at dispatch list. 2549 * @rq: Pointer to request to be inserted. 2550 * @flags: BLK_MQ_INSERT_* 2551 * 2552 * Should only be used carefully, when the caller knows we want to 2553 * bypass a potential IO scheduler on the target device. 
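*
* (Editor's note: in this file that is the case for passthrough and
* flush requests in blk_mq_insert_request(), and for requests that a
* busy ->queue_rq() forces back onto hctx->dispatch in the direct
* issue paths.)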
2554 */ 2555 static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) 2556 { 2557 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2558 2559 spin_lock(&hctx->lock); 2560 if (flags & BLK_MQ_INSERT_AT_HEAD) 2561 list_add(&rq->queuelist, &hctx->dispatch); 2562 else 2563 list_add_tail(&rq->queuelist, &hctx->dispatch); 2564 spin_unlock(&hctx->lock); 2565 } 2566 2567 static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, 2568 struct blk_mq_ctx *ctx, struct list_head *list, 2569 bool run_queue_async) 2570 { 2571 struct request *rq; 2572 enum hctx_type type = hctx->type; 2573 2574 /* 2575 * Try to issue requests directly if the hw queue isn't busy to save an 2576 * extra enqueue & dequeue to the sw queue. 2577 */ 2578 if (!hctx->dispatch_busy && !run_queue_async) { 2579 blk_mq_run_dispatch_ops(hctx->queue, 2580 blk_mq_try_issue_list_directly(hctx, list)); 2581 if (list_empty(list)) 2582 goto out; 2583 } 2584 2585 /* 2586 * preemption doesn't flush plug list, so it's possible ctx->cpu is 2587 * offline now 2588 */ 2589 list_for_each_entry(rq, list, queuelist) { 2590 BUG_ON(rq->mq_ctx != ctx); 2591 trace_block_rq_insert(rq); 2592 if (rq->cmd_flags & REQ_NOWAIT) 2593 run_queue_async = true; 2594 } 2595 2596 spin_lock(&ctx->lock); 2597 list_splice_tail_init(list, &ctx->rq_lists[type]); 2598 blk_mq_hctx_mark_pending(hctx, ctx); 2599 spin_unlock(&ctx->lock); 2600 out: 2601 blk_mq_run_hw_queue(hctx, run_queue_async); 2602 } 2603 2604 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) 2605 { 2606 struct request_queue *q = rq->q; 2607 struct blk_mq_ctx *ctx = rq->mq_ctx; 2608 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2609 2610 if (blk_rq_is_passthrough(rq)) { 2611 /* 2612 * Passthrough request have to be added to hctx->dispatch 2613 * directly. The device may be in a situation where it can't 2614 * handle FS request, and always returns BLK_STS_RESOURCE for 2615 * them, which gets them added to hctx->dispatch. 2616 * 2617 * If a passthrough request is required to unblock the queues, 2618 * and it is added to the scheduler queue, there is no chance to 2619 * dispatch it given we prioritize requests in hctx->dispatch. 2620 */ 2621 blk_mq_request_bypass_insert(rq, flags); 2622 } else if (req_op(rq) == REQ_OP_FLUSH) { 2623 /* 2624 * Firstly normal IO request is inserted to scheduler queue or 2625 * sw queue, meantime we add flush request to dispatch queue( 2626 * hctx->dispatch) directly and there is at most one in-flight 2627 * flush request for each hw queue, so it doesn't matter to add 2628 * flush request to tail or front of the dispatch queue. 2629 * 2630 * Secondly in case of NCQ, flush request belongs to non-NCQ 2631 * command, and queueing it will fail when there is any 2632 * in-flight normal IO request(NCQ command). When adding flush 2633 * rq to the front of hctx->dispatch, it is easier to introduce 2634 * extra time to flush rq's latency because of S_SCHED_RESTART 2635 * compared with adding to the tail of dispatch queue, then 2636 * chance of flush merge is increased, and less flush requests 2637 * will be issued to controller. It is observed that ~10% time 2638 * is saved in blktests block/004 on disk attached to AHCI/NCQ 2639 * drive when adding flush rq to the front of hctx->dispatch. 2640 * 2641 * Simply queue flush rq to the front of hctx->dispatch so that 2642 * intensive flush workloads can benefit in case of NCQ HW. 
2643 */ 2644 blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); 2645 } else if (q->elevator) { 2646 LIST_HEAD(list); 2647 2648 WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); 2649 2650 list_add(&rq->queuelist, &list); 2651 q->elevator->type->ops.insert_requests(hctx, &list, flags); 2652 } else { 2653 trace_block_rq_insert(rq); 2654 2655 spin_lock(&ctx->lock); 2656 if (flags & BLK_MQ_INSERT_AT_HEAD) 2657 list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); 2658 else 2659 list_add_tail(&rq->queuelist, 2660 &ctx->rq_lists[hctx->type]); 2661 blk_mq_hctx_mark_pending(hctx, ctx); 2662 spin_unlock(&ctx->lock); 2663 } 2664 } 2665 2666 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, 2667 unsigned int nr_segs) 2668 { 2669 int err; 2670 2671 if (bio->bi_opf & REQ_RAHEAD) 2672 rq->cmd_flags |= REQ_FAILFAST_MASK; 2673 2674 rq->bio = rq->biotail = bio; 2675 rq->__sector = bio->bi_iter.bi_sector; 2676 rq->__data_len = bio->bi_iter.bi_size; 2677 rq->nr_phys_segments = nr_segs; 2678 if (bio_integrity(bio)) 2679 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, 2680 bio); 2681 2682 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ 2683 err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); 2684 WARN_ON_ONCE(err); 2685 2686 blk_account_io_start(rq); 2687 } 2688 2689 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 2690 struct request *rq, bool last) 2691 { 2692 struct request_queue *q = rq->q; 2693 struct blk_mq_queue_data bd = { 2694 .rq = rq, 2695 .last = last, 2696 }; 2697 blk_status_t ret; 2698 2699 /* 2700 * For OK queue, we are done. For error, caller may kill it. 2701 * Any other error (busy), just add it to our list as we 2702 * previously would have done. 2703 */ 2704 ret = q->mq_ops->queue_rq(hctx, &bd); 2705 switch (ret) { 2706 case BLK_STS_OK: 2707 blk_mq_update_dispatch_busy(hctx, false); 2708 break; 2709 case BLK_STS_RESOURCE: 2710 case BLK_STS_DEV_RESOURCE: 2711 blk_mq_update_dispatch_busy(hctx, true); 2712 __blk_mq_requeue_request(rq); 2713 break; 2714 default: 2715 blk_mq_update_dispatch_busy(hctx, false); 2716 break; 2717 } 2718 2719 return ret; 2720 } 2721 2722 static bool blk_mq_get_budget_and_tag(struct request *rq) 2723 { 2724 int budget_token; 2725 2726 budget_token = blk_mq_get_dispatch_budget(rq->q); 2727 if (budget_token < 0) 2728 return false; 2729 blk_mq_set_rq_budget_token(rq, budget_token); 2730 if (!blk_mq_get_driver_tag(rq)) { 2731 blk_mq_put_dispatch_budget(rq->q, budget_token); 2732 return false; 2733 } 2734 return true; 2735 } 2736 2737 /** 2738 * blk_mq_try_issue_directly - Try to send a request directly to device driver. 2739 * @hctx: Pointer of the associated hardware queue. 2740 * @rq: Pointer to request to be sent. 2741 * 2742 * If the device has enough resources to accept a new request now, send the 2743 * request directly to device driver. Else, insert at hctx->dispatch queue, so 2744 * we can try send it another time in the future. Requests inserted at this 2745 * queue have higher priority. 
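*
* (Editor's note: requests flagged RQF_USE_SCHED are never issued
* from here; they are handed back to the regular insert path so the
* I/O scheduler keeps control of them.)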
2746 */ 2747 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 2748 struct request *rq) 2749 { 2750 blk_status_t ret; 2751 2752 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2753 blk_mq_insert_request(rq, 0); 2754 blk_mq_run_hw_queue(hctx, false); 2755 return; 2756 } 2757 2758 if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { 2759 blk_mq_insert_request(rq, 0); 2760 blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); 2761 return; 2762 } 2763 2764 ret = __blk_mq_issue_directly(hctx, rq, true); 2765 switch (ret) { 2766 case BLK_STS_OK: 2767 break; 2768 case BLK_STS_RESOURCE: 2769 case BLK_STS_DEV_RESOURCE: 2770 blk_mq_request_bypass_insert(rq, 0); 2771 blk_mq_run_hw_queue(hctx, false); 2772 break; 2773 default: 2774 blk_mq_end_request(rq, ret); 2775 break; 2776 } 2777 } 2778 2779 static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) 2780 { 2781 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2782 2783 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { 2784 blk_mq_insert_request(rq, 0); 2785 blk_mq_run_hw_queue(hctx, false); 2786 return BLK_STS_OK; 2787 } 2788 2789 if (!blk_mq_get_budget_and_tag(rq)) 2790 return BLK_STS_RESOURCE; 2791 return __blk_mq_issue_directly(hctx, rq, last); 2792 } 2793 2794 static void blk_mq_issue_direct(struct rq_list *rqs) 2795 { 2796 struct blk_mq_hw_ctx *hctx = NULL; 2797 struct request *rq; 2798 int queued = 0; 2799 blk_status_t ret = BLK_STS_OK; 2800 2801 while ((rq = rq_list_pop(rqs))) { 2802 bool last = rq_list_empty(rqs); 2803 2804 if (hctx != rq->mq_hctx) { 2805 if (hctx) { 2806 blk_mq_commit_rqs(hctx, queued, false); 2807 queued = 0; 2808 } 2809 hctx = rq->mq_hctx; 2810 } 2811 2812 ret = blk_mq_request_issue_directly(rq, last); 2813 switch (ret) { 2814 case BLK_STS_OK: 2815 queued++; 2816 break; 2817 case BLK_STS_RESOURCE: 2818 case BLK_STS_DEV_RESOURCE: 2819 blk_mq_request_bypass_insert(rq, 0); 2820 blk_mq_run_hw_queue(hctx, false); 2821 goto out; 2822 default: 2823 blk_mq_end_request(rq, ret); 2824 break; 2825 } 2826 } 2827 2828 out: 2829 if (ret != BLK_STS_OK) 2830 blk_mq_commit_rqs(hctx, queued, false); 2831 } 2832 2833 static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) 2834 { 2835 if (blk_queue_quiesced(q)) 2836 return; 2837 q->mq_ops->queue_rqs(rqs); 2838 } 2839 2840 static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, 2841 struct rq_list *queue_rqs) 2842 { 2843 struct request *rq = rq_list_pop(rqs); 2844 struct request_queue *this_q = rq->q; 2845 struct request **prev = &rqs->head; 2846 struct rq_list matched_rqs = {}; 2847 struct request *last = NULL; 2848 unsigned depth = 1; 2849 2850 rq_list_add_tail(&matched_rqs, rq); 2851 while ((rq = *prev)) { 2852 if (rq->q == this_q) { 2853 /* move rq from rqs to matched_rqs */ 2854 *prev = rq->rq_next; 2855 rq_list_add_tail(&matched_rqs, rq); 2856 depth++; 2857 } else { 2858 /* leave rq in rqs */ 2859 prev = &rq->rq_next; 2860 last = rq; 2861 } 2862 } 2863 2864 rqs->tail = last; 2865 *queue_rqs = matched_rqs; 2866 return depth; 2867 } 2868 2869 static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) 2870 { 2871 struct request_queue *q = rq_list_peek(rqs)->q; 2872 2873 trace_block_unplug(q, depth, true); 2874 2875 /* 2876 * Peek first request and see if we have a ->queue_rqs() hook. 2877 * If we do, we can dispatch the whole list in one go. 
2878 * We already know at this point that all requests belong to the 2879 * same queue, caller must ensure that's the case. 2880 */ 2881 if (q->mq_ops->queue_rqs) { 2882 blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); 2883 if (rq_list_empty(rqs)) 2884 return; 2885 } 2886 2887 blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); 2888 } 2889 2890 static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) 2891 { 2892 struct blk_mq_hw_ctx *this_hctx = NULL; 2893 struct blk_mq_ctx *this_ctx = NULL; 2894 struct rq_list requeue_list = {}; 2895 unsigned int depth = 0; 2896 bool is_passthrough = false; 2897 LIST_HEAD(list); 2898 2899 do { 2900 struct request *rq = rq_list_pop(rqs); 2901 2902 if (!this_hctx) { 2903 this_hctx = rq->mq_hctx; 2904 this_ctx = rq->mq_ctx; 2905 is_passthrough = blk_rq_is_passthrough(rq); 2906 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || 2907 is_passthrough != blk_rq_is_passthrough(rq)) { 2908 rq_list_add_tail(&requeue_list, rq); 2909 continue; 2910 } 2911 list_add_tail(&rq->queuelist, &list); 2912 depth++; 2913 } while (!rq_list_empty(rqs)); 2914 2915 *rqs = requeue_list; 2916 trace_block_unplug(this_hctx->queue, depth, !from_sched); 2917 2918 percpu_ref_get(&this_hctx->queue->q_usage_counter); 2919 /* passthrough requests should never be issued to the I/O scheduler */ 2920 if (is_passthrough) { 2921 spin_lock(&this_hctx->lock); 2922 list_splice_tail_init(&list, &this_hctx->dispatch); 2923 spin_unlock(&this_hctx->lock); 2924 blk_mq_run_hw_queue(this_hctx, from_sched); 2925 } else if (this_hctx->queue->elevator) { 2926 this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, 2927 &list, 0); 2928 blk_mq_run_hw_queue(this_hctx, from_sched); 2929 } else { 2930 blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); 2931 } 2932 percpu_ref_put(&this_hctx->queue->q_usage_counter); 2933 } 2934 2935 static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) 2936 { 2937 do { 2938 struct rq_list queue_rqs; 2939 unsigned depth; 2940 2941 depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); 2942 blk_mq_dispatch_queue_requests(&queue_rqs, depth); 2943 while (!rq_list_empty(&queue_rqs)) 2944 blk_mq_dispatch_list(&queue_rqs, false); 2945 } while (!rq_list_empty(rqs)); 2946 } 2947 2948 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2949 { 2950 unsigned int depth; 2951 2952 /* 2953 * We may have been called recursively midway through handling 2954 * plug->mq_list via a schedule() in the driver's queue_rq() callback. 2955 * To avoid mq_list changing under our feet, clear rq_count early and 2956 * bail out specifically if rq_count is 0 rather than checking 2957 * whether the mq_list is empty. 
2958 */ 2959 if (plug->rq_count == 0) 2960 return; 2961 depth = plug->rq_count; 2962 plug->rq_count = 0; 2963 2964 if (!plug->has_elevator && !from_schedule) { 2965 if (plug->multiple_queues) { 2966 blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); 2967 return; 2968 } 2969 2970 blk_mq_dispatch_queue_requests(&plug->mq_list, depth); 2971 if (rq_list_empty(&plug->mq_list)) 2972 return; 2973 } 2974 2975 do { 2976 blk_mq_dispatch_list(&plug->mq_list, from_schedule); 2977 } while (!rq_list_empty(&plug->mq_list)); 2978 } 2979 2980 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 2981 struct list_head *list) 2982 { 2983 int queued = 0; 2984 blk_status_t ret = BLK_STS_OK; 2985 2986 while (!list_empty(list)) { 2987 struct request *rq = list_first_entry(list, struct request, 2988 queuelist); 2989 2990 list_del_init(&rq->queuelist); 2991 ret = blk_mq_request_issue_directly(rq, list_empty(list)); 2992 switch (ret) { 2993 case BLK_STS_OK: 2994 queued++; 2995 break; 2996 case BLK_STS_RESOURCE: 2997 case BLK_STS_DEV_RESOURCE: 2998 blk_mq_request_bypass_insert(rq, 0); 2999 if (list_empty(list)) 3000 blk_mq_run_hw_queue(hctx, false); 3001 goto out; 3002 default: 3003 blk_mq_end_request(rq, ret); 3004 break; 3005 } 3006 } 3007 3008 out: 3009 if (ret != BLK_STS_OK) 3010 blk_mq_commit_rqs(hctx, queued, false); 3011 } 3012 3013 static bool blk_mq_attempt_bio_merge(struct request_queue *q, 3014 struct bio *bio, unsigned int nr_segs) 3015 { 3016 if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { 3017 if (blk_attempt_plug_merge(q, bio, nr_segs)) 3018 return true; 3019 if (blk_mq_sched_bio_merge(q, bio, nr_segs)) 3020 return true; 3021 } 3022 return false; 3023 } 3024 3025 static struct request *blk_mq_get_new_requests(struct request_queue *q, 3026 struct blk_plug *plug, 3027 struct bio *bio) 3028 { 3029 struct blk_mq_alloc_data data = { 3030 .q = q, 3031 .flags = 0, 3032 .shallow_depth = 0, 3033 .cmd_flags = bio->bi_opf, 3034 .rq_flags = 0, 3035 .nr_tags = 1, 3036 .cached_rqs = NULL, 3037 .ctx = NULL, 3038 .hctx = NULL 3039 }; 3040 struct request *rq; 3041 3042 rq_qos_throttle(q, bio); 3043 3044 if (plug) { 3045 data.nr_tags = plug->nr_ios; 3046 plug->nr_ios = 1; 3047 data.cached_rqs = &plug->cached_rqs; 3048 } 3049 3050 rq = __blk_mq_alloc_requests(&data); 3051 if (unlikely(!rq)) 3052 rq_qos_cleanup(q, bio); 3053 return rq; 3054 } 3055 3056 /* 3057 * Check if there is a suitable cached request and return it. 3058 */ 3059 static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, 3060 struct request_queue *q, blk_opf_t opf) 3061 { 3062 enum hctx_type type = blk_mq_get_hctx_type(opf); 3063 struct request *rq; 3064 3065 if (!plug) 3066 return NULL; 3067 rq = rq_list_peek(&plug->cached_rqs); 3068 if (!rq || rq->q != q) 3069 return NULL; 3070 if (type != rq->mq_hctx->type && 3071 (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) 3072 return NULL; 3073 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) 3074 return NULL; 3075 return rq; 3076 } 3077 3078 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, 3079 struct bio *bio) 3080 { 3081 if (rq_list_pop(&plug->cached_rqs) != rq) 3082 WARN_ON_ONCE(1); 3083 3084 /* 3085 * If any qos ->throttle() end up blocking, we will have flushed the 3086 * plug and hence killed the cached_rq list as well. Pop this entry 3087 * before we throttle. 
3088 */ 3089 rq_qos_throttle(rq->q, bio); 3090 3091 blk_mq_rq_time_init(rq, blk_time_get_ns()); 3092 rq->cmd_flags = bio->bi_opf; 3093 INIT_LIST_HEAD(&rq->queuelist); 3094 } 3095 3096 static bool bio_unaligned(const struct bio *bio, struct request_queue *q) 3097 { 3098 unsigned int bs_mask = queue_logical_block_size(q) - 1; 3099 3100 /* .bi_sector of any zero sized bio need to be initialized */ 3101 if ((bio->bi_iter.bi_size & bs_mask) || 3102 ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) 3103 return true; 3104 return false; 3105 } 3106 3107 /** 3108 * blk_mq_submit_bio - Create and send a request to block device. 3109 * @bio: Bio pointer. 3110 * 3111 * Builds up a request structure from @q and @bio and send to the device. The 3112 * request may not be queued directly to hardware if: 3113 * * This request can be merged with another one 3114 * * We want to place request at plug queue for possible future merging 3115 * * There is an IO scheduler active at this queue 3116 * 3117 * It will not queue the request if there is an error with the bio, or at the 3118 * request creation. 3119 */ 3120 void blk_mq_submit_bio(struct bio *bio) 3121 { 3122 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 3123 struct blk_plug *plug = current->plug; 3124 const int is_sync = op_is_sync(bio->bi_opf); 3125 struct blk_mq_hw_ctx *hctx; 3126 unsigned int nr_segs; 3127 struct request *rq; 3128 blk_status_t ret; 3129 3130 /* 3131 * If the plug has a cached request for this queue, try to use it. 3132 */ 3133 rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); 3134 3135 /* 3136 * A BIO that was released from a zone write plug has already been 3137 * through the preparation in this function, already holds a reference 3138 * on the queue usage counter, and is the only write BIO in-flight for 3139 * the target zone. Go straight to preparing a request for it. 3140 */ 3141 if (bio_zone_write_plugging(bio)) { 3142 nr_segs = bio->__bi_nr_segments; 3143 if (rq) 3144 blk_queue_exit(q); 3145 goto new_request; 3146 } 3147 3148 /* 3149 * The cached request already holds a q_usage_counter reference and we 3150 * don't have to acquire a new one if we use it. 3151 */ 3152 if (!rq) { 3153 if (unlikely(bio_queue_enter(bio))) 3154 return; 3155 } 3156 3157 /* 3158 * Device reconfiguration may change logical block size or reduce the 3159 * number of poll queues, so the checks for alignment and poll support 3160 * have to be done with queue usage counter held. 
3161 */ 3162 if (unlikely(bio_unaligned(bio, q))) { 3163 bio_io_error(bio); 3164 goto queue_exit; 3165 } 3166 3167 if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { 3168 bio->bi_status = BLK_STS_NOTSUPP; 3169 bio_endio(bio); 3170 goto queue_exit; 3171 } 3172 3173 bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 3174 if (!bio) 3175 goto queue_exit; 3176 3177 if (!bio_integrity_prep(bio)) 3178 goto queue_exit; 3179 3180 blk_mq_bio_issue_init(q, bio); 3181 if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) 3182 goto queue_exit; 3183 3184 if (bio_needs_zone_write_plugging(bio)) { 3185 if (blk_zone_plug_bio(bio, nr_segs)) 3186 goto queue_exit; 3187 } 3188 3189 new_request: 3190 if (rq) { 3191 blk_mq_use_cached_rq(rq, plug, bio); 3192 } else { 3193 rq = blk_mq_get_new_requests(q, plug, bio); 3194 if (unlikely(!rq)) { 3195 if (bio->bi_opf & REQ_NOWAIT) 3196 bio_wouldblock_error(bio); 3197 goto queue_exit; 3198 } 3199 } 3200 3201 trace_block_getrq(bio); 3202 3203 rq_qos_track(q, rq, bio); 3204 3205 blk_mq_bio_to_request(rq, bio, nr_segs); 3206 3207 ret = blk_crypto_rq_get_keyslot(rq); 3208 if (ret != BLK_STS_OK) { 3209 bio->bi_status = ret; 3210 bio_endio(bio); 3211 blk_mq_free_request(rq); 3212 return; 3213 } 3214 3215 if (bio_zone_write_plugging(bio)) 3216 blk_zone_write_plug_init_request(rq); 3217 3218 if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) 3219 return; 3220 3221 if (plug) { 3222 blk_add_rq_to_plug(plug, rq); 3223 return; 3224 } 3225 3226 hctx = rq->mq_hctx; 3227 if ((rq->rq_flags & RQF_USE_SCHED) || 3228 (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { 3229 blk_mq_insert_request(rq, 0); 3230 blk_mq_run_hw_queue(hctx, true); 3231 } else { 3232 blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); 3233 } 3234 return; 3235 3236 queue_exit: 3237 /* 3238 * Don't drop the queue reference if we were trying to use a cached 3239 * request and thus didn't acquire one. 3240 */ 3241 if (!rq) 3242 blk_queue_exit(q); 3243 } 3244 3245 #ifdef CONFIG_BLK_MQ_STACKING 3246 /** 3247 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 3248 * @rq: the request being queued 3249 */ 3250 blk_status_t blk_insert_cloned_request(struct request *rq) 3251 { 3252 struct request_queue *q = rq->q; 3253 unsigned int max_sectors = blk_queue_get_max_sectors(rq); 3254 unsigned int max_segments = blk_rq_get_max_segments(rq); 3255 blk_status_t ret; 3256 3257 if (blk_rq_sectors(rq) > max_sectors) { 3258 /* 3259 * SCSI device does not have a good way to return if 3260 * Write Same/Zero is actually supported. If a device rejects 3261 * a non-read/write command (discard, write same,etc.) the 3262 * low-level device driver will set the relevant queue limit to 3263 * 0 to prevent blk-lib from issuing more of the offending 3264 * operations. Commands queued prior to the queue limit being 3265 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O 3266 * errors being propagated to upper layers. 3267 */ 3268 if (max_sectors == 0) 3269 return BLK_STS_NOTSUPP; 3270 3271 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", 3272 __func__, blk_rq_sectors(rq), max_sectors); 3273 return BLK_STS_IOERR; 3274 } 3275 3276 /* 3277 * The queue settings related to segment counting may differ from the 3278 * original queue. 3279 */ 3280 rq->nr_phys_segments = blk_recalc_rq_segments(rq); 3281 if (rq->nr_phys_segments > max_segments) { 3282 printk(KERN_ERR "%s: over max segments limit. 
(%u > %u)\n", 3283 __func__, rq->nr_phys_segments, max_segments); 3284 return BLK_STS_IOERR; 3285 } 3286 3287 if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) 3288 return BLK_STS_IOERR; 3289 3290 ret = blk_crypto_rq_get_keyslot(rq); 3291 if (ret != BLK_STS_OK) 3292 return ret; 3293 3294 blk_account_io_start(rq); 3295 3296 /* 3297 * Since we have a scheduler attached on the top device, 3298 * bypass a potential scheduler on the bottom device for 3299 * insert. 3300 */ 3301 blk_mq_run_dispatch_ops(q, 3302 ret = blk_mq_request_issue_directly(rq, true)); 3303 if (ret) 3304 blk_account_io_done(rq, blk_time_get_ns()); 3305 return ret; 3306 } 3307 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 3308 3309 /** 3310 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request 3311 * @rq: the clone request to be cleaned up 3312 * 3313 * Description: 3314 * Free all bios in @rq for a cloned request. 3315 */ 3316 void blk_rq_unprep_clone(struct request *rq) 3317 { 3318 struct bio *bio; 3319 3320 while ((bio = rq->bio) != NULL) { 3321 rq->bio = bio->bi_next; 3322 3323 bio_put(bio); 3324 } 3325 } 3326 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); 3327 3328 /** 3329 * blk_rq_prep_clone - Helper function to setup clone request 3330 * @rq: the request to be setup 3331 * @rq_src: original request to be cloned 3332 * @bs: bio_set that bios for clone are allocated from 3333 * @gfp_mask: memory allocation mask for bio 3334 * @bio_ctr: setup function to be called for each clone bio. 3335 * Returns %0 for success, non %0 for failure. 3336 * @data: private data to be passed to @bio_ctr 3337 * 3338 * Description: 3339 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 3340 * Also, pages which the original bios are pointing to are not copied 3341 * and the cloned bios just point same pages. 3342 * So cloned bios must be completed before original bios, which means 3343 * the caller must complete @rq before @rq_src. 3344 */ 3345 int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 3346 struct bio_set *bs, gfp_t gfp_mask, 3347 int (*bio_ctr)(struct bio *, struct bio *, void *), 3348 void *data) 3349 { 3350 struct bio *bio_src; 3351 3352 if (!bs) 3353 bs = &fs_bio_set; 3354 3355 __rq_for_each_bio(bio_src, rq_src) { 3356 struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, 3357 gfp_mask, bs); 3358 if (!bio) 3359 goto free_and_out; 3360 3361 if (bio_ctr && bio_ctr(bio, bio_src, data)) { 3362 bio_put(bio); 3363 goto free_and_out; 3364 } 3365 3366 if (rq->bio) { 3367 rq->biotail->bi_next = bio; 3368 rq->biotail = bio; 3369 } else { 3370 rq->bio = rq->biotail = bio; 3371 } 3372 } 3373 3374 /* Copy attributes of the original request to the clone request. */ 3375 rq->__sector = blk_rq_pos(rq_src); 3376 rq->__data_len = blk_rq_bytes(rq_src); 3377 if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { 3378 rq->rq_flags |= RQF_SPECIAL_PAYLOAD; 3379 rq->special_vec = rq_src->special_vec; 3380 } 3381 rq->nr_phys_segments = rq_src->nr_phys_segments; 3382 rq->nr_integrity_segments = rq_src->nr_integrity_segments; 3383 3384 if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) 3385 goto free_and_out; 3386 3387 return 0; 3388 3389 free_and_out: 3390 blk_rq_unprep_clone(rq); 3391 3392 return -ENOMEM; 3393 } 3394 EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 3395 #endif /* CONFIG_BLK_MQ_STACKING */ 3396 3397 /* 3398 * Steal bios from a request and add them to a bio list. 3399 * The request must not have been partially completed before. 
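*
* (Editor's example: NVMe multipath failover uses this to move the
* bios of a failed request onto a requeue list so they can be
* resubmitted on another path.)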
3400 */ 3401 void blk_steal_bios(struct bio_list *list, struct request *rq) 3402 { 3403 if (rq->bio) { 3404 if (list->tail) 3405 list->tail->bi_next = rq->bio; 3406 else 3407 list->head = rq->bio; 3408 list->tail = rq->biotail; 3409 3410 rq->bio = NULL; 3411 rq->biotail = NULL; 3412 } 3413 3414 rq->__data_len = 0; 3415 } 3416 EXPORT_SYMBOL_GPL(blk_steal_bios); 3417 3418 static size_t order_to_size(unsigned int order) 3419 { 3420 return (size_t)PAGE_SIZE << order; 3421 } 3422 3423 /* called before freeing request pool in @tags */ 3424 static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, 3425 struct blk_mq_tags *tags) 3426 { 3427 struct page *page; 3428 3429 /* 3430 * There is no need to clear mapping if driver tags is not initialized 3431 * or the mapping belongs to the driver tags. 3432 */ 3433 if (!drv_tags || drv_tags == tags) 3434 return; 3435 3436 list_for_each_entry(page, &tags->page_list, lru) { 3437 unsigned long start = (unsigned long)page_address(page); 3438 unsigned long end = start + order_to_size(page->private); 3439 int i; 3440 3441 for (i = 0; i < drv_tags->nr_tags; i++) { 3442 struct request *rq = drv_tags->rqs[i]; 3443 unsigned long rq_addr = (unsigned long)rq; 3444 3445 if (rq_addr >= start && rq_addr < end) { 3446 WARN_ON_ONCE(req_ref_read(rq) != 0); 3447 cmpxchg(&drv_tags->rqs[i], rq, NULL); 3448 } 3449 } 3450 } 3451 } 3452 3453 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 3454 unsigned int hctx_idx) 3455 { 3456 struct blk_mq_tags *drv_tags; 3457 3458 if (list_empty(&tags->page_list)) 3459 return; 3460 3461 if (blk_mq_is_shared_tags(set->flags)) 3462 drv_tags = set->shared_tags; 3463 else 3464 drv_tags = set->tags[hctx_idx]; 3465 3466 if (tags->static_rqs && set->ops->exit_request) { 3467 int i; 3468 3469 for (i = 0; i < tags->nr_tags; i++) { 3470 struct request *rq = tags->static_rqs[i]; 3471 3472 if (!rq) 3473 continue; 3474 set->ops->exit_request(set, rq, hctx_idx); 3475 tags->static_rqs[i] = NULL; 3476 } 3477 } 3478 3479 blk_mq_clear_rq_mapping(drv_tags, tags); 3480 /* 3481 * Free request pages in SRCU callback, which is called from 3482 * blk_mq_free_tags(). 
3483 */ 3484 } 3485 3486 void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) 3487 { 3488 kfree(tags->rqs); 3489 tags->rqs = NULL; 3490 kfree(tags->static_rqs); 3491 tags->static_rqs = NULL; 3492 3493 blk_mq_free_tags(set, tags); 3494 } 3495 3496 static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, 3497 unsigned int hctx_idx) 3498 { 3499 int i; 3500 3501 for (i = 0; i < set->nr_maps; i++) { 3502 unsigned int start = set->map[i].queue_offset; 3503 unsigned int end = start + set->map[i].nr_queues; 3504 3505 if (hctx_idx >= start && hctx_idx < end) 3506 break; 3507 } 3508 3509 if (i >= set->nr_maps) 3510 i = HCTX_TYPE_DEFAULT; 3511 3512 return i; 3513 } 3514 3515 static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, 3516 unsigned int hctx_idx) 3517 { 3518 enum hctx_type type = hctx_idx_to_type(set, hctx_idx); 3519 3520 return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); 3521 } 3522 3523 static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 3524 unsigned int hctx_idx, 3525 unsigned int nr_tags, 3526 unsigned int reserved_tags) 3527 { 3528 int node = blk_mq_get_hctx_node(set, hctx_idx); 3529 struct blk_mq_tags *tags; 3530 3531 if (node == NUMA_NO_NODE) 3532 node = set->numa_node; 3533 3534 tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); 3535 if (!tags) 3536 return NULL; 3537 3538 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 3539 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 3540 node); 3541 if (!tags->rqs) 3542 goto err_free_tags; 3543 3544 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 3545 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 3546 node); 3547 if (!tags->static_rqs) 3548 goto err_free_rqs; 3549 3550 return tags; 3551 3552 err_free_rqs: 3553 kfree(tags->rqs); 3554 err_free_tags: 3555 blk_mq_free_tags(set, tags); 3556 return NULL; 3557 } 3558 3559 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 3560 unsigned int hctx_idx, int node) 3561 { 3562 int ret; 3563 3564 if (set->ops->init_request) { 3565 ret = set->ops->init_request(set, rq, hctx_idx, node); 3566 if (ret) 3567 return ret; 3568 } 3569 3570 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 3571 return 0; 3572 } 3573 3574 static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, 3575 struct blk_mq_tags *tags, 3576 unsigned int hctx_idx, unsigned int depth) 3577 { 3578 unsigned int i, j, entries_per_page, max_order = 4; 3579 int node = blk_mq_get_hctx_node(set, hctx_idx); 3580 size_t rq_size, left; 3581 3582 if (node == NUMA_NO_NODE) 3583 node = set->numa_node; 3584 3585 /* 3586 * rq_size is the size of the request plus driver payload, rounded 3587 * to the cacheline size 3588 */ 3589 rq_size = round_up(sizeof(struct request) + set->cmd_size, 3590 cache_line_size()); 3591 left = rq_size * depth; 3592 3593 for (i = 0; i < depth; ) { 3594 int this_order = max_order; 3595 struct page *page; 3596 int to_do; 3597 void *p; 3598 3599 while (this_order && left < order_to_size(this_order - 1)) 3600 this_order--; 3601 3602 do { 3603 page = alloc_pages_node(node, 3604 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 3605 this_order); 3606 if (page) 3607 break; 3608 if (!this_order--) 3609 break; 3610 if (order_to_size(this_order) < rq_size) 3611 break; 3612 } while (1); 3613 3614 if (!page) 3615 goto fail; 3616 3617 page->private = this_order; 3618 list_add_tail(&page->lru, &tags->page_list); 3619 3620 p = page_address(page); 3621 /* 3622 * Allow kmemleak to scan these pages as they contain pointers 3623 * to 
additional allocations like via ops->init_request(). 3624 */ 3625 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 3626 entries_per_page = order_to_size(this_order) / rq_size; 3627 to_do = min(entries_per_page, depth - i); 3628 left -= to_do * rq_size; 3629 for (j = 0; j < to_do; j++) { 3630 struct request *rq = p; 3631 3632 tags->static_rqs[i] = rq; 3633 if (blk_mq_init_request(set, rq, hctx_idx, node)) { 3634 tags->static_rqs[i] = NULL; 3635 goto fail; 3636 } 3637 3638 p += rq_size; 3639 i++; 3640 } 3641 } 3642 return 0; 3643 3644 fail: 3645 blk_mq_free_rqs(set, tags, hctx_idx); 3646 return -ENOMEM; 3647 } 3648 3649 struct rq_iter_data { 3650 struct blk_mq_hw_ctx *hctx; 3651 bool has_rq; 3652 }; 3653 3654 static bool blk_mq_has_request(struct request *rq, void *data) 3655 { 3656 struct rq_iter_data *iter_data = data; 3657 3658 if (rq->mq_hctx != iter_data->hctx) 3659 return true; 3660 iter_data->has_rq = true; 3661 return false; 3662 } 3663 3664 static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) 3665 { 3666 struct blk_mq_tags *tags = hctx->sched_tags ? 3667 hctx->sched_tags : hctx->tags; 3668 struct rq_iter_data data = { 3669 .hctx = hctx, 3670 }; 3671 int srcu_idx; 3672 3673 srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); 3674 blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); 3675 srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); 3676 3677 return data.has_rq; 3678 } 3679 3680 static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, 3681 unsigned int this_cpu) 3682 { 3683 enum hctx_type type = hctx->type; 3684 int cpu; 3685 3686 /* 3687 * hctx->cpumask has to rule out isolated CPUs, but userspace still 3688 * might submit IOs on these isolated CPUs, so use the queue map to 3689 * check if all CPUs mapped to this hctx are offline 3690 */ 3691 for_each_online_cpu(cpu) { 3692 struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, 3693 type, cpu); 3694 3695 if (h != hctx) 3696 continue; 3697 3698 /* this hctx has at least one online CPU */ 3699 if (this_cpu != cpu) 3700 return true; 3701 } 3702 3703 return false; 3704 } 3705 3706 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) 3707 { 3708 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 3709 struct blk_mq_hw_ctx, cpuhp_online); 3710 3711 if (blk_mq_hctx_has_online_cpu(hctx, cpu)) 3712 return 0; 3713 3714 /* 3715 * Prevent new request from being allocated on the current hctx. 3716 * 3717 * The smp_mb__after_atomic() Pairs with the implied barrier in 3718 * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is 3719 * seen once we return from the tag allocator. 3720 */ 3721 set_bit(BLK_MQ_S_INACTIVE, &hctx->state); 3722 smp_mb__after_atomic(); 3723 3724 /* 3725 * Try to grab a reference to the queue and wait for any outstanding 3726 * requests. If we could not grab a reference the queue has been 3727 * frozen and there are no requests. 3728 */ 3729 if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { 3730 while (blk_mq_hctx_has_requests(hctx)) 3731 msleep(5); 3732 percpu_ref_put(&hctx->queue->q_usage_counter); 3733 } 3734 3735 return 0; 3736 } 3737 3738 /* 3739 * Check if one CPU is mapped to the specified hctx 3740 * 3741 * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed 3742 * to be used for scheduling kworker only. 
For other usage, please call this 3743 * helper for checking if one CPU belongs to the specified hctx 3744 */ 3745 static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, 3746 const struct blk_mq_hw_ctx *hctx) 3747 { 3748 struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, 3749 hctx->type, cpu); 3750 3751 return mapped_hctx == hctx; 3752 } 3753 3754 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) 3755 { 3756 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 3757 struct blk_mq_hw_ctx, cpuhp_online); 3758 3759 if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) 3760 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); 3761 return 0; 3762 } 3763 3764 /* 3765 * 'cpu' is going away. splice any existing rq_list entries from this 3766 * software queue to the hw queue dispatch list, and ensure that it 3767 * gets run. 3768 */ 3769 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 3770 { 3771 struct blk_mq_hw_ctx *hctx; 3772 struct blk_mq_ctx *ctx; 3773 LIST_HEAD(tmp); 3774 enum hctx_type type; 3775 3776 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 3777 if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) 3778 return 0; 3779 3780 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 3781 type = hctx->type; 3782 3783 spin_lock(&ctx->lock); 3784 if (!list_empty(&ctx->rq_lists[type])) { 3785 list_splice_init(&ctx->rq_lists[type], &tmp); 3786 blk_mq_hctx_clear_pending(hctx, ctx); 3787 } 3788 spin_unlock(&ctx->lock); 3789 3790 if (list_empty(&tmp)) 3791 return 0; 3792 3793 spin_lock(&hctx->lock); 3794 list_splice_tail_init(&tmp, &hctx->dispatch); 3795 spin_unlock(&hctx->lock); 3796 3797 blk_mq_run_hw_queue(hctx, true); 3798 return 0; 3799 } 3800 3801 static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 3802 { 3803 lockdep_assert_held(&blk_mq_cpuhp_lock); 3804 3805 if (!(hctx->flags & BLK_MQ_F_STACKING) && 3806 !hlist_unhashed(&hctx->cpuhp_online)) { 3807 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 3808 &hctx->cpuhp_online); 3809 INIT_HLIST_NODE(&hctx->cpuhp_online); 3810 } 3811 3812 if (!hlist_unhashed(&hctx->cpuhp_dead)) { 3813 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 3814 &hctx->cpuhp_dead); 3815 INIT_HLIST_NODE(&hctx->cpuhp_dead); 3816 } 3817 } 3818 3819 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 3820 { 3821 mutex_lock(&blk_mq_cpuhp_lock); 3822 __blk_mq_remove_cpuhp(hctx); 3823 mutex_unlock(&blk_mq_cpuhp_lock); 3824 } 3825 3826 static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) 3827 { 3828 lockdep_assert_held(&blk_mq_cpuhp_lock); 3829 3830 if (!(hctx->flags & BLK_MQ_F_STACKING) && 3831 hlist_unhashed(&hctx->cpuhp_online)) 3832 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 3833 &hctx->cpuhp_online); 3834 3835 if (hlist_unhashed(&hctx->cpuhp_dead)) 3836 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, 3837 &hctx->cpuhp_dead); 3838 } 3839 3840 static void __blk_mq_remove_cpuhp_list(struct list_head *head) 3841 { 3842 struct blk_mq_hw_ctx *hctx; 3843 3844 lockdep_assert_held(&blk_mq_cpuhp_lock); 3845 3846 list_for_each_entry(hctx, head, hctx_list) 3847 __blk_mq_remove_cpuhp(hctx); 3848 } 3849 3850 /* 3851 * Unregister cpuhp callbacks from exited hw queues 3852 * 3853 * Safe to call if this `request_queue` is live 3854 */ 3855 static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) 3856 { 3857 LIST_HEAD(hctx_list); 3858 3859 spin_lock(&q->unused_hctx_lock); 3860 list_splice_init(&q->unused_hctx_list, &hctx_list); 3861 spin_unlock(&q->unused_hctx_lock); 3862 
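/*
 * Editor's note: the unused hctxs are spliced onto a private list
 * above so the cpuhp instances can be unregistered below without
 * holding the unused_hctx_lock spinlock - removing a cpuhp instance
 * takes a mutex and may sleep.
 */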
3863 mutex_lock(&blk_mq_cpuhp_lock); 3864 __blk_mq_remove_cpuhp_list(&hctx_list); 3865 mutex_unlock(&blk_mq_cpuhp_lock); 3866 3867 spin_lock(&q->unused_hctx_lock); 3868 list_splice(&hctx_list, &q->unused_hctx_list); 3869 spin_unlock(&q->unused_hctx_lock); 3870 } 3871 3872 /* 3873 * Register cpuhp callbacks from all hw queues 3874 * 3875 * Safe to call if this `request_queue` is live 3876 */ 3877 static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) 3878 { 3879 struct blk_mq_hw_ctx *hctx; 3880 unsigned long i; 3881 3882 mutex_lock(&blk_mq_cpuhp_lock); 3883 queue_for_each_hw_ctx(q, hctx, i) 3884 __blk_mq_add_cpuhp(hctx); 3885 mutex_unlock(&blk_mq_cpuhp_lock); 3886 } 3887 3888 /* 3889 * Before freeing hw queue, clearing the flush request reference in 3890 * tags->rqs[] for avoiding potential UAF. 3891 */ 3892 static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, 3893 unsigned int queue_depth, struct request *flush_rq) 3894 { 3895 int i; 3896 3897 /* The hw queue may not be mapped yet */ 3898 if (!tags) 3899 return; 3900 3901 WARN_ON_ONCE(req_ref_read(flush_rq) != 0); 3902 3903 for (i = 0; i < queue_depth; i++) 3904 cmpxchg(&tags->rqs[i], flush_rq, NULL); 3905 } 3906 3907 static void blk_free_flush_queue_callback(struct rcu_head *head) 3908 { 3909 struct blk_flush_queue *fq = 3910 container_of(head, struct blk_flush_queue, rcu_head); 3911 3912 blk_free_flush_queue(fq); 3913 } 3914 3915 /* hctx->ctxs will be freed in queue's release handler */ 3916 static void blk_mq_exit_hctx(struct request_queue *q, 3917 struct blk_mq_tag_set *set, 3918 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 3919 { 3920 struct request *flush_rq = hctx->fq->flush_rq; 3921 3922 if (blk_mq_hw_queue_mapped(hctx)) 3923 blk_mq_tag_idle(hctx); 3924 3925 if (blk_queue_init_done(q)) 3926 blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], 3927 set->queue_depth, flush_rq); 3928 if (set->ops->exit_request) 3929 set->ops->exit_request(set, flush_rq, hctx_idx); 3930 3931 if (set->ops->exit_hctx) 3932 set->ops->exit_hctx(hctx, hctx_idx); 3933 3934 call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, 3935 blk_free_flush_queue_callback); 3936 hctx->fq = NULL; 3937 3938 xa_erase(&q->hctx_table, hctx_idx); 3939 3940 spin_lock(&q->unused_hctx_lock); 3941 list_add(&hctx->hctx_list, &q->unused_hctx_list); 3942 spin_unlock(&q->unused_hctx_lock); 3943 } 3944 3945 static void blk_mq_exit_hw_queues(struct request_queue *q, 3946 struct blk_mq_tag_set *set, int nr_queue) 3947 { 3948 struct blk_mq_hw_ctx *hctx; 3949 unsigned long i; 3950 3951 queue_for_each_hw_ctx(q, hctx, i) { 3952 if (i == nr_queue) 3953 break; 3954 blk_mq_remove_cpuhp(hctx); 3955 blk_mq_exit_hctx(q, set, hctx, i); 3956 } 3957 } 3958 3959 static int blk_mq_init_hctx(struct request_queue *q, 3960 struct blk_mq_tag_set *set, 3961 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 3962 { 3963 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 3964 3965 hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); 3966 if (!hctx->fq) 3967 goto fail; 3968 3969 hctx->queue_num = hctx_idx; 3970 3971 hctx->tags = set->tags[hctx_idx]; 3972 3973 if (set->ops->init_hctx && 3974 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 3975 goto fail_free_fq; 3976 3977 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, 3978 hctx->numa_node)) 3979 goto exit_hctx; 3980 3981 if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) 3982 goto exit_flush_rq; 3983 3984 return 0; 3985 3986 exit_flush_rq: 3987 if (set->ops->exit_request) 3988 
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 3989 exit_hctx: 3990 if (set->ops->exit_hctx) 3991 set->ops->exit_hctx(hctx, hctx_idx); 3992 fail_free_fq: 3993 blk_free_flush_queue(hctx->fq); 3994 hctx->fq = NULL; 3995 fail: 3996 return -1; 3997 } 3998 3999 static struct blk_mq_hw_ctx * 4000 blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, 4001 int node) 4002 { 4003 struct blk_mq_hw_ctx *hctx; 4004 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 4005 4006 hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); 4007 if (!hctx) 4008 goto fail_alloc_hctx; 4009 4010 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) 4011 goto free_hctx; 4012 4013 atomic_set(&hctx->nr_active, 0); 4014 if (node == NUMA_NO_NODE) 4015 node = set->numa_node; 4016 hctx->numa_node = node; 4017 4018 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 4019 spin_lock_init(&hctx->lock); 4020 INIT_LIST_HEAD(&hctx->dispatch); 4021 INIT_HLIST_NODE(&hctx->cpuhp_dead); 4022 INIT_HLIST_NODE(&hctx->cpuhp_online); 4023 hctx->queue = q; 4024 hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; 4025 4026 INIT_LIST_HEAD(&hctx->hctx_list); 4027 4028 /* 4029 * Allocate space for all possible cpus to avoid allocation at 4030 * runtime 4031 */ 4032 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 4033 gfp, node); 4034 if (!hctx->ctxs) 4035 goto free_cpumask; 4036 4037 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 4038 gfp, node, false, false)) 4039 goto free_ctxs; 4040 hctx->nr_ctx = 0; 4041 4042 spin_lock_init(&hctx->dispatch_wait_lock); 4043 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 4044 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 4045 4046 blk_mq_hctx_kobj_init(hctx); 4047 4048 return hctx; 4049 4050 free_ctxs: 4051 kfree(hctx->ctxs); 4052 free_cpumask: 4053 free_cpumask_var(hctx->cpumask); 4054 free_hctx: 4055 kfree(hctx); 4056 fail_alloc_hctx: 4057 return NULL; 4058 } 4059 4060 static void blk_mq_init_cpu_queues(struct request_queue *q, 4061 unsigned int nr_hw_queues) 4062 { 4063 struct blk_mq_tag_set *set = q->tag_set; 4064 unsigned int i, j; 4065 4066 for_each_possible_cpu(i) { 4067 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 4068 struct blk_mq_hw_ctx *hctx; 4069 int k; 4070 4071 __ctx->cpu = i; 4072 spin_lock_init(&__ctx->lock); 4073 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) 4074 INIT_LIST_HEAD(&__ctx->rq_lists[k]); 4075 4076 __ctx->queue = q; 4077 4078 /* 4079 * Set local node, IFF we have more than one hw queue. 
If 4080 * not, we remain on the home node of the device 4081 */ 4082 for (j = 0; j < set->nr_maps; j++) { 4083 hctx = blk_mq_map_queue_type(q, j, i); 4084 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 4085 hctx->numa_node = cpu_to_node(i); 4086 } 4087 } 4088 } 4089 4090 struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 4091 unsigned int hctx_idx, 4092 unsigned int depth) 4093 { 4094 struct blk_mq_tags *tags; 4095 int ret; 4096 4097 tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); 4098 if (!tags) 4099 return NULL; 4100 4101 ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); 4102 if (ret) { 4103 blk_mq_free_rq_map(set, tags); 4104 return NULL; 4105 } 4106 4107 return tags; 4108 } 4109 4110 static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 4111 int hctx_idx) 4112 { 4113 if (blk_mq_is_shared_tags(set->flags)) { 4114 set->tags[hctx_idx] = set->shared_tags; 4115 4116 return true; 4117 } 4118 4119 set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, 4120 set->queue_depth); 4121 4122 return set->tags[hctx_idx]; 4123 } 4124 4125 void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 4126 struct blk_mq_tags *tags, 4127 unsigned int hctx_idx) 4128 { 4129 if (tags) { 4130 blk_mq_free_rqs(set, tags, hctx_idx); 4131 blk_mq_free_rq_map(set, tags); 4132 } 4133 } 4134 4135 static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 4136 unsigned int hctx_idx) 4137 { 4138 if (!blk_mq_is_shared_tags(set->flags)) 4139 blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); 4140 4141 set->tags[hctx_idx] = NULL; 4142 } 4143 4144 static void blk_mq_map_swqueue(struct request_queue *q) 4145 { 4146 unsigned int j, hctx_idx; 4147 unsigned long i; 4148 struct blk_mq_hw_ctx *hctx; 4149 struct blk_mq_ctx *ctx; 4150 struct blk_mq_tag_set *set = q->tag_set; 4151 4152 queue_for_each_hw_ctx(q, hctx, i) { 4153 cpumask_clear(hctx->cpumask); 4154 hctx->nr_ctx = 0; 4155 hctx->dispatch_from = NULL; 4156 } 4157 4158 /* 4159 * Map software to hardware queues. 4160 * 4161 * If the cpu isn't present, the cpu is mapped to first hctx. 4162 */ 4163 for_each_possible_cpu(i) { 4164 4165 ctx = per_cpu_ptr(q->queue_ctx, i); 4166 for (j = 0; j < set->nr_maps; j++) { 4167 if (!set->map[j].nr_queues) { 4168 ctx->hctxs[j] = blk_mq_map_queue_type(q, 4169 HCTX_TYPE_DEFAULT, i); 4170 continue; 4171 } 4172 hctx_idx = set->map[j].mq_map[i]; 4173 /* unmapped hw queue can be remapped after CPU topo changed */ 4174 if (!set->tags[hctx_idx] && 4175 !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { 4176 /* 4177 * If tags initialization fail for some hctx, 4178 * that hctx won't be brought online. In this 4179 * case, remap the current ctx to hctx[0] which 4180 * is guaranteed to always have tags allocated 4181 */ 4182 set->map[j].mq_map[i] = 0; 4183 } 4184 4185 hctx = blk_mq_map_queue_type(q, j, i); 4186 ctx->hctxs[j] = hctx; 4187 /* 4188 * If the CPU is already set in the mask, then we've 4189 * mapped this one already. This can happen if 4190 * devices share queues across queue maps. 4191 */ 4192 if (cpumask_test_cpu(i, hctx->cpumask)) 4193 continue; 4194 4195 cpumask_set_cpu(i, hctx->cpumask); 4196 hctx->type = j; 4197 ctx->index_hw[hctx->type] = hctx->nr_ctx; 4198 hctx->ctxs[hctx->nr_ctx++] = ctx; 4199 4200 /* 4201 * If the nr_ctx type overflows, we have exceeded the 4202 * amount of sw queues we can support. 
4203 */ 4204 BUG_ON(!hctx->nr_ctx); 4205 } 4206 4207 for (; j < HCTX_MAX_TYPES; j++) 4208 ctx->hctxs[j] = blk_mq_map_queue_type(q, 4209 HCTX_TYPE_DEFAULT, i); 4210 } 4211 4212 queue_for_each_hw_ctx(q, hctx, i) { 4213 int cpu; 4214 4215 /* 4216 * If no software queues are mapped to this hardware queue, 4217 * disable it and free the request entries. 4218 */ 4219 if (!hctx->nr_ctx) { 4220 /* Never unmap queue 0. We need it as a 4221 * fallback in case of a new remap fails 4222 * allocation 4223 */ 4224 if (i) 4225 __blk_mq_free_map_and_rqs(set, i); 4226 4227 hctx->tags = NULL; 4228 continue; 4229 } 4230 4231 hctx->tags = set->tags[i]; 4232 WARN_ON(!hctx->tags); 4233 4234 /* 4235 * Set the map size to the number of mapped software queues. 4236 * This is more accurate and more efficient than looping 4237 * over all possibly mapped software queues. 4238 */ 4239 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 4240 4241 /* 4242 * Rule out isolated CPUs from hctx->cpumask to avoid 4243 * running block kworker on isolated CPUs 4244 */ 4245 for_each_cpu(cpu, hctx->cpumask) { 4246 if (cpu_is_isolated(cpu)) 4247 cpumask_clear_cpu(cpu, hctx->cpumask); 4248 } 4249 4250 /* 4251 * Initialize batch roundrobin counts 4252 */ 4253 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); 4254 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 4255 } 4256 } 4257 4258 /* 4259 * Caller needs to ensure that we're either frozen/quiesced, or that 4260 * the queue isn't live yet. 4261 */ 4262 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 4263 { 4264 struct blk_mq_hw_ctx *hctx; 4265 unsigned long i; 4266 4267 queue_for_each_hw_ctx(q, hctx, i) { 4268 if (shared) { 4269 hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 4270 } else { 4271 blk_mq_tag_idle(hctx); 4272 hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 4273 } 4274 } 4275 } 4276 4277 static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, 4278 bool shared) 4279 { 4280 struct request_queue *q; 4281 unsigned int memflags; 4282 4283 lockdep_assert_held(&set->tag_list_lock); 4284 4285 list_for_each_entry(q, &set->tag_list, tag_set_list) { 4286 memflags = blk_mq_freeze_queue(q); 4287 queue_set_hctx_shared(q, shared); 4288 blk_mq_unfreeze_queue(q, memflags); 4289 } 4290 } 4291 4292 static void blk_mq_del_queue_tag_set(struct request_queue *q) 4293 { 4294 struct blk_mq_tag_set *set = q->tag_set; 4295 4296 mutex_lock(&set->tag_list_lock); 4297 list_del(&q->tag_set_list); 4298 if (list_is_singular(&set->tag_list)) { 4299 /* just transitioned to unshared */ 4300 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 4301 /* update existing queue */ 4302 blk_mq_update_tag_set_shared(set, false); 4303 } 4304 mutex_unlock(&set->tag_list_lock); 4305 INIT_LIST_HEAD(&q->tag_set_list); 4306 } 4307 4308 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 4309 struct request_queue *q) 4310 { 4311 mutex_lock(&set->tag_list_lock); 4312 4313 /* 4314 * Check to see if we're transitioning to shared (from 1 to 2 queues). 
4315 */ 4316 if (!list_empty(&set->tag_list) && 4317 !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 4318 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 4319 /* update existing queue */ 4320 blk_mq_update_tag_set_shared(set, true); 4321 } 4322 if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) 4323 queue_set_hctx_shared(q, true); 4324 list_add_tail(&q->tag_set_list, &set->tag_list); 4325 4326 mutex_unlock(&set->tag_list_lock); 4327 } 4328 4329 /* All allocations will be freed in release handler of q->mq_kobj */ 4330 static int blk_mq_alloc_ctxs(struct request_queue *q) 4331 { 4332 struct blk_mq_ctxs *ctxs; 4333 int cpu; 4334 4335 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 4336 if (!ctxs) 4337 return -ENOMEM; 4338 4339 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 4340 if (!ctxs->queue_ctx) 4341 goto fail; 4342 4343 for_each_possible_cpu(cpu) { 4344 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 4345 ctx->ctxs = ctxs; 4346 } 4347 4348 q->mq_kobj = &ctxs->kobj; 4349 q->queue_ctx = ctxs->queue_ctx; 4350 4351 return 0; 4352 fail: 4353 kfree(ctxs); 4354 return -ENOMEM; 4355 } 4356 4357 /* 4358 * It is the actual release handler for mq, but we do it from 4359 * request queue's release handler for avoiding use-after-free 4360 * and headache because q->mq_kobj shouldn't have been introduced, 4361 * but we can't group ctx/kctx kobj without it. 4362 */ 4363 void blk_mq_release(struct request_queue *q) 4364 { 4365 struct blk_mq_hw_ctx *hctx, *next; 4366 unsigned long i; 4367 4368 queue_for_each_hw_ctx(q, hctx, i) 4369 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); 4370 4371 /* all hctx are in .unused_hctx_list now */ 4372 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { 4373 list_del_init(&hctx->hctx_list); 4374 kobject_put(&hctx->kobj); 4375 } 4376 4377 xa_destroy(&q->hctx_table); 4378 4379 /* 4380 * release .mq_kobj and sw queue's kobject now because 4381 * both share lifetime with request queue. 4382 */ 4383 blk_mq_sysfs_deinit(q); 4384 } 4385 4386 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, 4387 struct queue_limits *lim, void *queuedata) 4388 { 4389 struct queue_limits default_lim = { }; 4390 struct request_queue *q; 4391 int ret; 4392 4393 if (!lim) 4394 lim = &default_lim; 4395 lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 4396 if (set->nr_maps > HCTX_TYPE_POLL) 4397 lim->features |= BLK_FEAT_POLL; 4398 4399 q = blk_alloc_queue(lim, set->numa_node); 4400 if (IS_ERR(q)) 4401 return q; 4402 q->queuedata = queuedata; 4403 ret = blk_mq_init_allocated_queue(set, q); 4404 if (ret) { 4405 blk_put_queue(q); 4406 return ERR_PTR(ret); 4407 } 4408 return q; 4409 } 4410 EXPORT_SYMBOL(blk_mq_alloc_queue); 4411 4412 /** 4413 * blk_mq_destroy_queue - shutdown a request queue 4414 * @q: request queue to shutdown 4415 * 4416 * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future 4417 * requests will be failed with -ENODEV. The caller is responsible for dropping 4418 * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). 
4419 * 4420 * Context: can sleep 4421 */ 4422 void blk_mq_destroy_queue(struct request_queue *q) 4423 { 4424 WARN_ON_ONCE(!queue_is_mq(q)); 4425 WARN_ON_ONCE(blk_queue_registered(q)); 4426 4427 might_sleep(); 4428 4429 blk_queue_flag_set(QUEUE_FLAG_DYING, q); 4430 blk_queue_start_drain(q); 4431 blk_mq_freeze_queue_wait(q); 4432 4433 blk_sync_queue(q); 4434 blk_mq_cancel_work_sync(q); 4435 blk_mq_exit_queue(q); 4436 } 4437 EXPORT_SYMBOL(blk_mq_destroy_queue); 4438 4439 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, 4440 struct queue_limits *lim, void *queuedata, 4441 struct lock_class_key *lkclass) 4442 { 4443 struct request_queue *q; 4444 struct gendisk *disk; 4445 4446 q = blk_mq_alloc_queue(set, lim, queuedata); 4447 if (IS_ERR(q)) 4448 return ERR_CAST(q); 4449 4450 disk = __alloc_disk_node(q, set->numa_node, lkclass); 4451 if (!disk) { 4452 blk_mq_destroy_queue(q); 4453 blk_put_queue(q); 4454 return ERR_PTR(-ENOMEM); 4455 } 4456 set_bit(GD_OWNS_QUEUE, &disk->state); 4457 return disk; 4458 } 4459 EXPORT_SYMBOL(__blk_mq_alloc_disk); 4460 4461 struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, 4462 struct lock_class_key *lkclass) 4463 { 4464 struct gendisk *disk; 4465 4466 if (!blk_get_queue(q)) 4467 return NULL; 4468 disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); 4469 if (!disk) 4470 blk_put_queue(q); 4471 return disk; 4472 } 4473 EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); 4474 4475 /* 4476 * Only hctx removed from cpuhp list can be reused 4477 */ 4478 static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) 4479 { 4480 return hlist_unhashed(&hctx->cpuhp_online) && 4481 hlist_unhashed(&hctx->cpuhp_dead); 4482 } 4483 4484 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 4485 struct blk_mq_tag_set *set, struct request_queue *q, 4486 int hctx_idx, int node) 4487 { 4488 struct blk_mq_hw_ctx *hctx = NULL, *tmp; 4489 4490 /* reuse dead hctx first */ 4491 spin_lock(&q->unused_hctx_lock); 4492 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { 4493 if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { 4494 hctx = tmp; 4495 break; 4496 } 4497 } 4498 if (hctx) 4499 list_del_init(&hctx->hctx_list); 4500 spin_unlock(&q->unused_hctx_lock); 4501 4502 if (!hctx) 4503 hctx = blk_mq_alloc_hctx(q, set, node); 4504 if (!hctx) 4505 goto fail; 4506 4507 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) 4508 goto free_hctx; 4509 4510 return hctx; 4511 4512 free_hctx: 4513 kobject_put(&hctx->kobj); 4514 fail: 4515 return NULL; 4516 } 4517 4518 static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4519 struct request_queue *q) 4520 { 4521 struct blk_mq_hw_ctx *hctx; 4522 unsigned long i, j; 4523 4524 for (i = 0; i < set->nr_hw_queues; i++) { 4525 int old_node; 4526 int node = blk_mq_get_hctx_node(set, i); 4527 struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); 4528 4529 if (old_hctx) { 4530 old_node = old_hctx->numa_node; 4531 blk_mq_exit_hctx(q, set, old_hctx, i); 4532 } 4533 4534 if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { 4535 if (!old_hctx) 4536 break; 4537 pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", 4538 node, old_node); 4539 hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); 4540 WARN_ON_ONCE(!hctx); 4541 } 4542 } 4543 /* 4544 * Increasing nr_hw_queues fails. Free the newly allocated 4545 * hctxs and keep the previous q->nr_hw_queues. 
4546 */ 4547 if (i != set->nr_hw_queues) { 4548 j = q->nr_hw_queues; 4549 } else { 4550 j = i; 4551 q->nr_hw_queues = set->nr_hw_queues; 4552 } 4553 4554 xa_for_each_start(&q->hctx_table, j, hctx, j) 4555 blk_mq_exit_hctx(q, set, hctx, j); 4556 } 4557 4558 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4559 struct request_queue *q) 4560 { 4561 __blk_mq_realloc_hw_ctxs(set, q); 4562 4563 /* unregister cpuhp callbacks for exited hctxs */ 4564 blk_mq_remove_hw_queues_cpuhp(q); 4565 4566 /* register cpuhp callbacks for newly initialized hctxs */ 4567 blk_mq_add_hw_queues_cpuhp(q); 4568 } 4569 4570 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 4571 struct request_queue *q) 4572 { 4573 /* mark the queue as mq asap */ 4574 q->mq_ops = set->ops; 4575 4576 /* 4577 * ->tag_set has to be set up before initializing hctx, because the cpuhp 4578 * handler needs it to check the queue mapping 4579 */ 4580 q->tag_set = set; 4581 4582 if (blk_mq_alloc_ctxs(q)) 4583 goto err_exit; 4584 4585 /* init q->mq_kobj and sw queues' kobjects */ 4586 blk_mq_sysfs_init(q); 4587 4588 INIT_LIST_HEAD(&q->unused_hctx_list); 4589 spin_lock_init(&q->unused_hctx_lock); 4590 4591 xa_init(&q->hctx_table); 4592 4593 blk_mq_realloc_hw_ctxs(set, q); 4594 if (!q->nr_hw_queues) 4595 goto err_hctxs; 4596 4597 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 4598 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 4599 4600 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 4601 4602 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 4603 INIT_LIST_HEAD(&q->flush_list); 4604 INIT_LIST_HEAD(&q->requeue_list); 4605 spin_lock_init(&q->requeue_lock); 4606 4607 q->nr_requests = set->queue_depth; 4608 4609 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 4610 blk_mq_map_swqueue(q); 4611 blk_mq_add_queue_tag_set(set, q); 4612 return 0; 4613 4614 err_hctxs: 4615 blk_mq_release(q); 4616 err_exit: 4617 q->mq_ops = NULL; 4618 return -ENOMEM; 4619 } 4620 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 4621 4622 /* tags can _not_ be used after returning from blk_mq_exit_queue */ 4623 void blk_mq_exit_queue(struct request_queue *q) 4624 { 4625 struct blk_mq_tag_set *set = q->tag_set; 4626 4627 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ 4628 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 4629 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ 4630 blk_mq_del_queue_tag_set(q); 4631 } 4632 4633 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 4634 { 4635 int i; 4636 4637 if (blk_mq_is_shared_tags(set->flags)) { 4638 set->shared_tags = blk_mq_alloc_map_and_rqs(set, 4639 BLK_MQ_NO_HCTX_IDX, 4640 set->queue_depth); 4641 if (!set->shared_tags) 4642 return -ENOMEM; 4643 } 4644 4645 for (i = 0; i < set->nr_hw_queues; i++) { 4646 if (!__blk_mq_alloc_map_and_rqs(set, i)) 4647 goto out_unwind; 4648 cond_resched(); 4649 } 4650 4651 return 0; 4652 4653 out_unwind: 4654 while (--i >= 0) 4655 __blk_mq_free_map_and_rqs(set, i); 4656 4657 if (blk_mq_is_shared_tags(set->flags)) { 4658 blk_mq_free_map_and_rqs(set, set->shared_tags, 4659 BLK_MQ_NO_HCTX_IDX); 4660 } 4661 4662 return -ENOMEM; 4663 } 4664 4665 /* 4666 * Allocate the request maps associated with this tag_set. Note that this 4667 * may reduce the depth asked for, if memory is tight. set->queue_depth 4668 * will be updated to reflect the allocated depth.
4669 */ 4670 static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) 4671 { 4672 unsigned int depth; 4673 int err; 4674 4675 depth = set->queue_depth; 4676 do { 4677 err = __blk_mq_alloc_rq_maps(set); 4678 if (!err) 4679 break; 4680 4681 set->queue_depth >>= 1; 4682 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 4683 err = -ENOMEM; 4684 break; 4685 } 4686 } while (set->queue_depth); 4687 4688 if (!set->queue_depth || err) { 4689 pr_err("blk-mq: failed to allocate request map\n"); 4690 return -ENOMEM; 4691 } 4692 4693 if (depth != set->queue_depth) 4694 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 4695 depth, set->queue_depth); 4696 4697 return 0; 4698 } 4699 4700 static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) 4701 { 4702 /* 4703 * blk_mq_map_queues() and multiple .map_queues() implementations 4704 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the 4705 * number of hardware queues. 4706 */ 4707 if (set->nr_maps == 1) 4708 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; 4709 4710 if (set->ops->map_queues) { 4711 int i; 4712 4713 /* 4714 * transport .map_queues is usually done in the following 4715 * way: 4716 * 4717 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 4718 * mask = get_cpu_mask(queue) 4719 * for_each_cpu(cpu, mask) 4720 * set->map[x].mq_map[cpu] = queue; 4721 * } 4722 * 4723 * When we need to remap, the table has to be cleared for 4724 * killing stale mapping since one CPU may not be mapped 4725 * to any hw queue. 4726 */ 4727 for (i = 0; i < set->nr_maps; i++) 4728 blk_mq_clear_mq_map(&set->map[i]); 4729 4730 set->ops->map_queues(set); 4731 } else { 4732 BUG_ON(set->nr_maps > 1); 4733 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4734 } 4735 } 4736 4737 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, 4738 int new_nr_hw_queues) 4739 { 4740 struct blk_mq_tags **new_tags; 4741 int i; 4742 4743 if (set->nr_hw_queues >= new_nr_hw_queues) 4744 goto done; 4745 4746 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), 4747 GFP_KERNEL, set->numa_node); 4748 if (!new_tags) 4749 return -ENOMEM; 4750 4751 if (set->tags) 4752 memcpy(new_tags, set->tags, set->nr_hw_queues * 4753 sizeof(*set->tags)); 4754 kfree(set->tags); 4755 set->tags = new_tags; 4756 4757 for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { 4758 if (!__blk_mq_alloc_map_and_rqs(set, i)) { 4759 while (--i >= set->nr_hw_queues) 4760 __blk_mq_free_map_and_rqs(set, i); 4761 return -ENOMEM; 4762 } 4763 cond_resched(); 4764 } 4765 4766 done: 4767 set->nr_hw_queues = new_nr_hw_queues; 4768 return 0; 4769 } 4770 4771 /* 4772 * Alloc a tag set to be associated with one or more request queues. 4773 * May fail with EINVAL for various error conditions. May adjust the 4774 * requested depth down, if it's too large. In that case, the set 4775 * value will be stored in set->queue_depth. 
4776 */ 4777 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 4778 { 4779 int i, ret; 4780 4781 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 4782 4783 if (!set->nr_hw_queues) 4784 return -EINVAL; 4785 if (!set->queue_depth) 4786 return -EINVAL; 4787 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 4788 return -EINVAL; 4789 4790 if (!set->ops->queue_rq) 4791 return -EINVAL; 4792 4793 if (!set->ops->get_budget ^ !set->ops->put_budget) 4794 return -EINVAL; 4795 4796 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 4797 pr_info("blk-mq: reduced tag depth to %u\n", 4798 BLK_MQ_MAX_DEPTH); 4799 set->queue_depth = BLK_MQ_MAX_DEPTH; 4800 } 4801 4802 if (!set->nr_maps) 4803 set->nr_maps = 1; 4804 else if (set->nr_maps > HCTX_MAX_TYPES) 4805 return -EINVAL; 4806 4807 /* 4808 * If a crashdump is active, then we are potentially in a very 4809 * memory constrained environment. Limit us to 64 tags to prevent 4810 * using too much memory. 4811 */ 4812 if (is_kdump_kernel()) 4813 set->queue_depth = min(64U, set->queue_depth); 4814 4815 /* 4816 * There is no use for more h/w queues than cpus if we just have 4817 * a single map 4818 */ 4819 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 4820 set->nr_hw_queues = nr_cpu_ids; 4821 4822 if (set->flags & BLK_MQ_F_BLOCKING) { 4823 set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); 4824 if (!set->srcu) 4825 return -ENOMEM; 4826 ret = init_srcu_struct(set->srcu); 4827 if (ret) 4828 goto out_free_srcu; 4829 } 4830 ret = init_srcu_struct(&set->tags_srcu); 4831 if (ret) 4832 goto out_cleanup_srcu; 4833 4834 init_rwsem(&set->update_nr_hwq_lock); 4835 4836 ret = -ENOMEM; 4837 set->tags = kcalloc_node(set->nr_hw_queues, 4838 sizeof(struct blk_mq_tags *), GFP_KERNEL, 4839 set->numa_node); 4840 if (!set->tags) 4841 goto out_cleanup_tags_srcu; 4842 4843 for (i = 0; i < set->nr_maps; i++) { 4844 set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 4845 sizeof(set->map[i].mq_map[0]), 4846 GFP_KERNEL, set->numa_node); 4847 if (!set->map[i].mq_map) 4848 goto out_free_mq_map; 4849 set->map[i].nr_queues = set->nr_hw_queues; 4850 } 4851 4852 blk_mq_update_queue_map(set); 4853 4854 ret = blk_mq_alloc_set_map_and_rqs(set); 4855 if (ret) 4856 goto out_free_mq_map; 4857 4858 mutex_init(&set->tag_list_lock); 4859 INIT_LIST_HEAD(&set->tag_list); 4860 4861 return 0; 4862 4863 out_free_mq_map: 4864 for (i = 0; i < set->nr_maps; i++) { 4865 kfree(set->map[i].mq_map); 4866 set->map[i].mq_map = NULL; 4867 } 4868 kfree(set->tags); 4869 set->tags = NULL; 4870 out_cleanup_tags_srcu: 4871 cleanup_srcu_struct(&set->tags_srcu); 4872 out_cleanup_srcu: 4873 if (set->flags & BLK_MQ_F_BLOCKING) 4874 cleanup_srcu_struct(set->srcu); 4875 out_free_srcu: 4876 if (set->flags & BLK_MQ_F_BLOCKING) 4877 kfree(set->srcu); 4878 return ret; 4879 } 4880 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 4881 4882 /* allocate and initialize a tagset for a simple single-queue device */ 4883 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, 4884 const struct blk_mq_ops *ops, unsigned int queue_depth, 4885 unsigned int set_flags) 4886 { 4887 memset(set, 0, sizeof(*set)); 4888 set->ops = ops; 4889 set->nr_hw_queues = 1; 4890 set->nr_maps = 1; 4891 set->queue_depth = queue_depth; 4892 set->numa_node = NUMA_NO_NODE; 4893 set->flags = set_flags; 4894 return blk_mq_alloc_tag_set(set); 4895 } 4896 EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); 4897 4898 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 4899 { 4900 int i, j; 4901 4902 for (i = 0; i < set->nr_hw_queues; i++) 4903 
__blk_mq_free_map_and_rqs(set, i); 4904 4905 if (blk_mq_is_shared_tags(set->flags)) { 4906 blk_mq_free_map_and_rqs(set, set->shared_tags, 4907 BLK_MQ_NO_HCTX_IDX); 4908 } 4909 4910 for (j = 0; j < set->nr_maps; j++) { 4911 kfree(set->map[j].mq_map); 4912 set->map[j].mq_map = NULL; 4913 } 4914 4915 kfree(set->tags); 4916 set->tags = NULL; 4917 4918 srcu_barrier(&set->tags_srcu); 4919 cleanup_srcu_struct(&set->tags_srcu); 4920 if (set->flags & BLK_MQ_F_BLOCKING) { 4921 cleanup_srcu_struct(set->srcu); 4922 kfree(set->srcu); 4923 } 4924 } 4925 EXPORT_SYMBOL(blk_mq_free_tag_set); 4926 4927 struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, 4928 struct elevator_tags *et, 4929 unsigned int nr) 4930 { 4931 struct blk_mq_tag_set *set = q->tag_set; 4932 struct elevator_tags *old_et = NULL; 4933 struct blk_mq_hw_ctx *hctx; 4934 unsigned long i; 4935 4936 blk_mq_quiesce_queue(q); 4937 4938 if (blk_mq_is_shared_tags(set->flags)) { 4939 /* 4940 * Shared tags, for sched tags, we allocate max initially hence 4941 * tags can't grow, see blk_mq_alloc_sched_tags(). 4942 */ 4943 if (q->elevator) 4944 blk_mq_tag_update_sched_shared_tags(q); 4945 else 4946 blk_mq_tag_resize_shared_tags(set, nr); 4947 } else if (!q->elevator) { 4948 /* 4949 * Non-shared hardware tags, nr is already checked from 4950 * queue_requests_store() and tags can't grow. 4951 */ 4952 queue_for_each_hw_ctx(q, hctx, i) { 4953 if (!hctx->tags) 4954 continue; 4955 sbitmap_queue_resize(&hctx->tags->bitmap_tags, 4956 nr - hctx->tags->nr_reserved_tags); 4957 } 4958 } else if (nr <= q->elevator->et->nr_requests) { 4959 /* Non-shared sched tags, and tags don't grow. */ 4960 queue_for_each_hw_ctx(q, hctx, i) { 4961 if (!hctx->sched_tags) 4962 continue; 4963 sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, 4964 nr - hctx->sched_tags->nr_reserved_tags); 4965 } 4966 } else { 4967 /* Non-shared sched tags, and tags grow */ 4968 queue_for_each_hw_ctx(q, hctx, i) 4969 hctx->sched_tags = et->tags[i]; 4970 old_et = q->elevator->et; 4971 q->elevator->et = et; 4972 } 4973 4974 q->nr_requests = nr; 4975 if (q->elevator && q->elevator->type->ops.depth_updated) 4976 q->elevator->type->ops.depth_updated(q); 4977 4978 blk_mq_unquiesce_queue(q); 4979 return old_et; 4980 } 4981 4982 /* 4983 * Switch back to the elevator type stored in the xarray. 4984 */ 4985 static void blk_mq_elv_switch_back(struct request_queue *q, 4986 struct xarray *elv_tbl, struct xarray *et_tbl) 4987 { 4988 struct elevator_type *e = xa_load(elv_tbl, q->id); 4989 struct elevator_tags *t = xa_load(et_tbl, q->id); 4990 4991 /* The elv_update_nr_hw_queues unfreezes the queue. */ 4992 elv_update_nr_hw_queues(q, e, t); 4993 4994 /* Drop the reference acquired in blk_mq_elv_switch_none. */ 4995 if (e) 4996 elevator_put(e); 4997 } 4998 4999 /* 5000 * Stores elevator type in xarray and set current elevator to none. It uses 5001 * q->id as an index to store the elevator type into the xarray. 5002 */ 5003 static int blk_mq_elv_switch_none(struct request_queue *q, 5004 struct xarray *elv_tbl) 5005 { 5006 int ret = 0; 5007 5008 lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); 5009 5010 /* 5011 * Accessing q->elevator without holding q->elevator_lock is safe here 5012 * because we're called from nr_hw_queue update which is protected by 5013 * set->update_nr_hwq_lock in the writer context. So, scheduler update/ 5014 * switch code (which acquires the same lock in the reader context) 5015 * can't run concurrently. 
5016 */ 5017 if (q->elevator) { 5018 5019 ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); 5020 if (WARN_ON_ONCE(ret)) 5021 return ret; 5022 5023 /* 5024 * Before we switch elevator to 'none', take a reference to 5025 * the elevator module so that while nr_hw_queue update is 5026 * running, no one can remove elevator module. We'd put the 5027 * reference to elevator module later when we switch back 5028 * elevator. 5029 */ 5030 __elevator_get(q->elevator->type); 5031 5032 elevator_set_none(q); 5033 } 5034 return ret; 5035 } 5036 5037 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 5038 int nr_hw_queues) 5039 { 5040 struct request_queue *q; 5041 int prev_nr_hw_queues = set->nr_hw_queues; 5042 unsigned int memflags; 5043 int i; 5044 struct xarray elv_tbl, et_tbl; 5045 bool queues_frozen = false; 5046 5047 lockdep_assert_held(&set->tag_list_lock); 5048 5049 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) 5050 nr_hw_queues = nr_cpu_ids; 5051 if (nr_hw_queues < 1) 5052 return; 5053 if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) 5054 return; 5055 5056 memflags = memalloc_noio_save(); 5057 5058 xa_init(&et_tbl); 5059 if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) 5060 goto out_memalloc_restore; 5061 5062 xa_init(&elv_tbl); 5063 5064 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5065 blk_mq_debugfs_unregister_hctxs(q); 5066 blk_mq_sysfs_unregister_hctxs(q); 5067 } 5068 5069 /* 5070 * Switch IO scheduler to 'none', cleaning up the data associated 5071 * with the previous scheduler. We will switch back once we are done 5072 * updating the new sw to hw queue mappings. 5073 */ 5074 list_for_each_entry(q, &set->tag_list, tag_set_list) 5075 if (blk_mq_elv_switch_none(q, &elv_tbl)) 5076 goto switch_back; 5077 5078 list_for_each_entry(q, &set->tag_list, tag_set_list) 5079 blk_mq_freeze_queue_nomemsave(q); 5080 queues_frozen = true; 5081 if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) 5082 goto switch_back; 5083 5084 fallback: 5085 blk_mq_update_queue_map(set); 5086 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5087 __blk_mq_realloc_hw_ctxs(set, q); 5088 5089 if (q->nr_hw_queues != set->nr_hw_queues) { 5090 int i = prev_nr_hw_queues; 5091 5092 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 5093 nr_hw_queues, prev_nr_hw_queues); 5094 for (; i < set->nr_hw_queues; i++) 5095 __blk_mq_free_map_and_rqs(set, i); 5096 5097 set->nr_hw_queues = prev_nr_hw_queues; 5098 goto fallback; 5099 } 5100 blk_mq_map_swqueue(q); 5101 } 5102 switch_back: 5103 /* The blk_mq_elv_switch_back unfreezes queue for us. */ 5104 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5105 /* switch_back expects queue to be frozen */ 5106 if (!queues_frozen) 5107 blk_mq_freeze_queue_nomemsave(q); 5108 blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); 5109 } 5110 5111 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5112 blk_mq_sysfs_register_hctxs(q); 5113 blk_mq_debugfs_register_hctxs(q); 5114 5115 blk_mq_remove_hw_queues_cpuhp(q); 5116 blk_mq_add_hw_queues_cpuhp(q); 5117 } 5118 5119 xa_destroy(&elv_tbl); 5120 xa_destroy(&et_tbl); 5121 out_memalloc_restore: 5122 memalloc_noio_restore(memflags); 5123 5124 /* Free the excess tags when nr_hw_queues shrink. 
*/ 5125 for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) 5126 __blk_mq_free_map_and_rqs(set, i); 5127 } 5128 5129 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 5130 { 5131 down_write(&set->update_nr_hwq_lock); 5132 mutex_lock(&set->tag_list_lock); 5133 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 5134 mutex_unlock(&set->tag_list_lock); 5135 up_write(&set->update_nr_hwq_lock); 5136 } 5137 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 5138 5139 static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 5140 struct io_comp_batch *iob, unsigned int flags) 5141 { 5142 long state = get_current_state(); 5143 int ret; 5144 5145 do { 5146 ret = q->mq_ops->poll(hctx, iob); 5147 if (ret > 0) { 5148 __set_current_state(TASK_RUNNING); 5149 return ret; 5150 } 5151 5152 if (signal_pending_state(state, current)) 5153 __set_current_state(TASK_RUNNING); 5154 if (task_is_running(current)) 5155 return 1; 5156 5157 if (ret < 0 || (flags & BLK_POLL_ONESHOT)) 5158 break; 5159 cpu_relax(); 5160 } while (!need_resched()); 5161 5162 __set_current_state(TASK_RUNNING); 5163 return 0; 5164 } 5165 5166 int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, 5167 struct io_comp_batch *iob, unsigned int flags) 5168 { 5169 if (!blk_mq_can_poll(q)) 5170 return 0; 5171 return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); 5172 } 5173 5174 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, 5175 unsigned int poll_flags) 5176 { 5177 struct request_queue *q = rq->q; 5178 int ret; 5179 5180 if (!blk_rq_is_poll(rq)) 5181 return 0; 5182 if (!percpu_ref_tryget(&q->q_usage_counter)) 5183 return 0; 5184 5185 ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); 5186 blk_queue_exit(q); 5187 5188 return ret; 5189 } 5190 EXPORT_SYMBOL_GPL(blk_rq_poll); 5191 5192 unsigned int blk_mq_rq_cpu(struct request *rq) 5193 { 5194 return rq->mq_ctx->cpu; 5195 } 5196 EXPORT_SYMBOL(blk_mq_rq_cpu); 5197 5198 void blk_mq_cancel_work_sync(struct request_queue *q) 5199 { 5200 struct blk_mq_hw_ctx *hctx; 5201 unsigned long i; 5202 5203 cancel_delayed_work_sync(&q->requeue_work); 5204 5205 queue_for_each_hw_ctx(q, hctx, i) 5206 cancel_delayed_work_sync(&hctx->run_work); 5207 } 5208 5209 static int __init blk_mq_init(void) 5210 { 5211 int i; 5212 5213 for_each_possible_cpu(i) 5214 init_llist_head(&per_cpu(blk_cpu_done, i)); 5215 for_each_possible_cpu(i) 5216 INIT_CSD(&per_cpu(blk_cpu_csd, i), 5217 __blk_mq_complete_request_remote, NULL); 5218 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); 5219 5220 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, 5221 "block/softirq:dead", NULL, 5222 blk_softirq_cpu_dead); 5223 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 5224 blk_mq_hctx_notify_dead); 5225 cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", 5226 blk_mq_hctx_notify_online, 5227 blk_mq_hctx_notify_offline); 5228 return 0; 5229 } 5230 subsys_initcall(blk_mq_init); 5231
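/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a simple driver typically consumes the tag-set and queue allocation
 * API exported above. All "foo_*" names are hypothetical, error handling is
 * trimmed to the essentials, and the three-argument blk_mq_alloc_disk()
 * wrapper from include/linux/blk-mq.h is assumed to match the
 * __blk_mq_alloc_disk() signature defined in this file.
 *
 *	static struct blk_mq_tag_set foo_set;
 *
 *	static int foo_probe(struct foo_dev *foo)
 *	{
 *		struct gendisk *disk;
 *		int ret;
 *
 *		foo_set.ops		= &foo_mq_ops;	 // must provide .queue_rq
 *		foo_set.nr_hw_queues	= 1;
 *		foo_set.nr_maps		= 1;
 *		foo_set.queue_depth	= 128;
 *		foo_set.numa_node	= NUMA_NO_NODE;
 *		foo_set.cmd_size	= sizeof(struct foo_cmd);
 *
 *		ret = blk_mq_alloc_tag_set(&foo_set);
 *		if (ret)
 *			return ret;
 *
 *		// NULL queue_limits selects the defaults, as in
 *		// blk_mq_alloc_queue() above; foo becomes q->queuedata.
 *		disk = blk_mq_alloc_disk(&foo_set, NULL, foo);
 *		if (IS_ERR(disk)) {
 *			blk_mq_free_tag_set(&foo_set);
 *			return PTR_ERR(disk);
 *		}
 *
 *		disk->fops = &foo_bdops;
 *		set_capacity(disk, foo->nr_sectors);
 *		ret = add_disk(disk);
 *		...
 *		return ret;
 *	}
 *
 * Teardown typically mirrors the setup: del_gendisk(), put_disk(), then
 * blk_mq_free_tag_set().
 */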