// SPDX-License-Identifier: GPL-2.0
/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
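/*
 * Mergeable requests are hashed on their end sector (rq_hash_key), so a bio
 * whose start sector matches a request's end sector can be found quickly as
 * a back-merge candidate.  Hashed requests are flagged RQF_HASHED.
 */
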
/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.allow_merge)
		return e->type->ops.allow_merge(q, rq, bio);

	return true;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

/**
 * elevator_match - Check whether @e's name or alias matches @name
 * @e: Scheduler to test
 * @name: Elevator name to test
 *
 * Return true if the elevator @e's name or alias matches @name.
 */
static bool elevator_match(const struct elevator_type *e, const char *name)
{
	return !strcmp(e->elevator_name, name) ||
		(e->elevator_alias && !strcmp(e->elevator_alias, name));
}

static struct elevator_type *__elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list)
		if (elevator_match(e, name))
			return e;
	return NULL;
}

static struct elevator_type *elevator_find_get(struct request_queue *q,
		const char *name)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);
	e = __elevator_find(name);
	if (e && (!elevator_tryget(e)))
		e = NULL;
	spin_unlock(&elv_list_lock);
	return e;
}

static const struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				      struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	__elevator_get(e);
	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

void elevator_exit(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

	ioc_clear_queue(q);
	blk_mq_sched_free_rqs(q);

	mutex_lock(&e->sysfs_lock);
	blk_mq_exit_sched(q, e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

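/*
 * Find a hashed request whose end sector equals @offset.  Entries that are
 * no longer mergeable are dropped from the hash as the chain is walked.
 */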
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

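/*
 * Find a request that @bio may be merged into: try the one-hit ->last_merge
 * cache first, then the back-merge hash, and finally the scheduler's
 * ->request_merge hook.  On a hit, *req is set to the request to merge with
 * and the merge type is returned.
 */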
enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 *	nomerges:  No merges at all attempted
	 *	noxmerges: Only simple one-hit cache try
	 *	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;

		if (blk_discard_mergable(__rq))
			return ELEVATOR_DISCARD_MERGE;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->type->ops.request_merge)
		return e->type->ops.request_merge(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise. 'free' will contain all
 * requests that need to be freed.
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
			      struct list_head *free)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
		list_add(&rq->queuelist, free);
		return true;
	}

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		list_add(&rq->queuelist, free);
		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.request_merged)
		e->type->ops.request_merged(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			struct request *next)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.requests_merged)
		e->type->ops.requests_merged(q, rq, next);

	elv_rqhash_reposition(q, rq);
	q->last_merge = rq;
}

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.next_request)
		return e->type->ops.next_request(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.former_request)
		return e->type->ops.former_request(q, rq);

	return NULL;
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

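/*
 * sysfs show/store wrappers for the per-queue "iosched" directory: attribute
 * reads and writes are routed to the scheduler's elv_fs_entry handlers under
 * the elevator's sysfs_lock; -ENOENT is returned if no elevator type is
 * attached.
 */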
static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static const struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q, bool uevent)
{
	struct elevator_queue *e = q->elevator;
	int error;

	lockdep_assert_held(&q->sysfs_lock);

	error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		if (uevent)
			kobject_uevent(&e->kobj, KOBJ_ADD);

		set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
	}
	return error;
}

void elv_unregister_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;

	lockdep_assert_held(&q->sysfs_lock);

	if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
	}
}

int elv_register(struct elevator_type *e)
{
	/* finish request is mandatory */
	if (WARN_ON_ONCE(!e->ops.finish_request))
		return -EINVAL;
	/* insert_requests and dispatch_request are mandatory */
	if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request))
		return -EINVAL;

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (__elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);

	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

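/*
 * Only blk-mq queues can have an I/O scheduler; drivers that set
 * BLK_MQ_F_NO_SCHED opt out entirely.
 */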
static inline bool elv_support_iosched(struct request_queue *q)
{
	if (!queue_is_mq(q) ||
	    (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
		return false;
	return true;
}

/*
 * For single queue devices, default to using mq-deadline. If we have multiple
 * queues or mq-deadline is not available, default to "none".
 */
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
	if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
		return NULL;

	if (q->nr_hw_queues != 1 &&
	    !blk_mq_is_shared_tags(q->tag_set->flags))
		return NULL;

	return elevator_find_get(q, "mq-deadline");
}

/*
 * Use the default elevator settings. If the chosen elevator initialization
 * fails, fall back to the "none" elevator (no elevator).
 */
void elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	int err;

	if (!elv_support_iosched(q))
		return;

	WARN_ON_ONCE(blk_queue_registered(q));

	if (unlikely(q->elevator))
		return;

	e = elevator_get_default(q);
	if (!e)
		return;

	/*
	 * We are called before adding disk, when there isn't any FS I/O,
	 * so freezing queue plus canceling dispatch work is enough to
	 * drain any dispatch activities originated from passthrough
	 * requests, then no need to quiesce queue which may add long boot
	 * latency, especially when lots of disks are involved.
	 */
	blk_mq_freeze_queue(q);
	blk_mq_cancel_work_sync(q);

	err = blk_mq_init_sched(q, e);

	blk_mq_unfreeze_queue(q);

	if (err) {
		pr_warn("\"%s\" elevator initialization failed, "
			"falling back to \"none\"\n", e->elevator_name);
	}

	elevator_put(e);
}

/*
 * Switch to new_e io scheduler.
 *
 * If switching fails, we are most likely running out of memory and not able
 * to restore the old io scheduler, so leaving the io scheduler being none.
 */
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	if (q->elevator) {
		elv_unregister_queue(q);
		elevator_exit(q);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out_unfreeze;

	ret = elv_register_queue(q, true);
	if (ret) {
		elevator_exit(q);
		goto out_unfreeze;
	}
	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

out_unfreeze:
	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	if (ret) {
		pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
			new_e->elevator_name);
	}

	return ret;
}

void elevator_disable(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	elv_unregister_queue(q);
	elevator_exit(q);
	blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
	q->elevator = NULL;
	q->nr_requests = q->tag_set->queue_depth;
	blk_add_trace_msg(q, "elv switch: none");

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int elevator_change(struct request_queue *q, const char *elevator_name)
{
	struct elevator_type *e;
	int ret;

	/* Make sure queue is not in the middle of being removed */
	if (!blk_queue_registered(q))
		return -ENOENT;

	if (!strncmp(elevator_name, "none", 4)) {
		if (q->elevator)
			elevator_disable(q);
		return 0;
	}

	if (q->elevator && elevator_match(q->elevator->type, elevator_name))
		return 0;

	e = elevator_find_get(q, elevator_name);
	if (!e) {
		request_module("%s-iosched", elevator_name);
		e = elevator_find_get(q, elevator_name);
		if (!e)
			return -EINVAL;
	}
	ret = elevator_switch(q, e);
	elevator_put(e);
	return ret;
}

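/*
 * "scheduler" sysfs attribute: writing a scheduler name (or "none") switches
 * this queue to that scheduler, loading a "<name>-iosched" module if needed;
 * reading lists the registered schedulers with the active one in brackets.
 */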
ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
			  size_t count)
{
	char elevator_name[ELV_NAME_MAX];
	int ret;

	if (!elv_support_iosched(disk->queue))
		return count;

	strscpy(elevator_name, buf, sizeof(elevator_name));
	ret = elevator_change(disk->queue, strstrip(elevator_name));
	if (!ret)
		return count;
	return ret;
}

ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
	struct request_queue *q = disk->queue;
	struct elevator_queue *eq = q->elevator;
	struct elevator_type *cur = NULL, *e;
	int len = 0;

	if (!elv_support_iosched(q))
		return sprintf(name, "none\n");

	if (!q->elevator) {
		len += sprintf(name+len, "[none] ");
	} else {
		len += sprintf(name+len, "none ");
		cur = eq->type;
	}

	spin_lock(&elv_list_lock);
	list_for_each_entry(e, &elv_list, list) {
		if (e == cur)
			len += sprintf(name+len, "[%s] ", e->elevator_name);
		else
			len += sprintf(name+len, "%s ", e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	len += sprintf(name+len, "\n");
	return len;
}

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);

static int __init elevator_setup(char *str)
{
	pr_warn("Kernel parameter elevator= does not have any effect anymore.\n"
		"Please use sysfs to set IO scheduler for individual devices.\n");
	return 1;
}

__setup("elevator=", elevator_setup);