1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Common Block IO controller cgroup interface 4 * 5 * Based on ideas and code from CFQ, CFS and BFQ: 6 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 7 * 8 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> 9 * Paolo Valente <paolo.valente@unimore.it> 10 * 11 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 12 * Nauman Rafique <nauman@google.com> 13 * 14 * For policy-specific per-blkcg data: 15 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> 16 * Arianna Avanzini <avanzini.arianna@gmail.com> 17 */ 18 #include <linux/ioprio.h> 19 #include <linux/kdev_t.h> 20 #include <linux/module.h> 21 #include <linux/sched/signal.h> 22 #include <linux/err.h> 23 #include <linux/blkdev.h> 24 #include <linux/backing-dev.h> 25 #include <linux/slab.h> 26 #include <linux/delay.h> 27 #include <linux/atomic.h> 28 #include <linux/ctype.h> 29 #include <linux/resume_user_mode.h> 30 #include <linux/psi.h> 31 #include <linux/part_stat.h> 32 #include "blk.h" 33 #include "blk-cgroup.h" 34 #include "blk-ioprio.h" 35 #include "blk-throttle.h" 36 37 /* 38 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. 39 * blkcg_pol_register_mutex nests outside of it and synchronizes entire 40 * policy [un]register operations including cgroup file additions / 41 * removals. Putting cgroup file registration outside blkcg_pol_mutex 42 * allows grabbing it from cgroup callbacks. 43 */ 44 static DEFINE_MUTEX(blkcg_pol_register_mutex); 45 static DEFINE_MUTEX(blkcg_pol_mutex); 46 47 struct blkcg blkcg_root; 48 EXPORT_SYMBOL_GPL(blkcg_root); 49 50 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; 51 EXPORT_SYMBOL_GPL(blkcg_root_css); 52 53 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 54 55 static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ 56 57 bool blkcg_debug_stats = false; 58 static struct workqueue_struct *blkcg_punt_bio_wq; 59 60 #define BLKG_DESTROY_BATCH_SIZE 64 61 62 static bool blkcg_policy_enabled(struct request_queue *q, 63 const struct blkcg_policy *pol) 64 { 65 return pol && test_bit(pol->plid, q->blkcg_pols); 66 } 67 68 static void blkg_free_workfn(struct work_struct *work) 69 { 70 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, 71 free_work); 72 int i; 73 74 for (i = 0; i < BLKCG_MAX_POLS; i++) 75 if (blkg->pd[i]) 76 blkcg_policy[i]->pd_free_fn(blkg->pd[i]); 77 78 if (blkg->q) 79 blk_put_queue(blkg->q); 80 free_percpu(blkg->iostat_cpu); 81 percpu_ref_exit(&blkg->refcnt); 82 kfree(blkg); 83 } 84 85 /** 86 * blkg_free - free a blkg 87 * @blkg: blkg to free 88 * 89 * Free @blkg which may be partially allocated. 90 */ 91 static void blkg_free(struct blkcg_gq *blkg) 92 { 93 if (!blkg) 94 return; 95 96 /* 97 * Both ->pd_free_fn() and request queue's release handler may 98 * sleep, so free us by scheduling one work func 99 */ 100 INIT_WORK(&blkg->free_work, blkg_free_workfn); 101 schedule_work(&blkg->free_work); 102 } 103 104 static void __blkg_release(struct rcu_head *rcu) 105 { 106 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); 107 108 WARN_ON(!bio_list_empty(&blkg->async_bios)); 109 110 /* release the blkcg and parent blkg refs this blkg has been holding */ 111 css_put(&blkg->blkcg->css); 112 if (blkg->parent) 113 blkg_put(blkg->parent); 114 blkg_free(blkg); 115 } 116 117 /* 118 * A group is RCU protected, but having an rcu lock does not mean that one 119 * can access all the fields of blkg and assume these are valid. 
For 120 * example, don't try to follow throtl_data and request queue links. 121 * 122 * Having a reference to blkg under an rcu allows accesses to only values 123 * local to groups like group stats and group rate limits. 124 */ 125 static void blkg_release(struct percpu_ref *ref) 126 { 127 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); 128 129 call_rcu(&blkg->rcu_head, __blkg_release); 130 } 131 132 static void blkg_async_bio_workfn(struct work_struct *work) 133 { 134 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, 135 async_bio_work); 136 struct bio_list bios = BIO_EMPTY_LIST; 137 struct bio *bio; 138 struct blk_plug plug; 139 bool need_plug = false; 140 141 /* as long as there are pending bios, @blkg can't go away */ 142 spin_lock_bh(&blkg->async_bio_lock); 143 bio_list_merge(&bios, &blkg->async_bios); 144 bio_list_init(&blkg->async_bios); 145 spin_unlock_bh(&blkg->async_bio_lock); 146 147 /* start plug only when bio_list contains at least 2 bios */ 148 if (bios.head && bios.head->bi_next) { 149 need_plug = true; 150 blk_start_plug(&plug); 151 } 152 while ((bio = bio_list_pop(&bios))) 153 submit_bio(bio); 154 if (need_plug) 155 blk_finish_plug(&plug); 156 } 157 158 /** 159 * blkg_alloc - allocate a blkg 160 * @blkcg: block cgroup the new blkg is associated with 161 * @q: request_queue the new blkg is associated with 162 * @gfp_mask: allocation mask to use 163 * 164 * Allocate a new blkg assocating @blkcg and @q. 165 */ 166 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, 167 gfp_t gfp_mask) 168 { 169 struct blkcg_gq *blkg; 170 int i, cpu; 171 172 /* alloc and init base part */ 173 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); 174 if (!blkg) 175 return NULL; 176 177 if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) 178 goto err_free; 179 180 blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); 181 if (!blkg->iostat_cpu) 182 goto err_free; 183 184 if (!blk_get_queue(q)) 185 goto err_free; 186 187 blkg->q = q; 188 INIT_LIST_HEAD(&blkg->q_node); 189 spin_lock_init(&blkg->async_bio_lock); 190 bio_list_init(&blkg->async_bios); 191 INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); 192 blkg->blkcg = blkcg; 193 194 u64_stats_init(&blkg->iostat.sync); 195 for_each_possible_cpu(cpu) 196 u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); 197 198 for (i = 0; i < BLKCG_MAX_POLS; i++) { 199 struct blkcg_policy *pol = blkcg_policy[i]; 200 struct blkg_policy_data *pd; 201 202 if (!blkcg_policy_enabled(q, pol)) 203 continue; 204 205 /* alloc per-policy data and attach it to blkg */ 206 pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); 207 if (!pd) 208 goto err_free; 209 210 blkg->pd[i] = pd; 211 pd->blkg = blkg; 212 pd->plid = i; 213 } 214 215 return blkg; 216 217 err_free: 218 blkg_free(blkg); 219 return NULL; 220 } 221 222 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, 223 struct request_queue *q, bool update_hint) 224 { 225 struct blkcg_gq *blkg; 226 227 /* 228 * Hint didn't match. Look up from the radix tree. Note that the 229 * hint can only be updated under queue_lock as otherwise @blkg 230 * could have already been removed from blkg_tree. The caller is 231 * responsible for grabbing queue_lock if @update_hint. 
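 *
 * For reference, the RCU fast path in blk-cgroup.h that falls back to this
 * helper looks roughly like the following (root lookups and hint hits never
 * reach the radix tree):
 *
 *	blkg = rcu_dereference(blkcg->blkg_hint);
 *	if (blkg && blkg->q == q)
 *		return blkg;
 *	return blkg_lookup_slowpath(blkcg, q, false);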
232 */ 233 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 234 if (blkg && blkg->q == q) { 235 if (update_hint) { 236 lockdep_assert_held(&q->queue_lock); 237 rcu_assign_pointer(blkcg->blkg_hint, blkg); 238 } 239 return blkg; 240 } 241 242 return NULL; 243 } 244 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); 245 246 /* 247 * If @new_blkg is %NULL, this function tries to allocate a new one as 248 * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 249 */ 250 static struct blkcg_gq *blkg_create(struct blkcg *blkcg, 251 struct request_queue *q, 252 struct blkcg_gq *new_blkg) 253 { 254 struct blkcg_gq *blkg; 255 int i, ret; 256 257 WARN_ON_ONCE(!rcu_read_lock_held()); 258 lockdep_assert_held(&q->queue_lock); 259 260 /* request_queue is dying, do not create/recreate a blkg */ 261 if (blk_queue_dying(q)) { 262 ret = -ENODEV; 263 goto err_free_blkg; 264 } 265 266 /* blkg holds a reference to blkcg */ 267 if (!css_tryget_online(&blkcg->css)) { 268 ret = -ENODEV; 269 goto err_free_blkg; 270 } 271 272 /* allocate */ 273 if (!new_blkg) { 274 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); 275 if (unlikely(!new_blkg)) { 276 ret = -ENOMEM; 277 goto err_put_css; 278 } 279 } 280 blkg = new_blkg; 281 282 /* link parent */ 283 if (blkcg_parent(blkcg)) { 284 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 285 if (WARN_ON_ONCE(!blkg->parent)) { 286 ret = -ENODEV; 287 goto err_put_css; 288 } 289 blkg_get(blkg->parent); 290 } 291 292 /* invoke per-policy init */ 293 for (i = 0; i < BLKCG_MAX_POLS; i++) { 294 struct blkcg_policy *pol = blkcg_policy[i]; 295 296 if (blkg->pd[i] && pol->pd_init_fn) 297 pol->pd_init_fn(blkg->pd[i]); 298 } 299 300 /* insert */ 301 spin_lock(&blkcg->lock); 302 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 303 if (likely(!ret)) { 304 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 305 list_add(&blkg->q_node, &q->blkg_list); 306 307 for (i = 0; i < BLKCG_MAX_POLS; i++) { 308 struct blkcg_policy *pol = blkcg_policy[i]; 309 310 if (blkg->pd[i] && pol->pd_online_fn) 311 pol->pd_online_fn(blkg->pd[i]); 312 } 313 } 314 blkg->online = true; 315 spin_unlock(&blkcg->lock); 316 317 if (!ret) 318 return blkg; 319 320 /* @blkg failed fully initialized, use the usual release path */ 321 blkg_put(blkg); 322 return ERR_PTR(ret); 323 324 err_put_css: 325 css_put(&blkcg->css); 326 err_free_blkg: 327 blkg_free(new_blkg); 328 return ERR_PTR(ret); 329 } 330 331 /** 332 * blkg_lookup_create - lookup blkg, try to create one if not there 333 * @blkcg: blkcg of interest 334 * @q: request_queue of interest 335 * 336 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to 337 * create one. blkg creation is performed recursively from blkcg_root such 338 * that all non-root blkg's have access to the parent blkg. This function 339 * should be called under RCU read lock and takes @q->queue_lock. 340 * 341 * Returns the blkg or the closest blkg if blkg_create() fails as it walks 342 * down from root. 343 */ 344 static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 345 struct request_queue *q) 346 { 347 struct blkcg_gq *blkg; 348 unsigned long flags; 349 350 WARN_ON_ONCE(!rcu_read_lock_held()); 351 352 blkg = blkg_lookup(blkcg, q); 353 if (blkg) 354 return blkg; 355 356 spin_lock_irqsave(&q->queue_lock, flags); 357 blkg = __blkg_lookup(blkcg, q, true); 358 if (blkg) 359 goto found; 360 361 /* 362 * Create blkgs walking down from blkcg_root to @blkcg, so that all 363 * non-root blkgs have access to their parents. 
Returns the closest 364 * blkg to the intended blkg should blkg_create() fail. 365 */ 366 while (true) { 367 struct blkcg *pos = blkcg; 368 struct blkcg *parent = blkcg_parent(blkcg); 369 struct blkcg_gq *ret_blkg = q->root_blkg; 370 371 while (parent) { 372 blkg = __blkg_lookup(parent, q, false); 373 if (blkg) { 374 /* remember closest blkg */ 375 ret_blkg = blkg; 376 break; 377 } 378 pos = parent; 379 parent = blkcg_parent(parent); 380 } 381 382 blkg = blkg_create(pos, q, NULL); 383 if (IS_ERR(blkg)) { 384 blkg = ret_blkg; 385 break; 386 } 387 if (pos == blkcg) 388 break; 389 } 390 391 found: 392 spin_unlock_irqrestore(&q->queue_lock, flags); 393 return blkg; 394 } 395 396 static void blkg_destroy(struct blkcg_gq *blkg) 397 { 398 struct blkcg *blkcg = blkg->blkcg; 399 int i; 400 401 lockdep_assert_held(&blkg->q->queue_lock); 402 lockdep_assert_held(&blkcg->lock); 403 404 /* Something wrong if we are trying to remove same group twice */ 405 WARN_ON_ONCE(list_empty(&blkg->q_node)); 406 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); 407 408 for (i = 0; i < BLKCG_MAX_POLS; i++) { 409 struct blkcg_policy *pol = blkcg_policy[i]; 410 411 if (blkg->pd[i] && pol->pd_offline_fn) 412 pol->pd_offline_fn(blkg->pd[i]); 413 } 414 415 blkg->online = false; 416 417 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 418 list_del_init(&blkg->q_node); 419 hlist_del_init_rcu(&blkg->blkcg_node); 420 421 /* 422 * Both setting lookup hint to and clearing it from @blkg are done 423 * under queue_lock. If it's not pointing to @blkg now, it never 424 * will. Hint assignment itself can race safely. 425 */ 426 if (rcu_access_pointer(blkcg->blkg_hint) == blkg) 427 rcu_assign_pointer(blkcg->blkg_hint, NULL); 428 429 /* 430 * Put the reference taken at the time of creation so that when all 431 * queues are gone, group can be destroyed. 432 */ 433 percpu_ref_kill(&blkg->refcnt); 434 } 435 436 /** 437 * blkg_destroy_all - destroy all blkgs associated with a request_queue 438 * @q: request_queue of interest 439 * 440 * Destroy all blkgs associated with @q. 441 */ 442 static void blkg_destroy_all(struct request_queue *q) 443 { 444 struct blkcg_gq *blkg, *n; 445 int count = BLKG_DESTROY_BATCH_SIZE; 446 447 restart: 448 spin_lock_irq(&q->queue_lock); 449 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 450 struct blkcg *blkcg = blkg->blkcg; 451 452 spin_lock(&blkcg->lock); 453 blkg_destroy(blkg); 454 spin_unlock(&blkcg->lock); 455 456 /* 457 * in order to avoid holding the spin lock for too long, release 458 * it when a batch of blkgs are destroyed. 459 */ 460 if (!(--count)) { 461 count = BLKG_DESTROY_BATCH_SIZE; 462 spin_unlock_irq(&q->queue_lock); 463 cond_resched(); 464 goto restart; 465 } 466 } 467 468 q->root_blkg = NULL; 469 spin_unlock_irq(&q->queue_lock); 470 } 471 472 static int blkcg_reset_stats(struct cgroup_subsys_state *css, 473 struct cftype *cftype, u64 val) 474 { 475 struct blkcg *blkcg = css_to_blkcg(css); 476 struct blkcg_gq *blkg; 477 int i, cpu; 478 479 mutex_lock(&blkcg_pol_mutex); 480 spin_lock_irq(&blkcg->lock); 481 482 /* 483 * Note that stat reset is racy - it doesn't synchronize against 484 * stat updates. This is a debug feature which shouldn't exist 485 * anyway. If you get hit by a race, retry. 
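 *
 * This is what backs the v1 "blkio.reset_stats" file declared in
 * blkcg_legacy_files[] below; with the usual v1 mount that is, e.g.,
 * "echo 1 > /sys/fs/cgroup/blkio/blkio.reset_stats".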
486 */ 487 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 488 for_each_possible_cpu(cpu) { 489 struct blkg_iostat_set *bis = 490 per_cpu_ptr(blkg->iostat_cpu, cpu); 491 memset(bis, 0, sizeof(*bis)); 492 } 493 memset(&blkg->iostat, 0, sizeof(blkg->iostat)); 494 495 for (i = 0; i < BLKCG_MAX_POLS; i++) { 496 struct blkcg_policy *pol = blkcg_policy[i]; 497 498 if (blkg->pd[i] && pol->pd_reset_stats_fn) 499 pol->pd_reset_stats_fn(blkg->pd[i]); 500 } 501 } 502 503 spin_unlock_irq(&blkcg->lock); 504 mutex_unlock(&blkcg_pol_mutex); 505 return 0; 506 } 507 508 const char *blkg_dev_name(struct blkcg_gq *blkg) 509 { 510 if (!blkg->q->disk || !blkg->q->disk->bdi->dev) 511 return NULL; 512 return bdi_dev_name(blkg->q->disk->bdi); 513 } 514 515 /** 516 * blkcg_print_blkgs - helper for printing per-blkg data 517 * @sf: seq_file to print to 518 * @blkcg: blkcg of interest 519 * @prfill: fill function to print out a blkg 520 * @pol: policy in question 521 * @data: data to be passed to @prfill 522 * @show_total: to print out sum of prfill return values or not 523 * 524 * This function invokes @prfill on each blkg of @blkcg if pd for the 525 * policy specified by @pol exists. @prfill is invoked with @sf, the 526 * policy data and @data and the matching queue lock held. If @show_total 527 * is %true, the sum of the return values from @prfill is printed with 528 * "Total" label at the end. 529 * 530 * This is to be used to construct print functions for 531 * cftype->read_seq_string method. 532 */ 533 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, 534 u64 (*prfill)(struct seq_file *, 535 struct blkg_policy_data *, int), 536 const struct blkcg_policy *pol, int data, 537 bool show_total) 538 { 539 struct blkcg_gq *blkg; 540 u64 total = 0; 541 542 rcu_read_lock(); 543 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 544 spin_lock_irq(&blkg->q->queue_lock); 545 if (blkcg_policy_enabled(blkg->q, pol)) 546 total += prfill(sf, blkg->pd[pol->plid], data); 547 spin_unlock_irq(&blkg->q->queue_lock); 548 } 549 rcu_read_unlock(); 550 551 if (show_total) 552 seq_printf(sf, "Total %llu\n", (unsigned long long)total); 553 } 554 EXPORT_SYMBOL_GPL(blkcg_print_blkgs); 555 556 /** 557 * __blkg_prfill_u64 - prfill helper for a single u64 value 558 * @sf: seq_file to print to 559 * @pd: policy private data of interest 560 * @v: value to print 561 * 562 * Print @v to @sf for the device assocaited with @pd. 563 */ 564 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) 565 { 566 const char *dname = blkg_dev_name(pd->blkg); 567 568 if (!dname) 569 return 0; 570 571 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); 572 return v; 573 } 574 EXPORT_SYMBOL_GPL(__blkg_prfill_u64); 575 576 /* Performs queue bypass and policy enabled checks then looks up blkg. */ 577 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, 578 const struct blkcg_policy *pol, 579 struct request_queue *q) 580 { 581 WARN_ON_ONCE(!rcu_read_lock_held()); 582 lockdep_assert_held(&q->queue_lock); 583 584 if (!blkcg_policy_enabled(q, pol)) 585 return ERR_PTR(-EOPNOTSUPP); 586 return __blkg_lookup(blkcg, q, true /* update_hint */); 587 } 588 589 /** 590 * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update 591 * @inputp: input string pointer 592 * 593 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update 594 * from @input and get and return the matching bdev. *@inputp is 595 * updated to point past the device node prefix. 
Returns an ERR_PTR() 596 * value on error. 597 * 598 * Use this function iff blkg_conf_prep() can't be used for some reason. 599 */ 600 struct block_device *blkcg_conf_open_bdev(char **inputp) 601 { 602 char *input = *inputp; 603 unsigned int major, minor; 604 struct block_device *bdev; 605 int key_len; 606 607 if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) 608 return ERR_PTR(-EINVAL); 609 610 input += key_len; 611 if (!isspace(*input)) 612 return ERR_PTR(-EINVAL); 613 input = skip_spaces(input); 614 615 bdev = blkdev_get_no_open(MKDEV(major, minor)); 616 if (!bdev) 617 return ERR_PTR(-ENODEV); 618 if (bdev_is_partition(bdev)) { 619 blkdev_put_no_open(bdev); 620 return ERR_PTR(-ENODEV); 621 } 622 623 *inputp = input; 624 return bdev; 625 } 626 627 /** 628 * blkg_conf_prep - parse and prepare for per-blkg config update 629 * @blkcg: target block cgroup 630 * @pol: target policy 631 * @input: input string 632 * @ctx: blkg_conf_ctx to be filled 633 * 634 * Parse per-blkg config update from @input and initialize @ctx with the 635 * result. @ctx->blkg points to the blkg to be updated and @ctx->body the 636 * part of @input following MAJ:MIN. This function returns with RCU read 637 * lock and queue lock held and must be paired with blkg_conf_finish(). 638 */ 639 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 640 char *input, struct blkg_conf_ctx *ctx) 641 __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) 642 { 643 struct block_device *bdev; 644 struct request_queue *q; 645 struct blkcg_gq *blkg; 646 int ret; 647 648 bdev = blkcg_conf_open_bdev(&input); 649 if (IS_ERR(bdev)) 650 return PTR_ERR(bdev); 651 652 q = bdev_get_queue(bdev); 653 654 /* 655 * blkcg_deactivate_policy() requires queue to be frozen, we can grab 656 * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). 657 */ 658 ret = blk_queue_enter(q, 0); 659 if (ret) 660 goto fail; 661 662 rcu_read_lock(); 663 spin_lock_irq(&q->queue_lock); 664 665 blkg = blkg_lookup_check(blkcg, pol, q); 666 if (IS_ERR(blkg)) { 667 ret = PTR_ERR(blkg); 668 goto fail_unlock; 669 } 670 671 if (blkg) 672 goto success; 673 674 /* 675 * Create blkgs walking down from blkcg_root to @blkcg, so that all 676 * non-root blkgs have access to their parents. 677 */ 678 while (true) { 679 struct blkcg *pos = blkcg; 680 struct blkcg *parent; 681 struct blkcg_gq *new_blkg; 682 683 parent = blkcg_parent(blkcg); 684 while (parent && !__blkg_lookup(parent, q, false)) { 685 pos = parent; 686 parent = blkcg_parent(parent); 687 } 688 689 /* Drop locks to do new blkg allocation with GFP_KERNEL. 
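 * GFP_KERNEL may sleep, so neither the RCU read lock nor queue_lock can be
 * held across the allocation. Both are re-taken below and the lookup is
 * redone, since another task may have created the blkg in the meantime (in
 * which case the freshly allocated one is simply freed). The
 * radix_tree_preload() guarantees the later insert under the spinlock has
 * the node memory it needs.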
*/ 690 spin_unlock_irq(&q->queue_lock); 691 rcu_read_unlock(); 692 693 new_blkg = blkg_alloc(pos, q, GFP_KERNEL); 694 if (unlikely(!new_blkg)) { 695 ret = -ENOMEM; 696 goto fail_exit_queue; 697 } 698 699 if (radix_tree_preload(GFP_KERNEL)) { 700 blkg_free(new_blkg); 701 ret = -ENOMEM; 702 goto fail_exit_queue; 703 } 704 705 rcu_read_lock(); 706 spin_lock_irq(&q->queue_lock); 707 708 blkg = blkg_lookup_check(pos, pol, q); 709 if (IS_ERR(blkg)) { 710 ret = PTR_ERR(blkg); 711 blkg_free(new_blkg); 712 goto fail_preloaded; 713 } 714 715 if (blkg) { 716 blkg_free(new_blkg); 717 } else { 718 blkg = blkg_create(pos, q, new_blkg); 719 if (IS_ERR(blkg)) { 720 ret = PTR_ERR(blkg); 721 goto fail_preloaded; 722 } 723 } 724 725 radix_tree_preload_end(); 726 727 if (pos == blkcg) 728 goto success; 729 } 730 success: 731 blk_queue_exit(q); 732 ctx->bdev = bdev; 733 ctx->blkg = blkg; 734 ctx->body = input; 735 return 0; 736 737 fail_preloaded: 738 radix_tree_preload_end(); 739 fail_unlock: 740 spin_unlock_irq(&q->queue_lock); 741 rcu_read_unlock(); 742 fail_exit_queue: 743 blk_queue_exit(q); 744 fail: 745 blkdev_put_no_open(bdev); 746 /* 747 * If queue was bypassing, we should retry. Do so after a 748 * short msleep(). It isn't strictly necessary but queue 749 * can be bypassing for some time and it's always nice to 750 * avoid busy looping. 751 */ 752 if (ret == -EBUSY) { 753 msleep(10); 754 ret = restart_syscall(); 755 } 756 return ret; 757 } 758 EXPORT_SYMBOL_GPL(blkg_conf_prep); 759 760 /** 761 * blkg_conf_finish - finish up per-blkg config update 762 * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() 763 * 764 * Finish up after per-blkg config update. This function must be paired 765 * with blkg_conf_prep(). 766 */ 767 void blkg_conf_finish(struct blkg_conf_ctx *ctx) 768 __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) 769 { 770 spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); 771 rcu_read_unlock(); 772 blkdev_put_no_open(ctx->bdev); 773 } 774 EXPORT_SYMBOL_GPL(blkg_conf_finish); 775 776 static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) 777 { 778 int i; 779 780 for (i = 0; i < BLKG_IOSTAT_NR; i++) { 781 dst->bytes[i] = src->bytes[i]; 782 dst->ios[i] = src->ios[i]; 783 } 784 } 785 786 static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) 787 { 788 int i; 789 790 for (i = 0; i < BLKG_IOSTAT_NR; i++) { 791 dst->bytes[i] += src->bytes[i]; 792 dst->ios[i] += src->ios[i]; 793 } 794 } 795 796 static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) 797 { 798 int i; 799 800 for (i = 0; i < BLKG_IOSTAT_NR; i++) { 801 dst->bytes[i] -= src->bytes[i]; 802 dst->ios[i] -= src->ios[i]; 803 } 804 } 805 806 static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) 807 { 808 struct blkcg *blkcg = css_to_blkcg(css); 809 struct blkcg_gq *blkg; 810 811 /* Root-level stats are sourced from system-wide IO stats */ 812 if (!cgroup_parent(css->cgroup)) 813 return; 814 815 rcu_read_lock(); 816 817 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 818 struct blkcg_gq *parent = blkg->parent; 819 struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); 820 struct blkg_iostat cur, delta; 821 unsigned long flags; 822 unsigned int seq; 823 824 /* fetch the current per-cpu values */ 825 do { 826 seq = u64_stats_fetch_begin(&bisc->sync); 827 blkg_iostat_set(&cur, &bisc->cur); 828 } while (u64_stats_fetch_retry(&bisc->sync, seq)); 829 830 /* propagate percpu delta to global */ 831 flags = 
u64_stats_update_begin_irqsave(&blkg->iostat.sync); 832 blkg_iostat_set(&delta, &cur); 833 blkg_iostat_sub(&delta, &bisc->last); 834 blkg_iostat_add(&blkg->iostat.cur, &delta); 835 blkg_iostat_add(&bisc->last, &delta); 836 u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); 837 838 /* propagate global delta to parent (unless that's root) */ 839 if (parent && parent->parent) { 840 flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); 841 blkg_iostat_set(&delta, &blkg->iostat.cur); 842 blkg_iostat_sub(&delta, &blkg->iostat.last); 843 blkg_iostat_add(&parent->iostat.cur, &delta); 844 blkg_iostat_add(&blkg->iostat.last, &delta); 845 u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); 846 } 847 } 848 849 rcu_read_unlock(); 850 } 851 852 /* 853 * We source root cgroup stats from the system-wide stats to avoid 854 * tracking the same information twice and incurring overhead when no 855 * cgroups are defined. For that reason, cgroup_rstat_flush in 856 * blkcg_print_stat does not actually fill out the iostat in the root 857 * cgroup's blkcg_gq. 858 * 859 * However, we would like to re-use the printing code between the root and 860 * non-root cgroups to the extent possible. For that reason, we simulate 861 * flushing the root cgroup's stats by explicitly filling in the iostat 862 * with disk level statistics. 863 */ 864 static void blkcg_fill_root_iostats(void) 865 { 866 struct class_dev_iter iter; 867 struct device *dev; 868 869 class_dev_iter_init(&iter, &block_class, NULL, &disk_type); 870 while ((dev = class_dev_iter_next(&iter))) { 871 struct block_device *bdev = dev_to_bdev(dev); 872 struct blkcg_gq *blkg = 873 blk_queue_root_blkg(bdev_get_queue(bdev)); 874 struct blkg_iostat tmp; 875 int cpu; 876 unsigned long flags; 877 878 memset(&tmp, 0, sizeof(tmp)); 879 for_each_possible_cpu(cpu) { 880 struct disk_stats *cpu_dkstats; 881 882 cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); 883 tmp.ios[BLKG_IOSTAT_READ] += 884 cpu_dkstats->ios[STAT_READ]; 885 tmp.ios[BLKG_IOSTAT_WRITE] += 886 cpu_dkstats->ios[STAT_WRITE]; 887 tmp.ios[BLKG_IOSTAT_DISCARD] += 888 cpu_dkstats->ios[STAT_DISCARD]; 889 // convert sectors to bytes 890 tmp.bytes[BLKG_IOSTAT_READ] += 891 cpu_dkstats->sectors[STAT_READ] << 9; 892 tmp.bytes[BLKG_IOSTAT_WRITE] += 893 cpu_dkstats->sectors[STAT_WRITE] << 9; 894 tmp.bytes[BLKG_IOSTAT_DISCARD] += 895 cpu_dkstats->sectors[STAT_DISCARD] << 9; 896 } 897 898 flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); 899 blkg_iostat_set(&blkg->iostat.cur, &tmp); 900 u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); 901 } 902 } 903 904 static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) 905 { 906 struct blkg_iostat_set *bis = &blkg->iostat; 907 u64 rbytes, wbytes, rios, wios, dbytes, dios; 908 bool has_stats = false; 909 const char *dname; 910 unsigned seq; 911 int i; 912 913 if (!blkg->online) 914 return; 915 916 dname = blkg_dev_name(blkg); 917 if (!dname) 918 return; 919 920 seq_printf(s, "%s ", dname); 921 922 do { 923 seq = u64_stats_fetch_begin(&bis->sync); 924 925 rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; 926 wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; 927 dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; 928 rios = bis->cur.ios[BLKG_IOSTAT_READ]; 929 wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; 930 dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; 931 } while (u64_stats_fetch_retry(&bis->sync, seq)); 932 933 if (rbytes || wbytes || rios || wios) { 934 has_stats = true; 935 seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu 
dios=%llu", 936 rbytes, wbytes, rios, wios, 937 dbytes, dios); 938 } 939 940 if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { 941 has_stats = true; 942 seq_printf(s, " use_delay=%d delay_nsec=%llu", 943 atomic_read(&blkg->use_delay), 944 atomic64_read(&blkg->delay_nsec)); 945 } 946 947 for (i = 0; i < BLKCG_MAX_POLS; i++) { 948 struct blkcg_policy *pol = blkcg_policy[i]; 949 950 if (!blkg->pd[i] || !pol->pd_stat_fn) 951 continue; 952 953 if (pol->pd_stat_fn(blkg->pd[i], s)) 954 has_stats = true; 955 } 956 957 if (has_stats) 958 seq_printf(s, "\n"); 959 } 960 961 static int blkcg_print_stat(struct seq_file *sf, void *v) 962 { 963 struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); 964 struct blkcg_gq *blkg; 965 966 if (!seq_css(sf)->parent) 967 blkcg_fill_root_iostats(); 968 else 969 cgroup_rstat_flush(blkcg->css.cgroup); 970 971 rcu_read_lock(); 972 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 973 spin_lock_irq(&blkg->q->queue_lock); 974 blkcg_print_one_stat(blkg, sf); 975 spin_unlock_irq(&blkg->q->queue_lock); 976 } 977 rcu_read_unlock(); 978 return 0; 979 } 980 981 static struct cftype blkcg_files[] = { 982 { 983 .name = "stat", 984 .seq_show = blkcg_print_stat, 985 }, 986 { } /* terminate */ 987 }; 988 989 static struct cftype blkcg_legacy_files[] = { 990 { 991 .name = "reset_stats", 992 .write_u64 = blkcg_reset_stats, 993 }, 994 { } /* terminate */ 995 }; 996 997 /* 998 * blkcg destruction is a three-stage process. 999 * 1000 * 1. Destruction starts. The blkcg_css_offline() callback is invoked 1001 * which offlines writeback. Here we tie the next stage of blkg destruction 1002 * to the completion of writeback associated with the blkcg. This lets us 1003 * avoid punting potentially large amounts of outstanding writeback to root 1004 * while maintaining any ongoing policies. The next stage is triggered when 1005 * the nr_cgwbs count goes to zero. 1006 * 1007 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called 1008 * and handles the destruction of blkgs. Here the css reference held by 1009 * the blkg is put back eventually allowing blkcg_css_free() to be called. 1010 * This work may occur in cgwb_release_workfn() on the cgwb_release 1011 * workqueue. Any submitted ios that fail to get the blkg ref will be 1012 * punted to the root_blkg. 1013 * 1014 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. 1015 * This finally frees the blkcg. 1016 */ 1017 1018 /** 1019 * blkcg_css_offline - cgroup css_offline callback 1020 * @css: css of interest 1021 * 1022 * This function is called when @css is about to go away. Here the cgwbs are 1023 * offlined first and only once writeback associated with the blkcg has 1024 * finished do we start step 2 (see above). 1025 */ 1026 static void blkcg_css_offline(struct cgroup_subsys_state *css) 1027 { 1028 struct blkcg *blkcg = css_to_blkcg(css); 1029 1030 /* this prevents anyone from attaching or migrating to this blkcg */ 1031 wb_blkcg_offline(blkcg); 1032 1033 /* put the base online pin allowing step 2 to be triggered */ 1034 blkcg_unpin_online(blkcg); 1035 } 1036 1037 /** 1038 * blkcg_destroy_blkgs - responsible for shooting down blkgs 1039 * @blkcg: blkcg of interest 1040 * 1041 * blkgs should be removed while holding both q and blkcg locks. As blkcg lock 1042 * is nested inside q lock, this function performs reverse double lock dancing. 1043 * Destroying the blkgs releases the reference held on the blkcg's css allowing 1044 * blkcg_css_free to eventually be called. 
1045 * 1046 * This is the blkcg counterpart of ioc_release_fn(). 1047 */ 1048 void blkcg_destroy_blkgs(struct blkcg *blkcg) 1049 { 1050 might_sleep(); 1051 1052 spin_lock_irq(&blkcg->lock); 1053 1054 while (!hlist_empty(&blkcg->blkg_list)) { 1055 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, 1056 struct blkcg_gq, blkcg_node); 1057 struct request_queue *q = blkg->q; 1058 1059 if (need_resched() || !spin_trylock(&q->queue_lock)) { 1060 /* 1061 * Given that the system can accumulate a huge number 1062 * of blkgs in pathological cases, check to see if we 1063 * need to rescheduling to avoid softlockup. 1064 */ 1065 spin_unlock_irq(&blkcg->lock); 1066 cond_resched(); 1067 spin_lock_irq(&blkcg->lock); 1068 continue; 1069 } 1070 1071 blkg_destroy(blkg); 1072 spin_unlock(&q->queue_lock); 1073 } 1074 1075 spin_unlock_irq(&blkcg->lock); 1076 } 1077 1078 static void blkcg_css_free(struct cgroup_subsys_state *css) 1079 { 1080 struct blkcg *blkcg = css_to_blkcg(css); 1081 int i; 1082 1083 mutex_lock(&blkcg_pol_mutex); 1084 1085 list_del(&blkcg->all_blkcgs_node); 1086 1087 for (i = 0; i < BLKCG_MAX_POLS; i++) 1088 if (blkcg->cpd[i]) 1089 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); 1090 1091 mutex_unlock(&blkcg_pol_mutex); 1092 1093 kfree(blkcg); 1094 } 1095 1096 static struct cgroup_subsys_state * 1097 blkcg_css_alloc(struct cgroup_subsys_state *parent_css) 1098 { 1099 struct blkcg *blkcg; 1100 struct cgroup_subsys_state *ret; 1101 int i; 1102 1103 mutex_lock(&blkcg_pol_mutex); 1104 1105 if (!parent_css) { 1106 blkcg = &blkcg_root; 1107 } else { 1108 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1109 if (!blkcg) { 1110 ret = ERR_PTR(-ENOMEM); 1111 goto unlock; 1112 } 1113 } 1114 1115 for (i = 0; i < BLKCG_MAX_POLS ; i++) { 1116 struct blkcg_policy *pol = blkcg_policy[i]; 1117 struct blkcg_policy_data *cpd; 1118 1119 /* 1120 * If the policy hasn't been attached yet, wait for it 1121 * to be attached before doing anything else. Otherwise, 1122 * check if the policy requires any specific per-cgroup 1123 * data: if it does, allocate and initialize it. 1124 */ 1125 if (!pol || !pol->cpd_alloc_fn) 1126 continue; 1127 1128 cpd = pol->cpd_alloc_fn(GFP_KERNEL); 1129 if (!cpd) { 1130 ret = ERR_PTR(-ENOMEM); 1131 goto free_pd_blkcg; 1132 } 1133 blkcg->cpd[i] = cpd; 1134 cpd->blkcg = blkcg; 1135 cpd->plid = i; 1136 if (pol->cpd_init_fn) 1137 pol->cpd_init_fn(cpd); 1138 } 1139 1140 spin_lock_init(&blkcg->lock); 1141 refcount_set(&blkcg->online_pin, 1); 1142 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); 1143 INIT_HLIST_HEAD(&blkcg->blkg_list); 1144 #ifdef CONFIG_CGROUP_WRITEBACK 1145 INIT_LIST_HEAD(&blkcg->cgwb_list); 1146 #endif 1147 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); 1148 1149 mutex_unlock(&blkcg_pol_mutex); 1150 return &blkcg->css; 1151 1152 free_pd_blkcg: 1153 for (i--; i >= 0; i--) 1154 if (blkcg->cpd[i]) 1155 blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); 1156 1157 if (blkcg != &blkcg_root) 1158 kfree(blkcg); 1159 unlock: 1160 mutex_unlock(&blkcg_pol_mutex); 1161 return ret; 1162 } 1163 1164 static int blkcg_css_online(struct cgroup_subsys_state *css) 1165 { 1166 struct blkcg *blkcg = css_to_blkcg(css); 1167 struct blkcg *parent = blkcg_parent(blkcg); 1168 1169 /* 1170 * blkcg_pin_online() is used to delay blkcg offline so that blkgs 1171 * don't go offline while cgwbs are still active on them. Pin the 1172 * parent so that offline always happens towards the root. 
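 *
 * The matching unpin is done by blkcg_unpin_online() (see blk-cgroup.h):
 * once a blkcg's own online_pin count drops to zero it destroys that
 * blkcg's blkgs and then releases the pin held on its parent, so teardown
 * propagates toward the root.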
1173 */ 1174 if (parent) 1175 blkcg_pin_online(parent); 1176 return 0; 1177 } 1178 1179 /** 1180 * blkcg_init_queue - initialize blkcg part of request queue 1181 * @q: request_queue to initialize 1182 * 1183 * Called from blk_alloc_queue(). Responsible for initializing blkcg 1184 * part of new request_queue @q. 1185 * 1186 * RETURNS: 1187 * 0 on success, -errno on failure. 1188 */ 1189 int blkcg_init_queue(struct request_queue *q) 1190 { 1191 struct blkcg_gq *new_blkg, *blkg; 1192 bool preloaded; 1193 int ret; 1194 1195 INIT_LIST_HEAD(&q->blkg_list); 1196 1197 new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 1198 if (!new_blkg) 1199 return -ENOMEM; 1200 1201 preloaded = !radix_tree_preload(GFP_KERNEL); 1202 1203 /* Make sure the root blkg exists. */ 1204 rcu_read_lock(); 1205 spin_lock_irq(&q->queue_lock); 1206 blkg = blkg_create(&blkcg_root, q, new_blkg); 1207 if (IS_ERR(blkg)) 1208 goto err_unlock; 1209 q->root_blkg = blkg; 1210 spin_unlock_irq(&q->queue_lock); 1211 rcu_read_unlock(); 1212 1213 if (preloaded) 1214 radix_tree_preload_end(); 1215 1216 ret = blk_ioprio_init(q); 1217 if (ret) 1218 goto err_destroy_all; 1219 1220 ret = blk_throtl_init(q); 1221 if (ret) 1222 goto err_destroy_all; 1223 1224 ret = blk_iolatency_init(q); 1225 if (ret) { 1226 blk_throtl_exit(q); 1227 goto err_destroy_all; 1228 } 1229 1230 return 0; 1231 1232 err_destroy_all: 1233 blkg_destroy_all(q); 1234 return ret; 1235 err_unlock: 1236 spin_unlock_irq(&q->queue_lock); 1237 rcu_read_unlock(); 1238 if (preloaded) 1239 radix_tree_preload_end(); 1240 return PTR_ERR(blkg); 1241 } 1242 1243 /** 1244 * blkcg_exit_queue - exit and release blkcg part of request_queue 1245 * @q: request_queue being released 1246 * 1247 * Called from blk_exit_queue(). Responsible for exiting blkcg part. 1248 */ 1249 void blkcg_exit_queue(struct request_queue *q) 1250 { 1251 blkg_destroy_all(q); 1252 blk_throtl_exit(q); 1253 } 1254 1255 static void blkcg_bind(struct cgroup_subsys_state *root_css) 1256 { 1257 int i; 1258 1259 mutex_lock(&blkcg_pol_mutex); 1260 1261 for (i = 0; i < BLKCG_MAX_POLS; i++) { 1262 struct blkcg_policy *pol = blkcg_policy[i]; 1263 struct blkcg *blkcg; 1264 1265 if (!pol || !pol->cpd_bind_fn) 1266 continue; 1267 1268 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) 1269 if (blkcg->cpd[pol->plid]) 1270 pol->cpd_bind_fn(blkcg->cpd[pol->plid]); 1271 } 1272 mutex_unlock(&blkcg_pol_mutex); 1273 } 1274 1275 static void blkcg_exit(struct task_struct *tsk) 1276 { 1277 if (tsk->throttle_queue) 1278 blk_put_queue(tsk->throttle_queue); 1279 tsk->throttle_queue = NULL; 1280 } 1281 1282 struct cgroup_subsys io_cgrp_subsys = { 1283 .css_alloc = blkcg_css_alloc, 1284 .css_online = blkcg_css_online, 1285 .css_offline = blkcg_css_offline, 1286 .css_free = blkcg_css_free, 1287 .css_rstat_flush = blkcg_rstat_flush, 1288 .bind = blkcg_bind, 1289 .dfl_cftypes = blkcg_files, 1290 .legacy_cftypes = blkcg_legacy_files, 1291 .legacy_name = "blkio", 1292 .exit = blkcg_exit, 1293 #ifdef CONFIG_MEMCG 1294 /* 1295 * This ensures that, if available, memcg is automatically enabled 1296 * together on the default hierarchy so that the owner cgroup can 1297 * be retrieved from writeback pages. 1298 */ 1299 .depends_on = 1 << memory_cgrp_id, 1300 #endif 1301 }; 1302 EXPORT_SYMBOL_GPL(io_cgrp_subsys); 1303 1304 /** 1305 * blkcg_activate_policy - activate a blkcg policy on a request_queue 1306 * @q: request_queue of interest 1307 * @pol: blkcg policy to activate 1308 * 1309 * Activate @pol on @q. Requires %GFP_KERNEL context. 
@q goes through 1310 * bypass mode to populate its blkgs with policy_data for @pol. 1311 * 1312 * Activation happens with @q bypassed, so nobody would be accessing blkgs 1313 * from IO path. Update of each blkg is protected by both queue and blkcg 1314 * locks so that holding either lock and testing blkcg_policy_enabled() is 1315 * always enough for dereferencing policy data. 1316 * 1317 * The caller is responsible for synchronizing [de]activations and policy 1318 * [un]registerations. Returns 0 on success, -errno on failure. 1319 */ 1320 int blkcg_activate_policy(struct request_queue *q, 1321 const struct blkcg_policy *pol) 1322 { 1323 struct blkg_policy_data *pd_prealloc = NULL; 1324 struct blkcg_gq *blkg, *pinned_blkg = NULL; 1325 int ret; 1326 1327 if (blkcg_policy_enabled(q, pol)) 1328 return 0; 1329 1330 if (queue_is_mq(q)) 1331 blk_mq_freeze_queue(q); 1332 retry: 1333 spin_lock_irq(&q->queue_lock); 1334 1335 /* blkg_list is pushed at the head, reverse walk to allocate parents first */ 1336 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { 1337 struct blkg_policy_data *pd; 1338 1339 if (blkg->pd[pol->plid]) 1340 continue; 1341 1342 /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ 1343 if (blkg == pinned_blkg) { 1344 pd = pd_prealloc; 1345 pd_prealloc = NULL; 1346 } else { 1347 pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, 1348 blkg->blkcg); 1349 } 1350 1351 if (!pd) { 1352 /* 1353 * GFP_NOWAIT failed. Free the existing one and 1354 * prealloc for @blkg w/ GFP_KERNEL. 1355 */ 1356 if (pinned_blkg) 1357 blkg_put(pinned_blkg); 1358 blkg_get(blkg); 1359 pinned_blkg = blkg; 1360 1361 spin_unlock_irq(&q->queue_lock); 1362 1363 if (pd_prealloc) 1364 pol->pd_free_fn(pd_prealloc); 1365 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, 1366 blkg->blkcg); 1367 if (pd_prealloc) 1368 goto retry; 1369 else 1370 goto enomem; 1371 } 1372 1373 blkg->pd[pol->plid] = pd; 1374 pd->blkg = blkg; 1375 pd->plid = pol->plid; 1376 } 1377 1378 /* all allocated, init in the same order */ 1379 if (pol->pd_init_fn) 1380 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) 1381 pol->pd_init_fn(blkg->pd[pol->plid]); 1382 1383 __set_bit(pol->plid, q->blkcg_pols); 1384 ret = 0; 1385 1386 spin_unlock_irq(&q->queue_lock); 1387 out: 1388 if (queue_is_mq(q)) 1389 blk_mq_unfreeze_queue(q); 1390 if (pinned_blkg) 1391 blkg_put(pinned_blkg); 1392 if (pd_prealloc) 1393 pol->pd_free_fn(pd_prealloc); 1394 return ret; 1395 1396 enomem: 1397 /* alloc failed, nothing's initialized yet, free everything */ 1398 spin_lock_irq(&q->queue_lock); 1399 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1400 struct blkcg *blkcg = blkg->blkcg; 1401 1402 spin_lock(&blkcg->lock); 1403 if (blkg->pd[pol->plid]) { 1404 pol->pd_free_fn(blkg->pd[pol->plid]); 1405 blkg->pd[pol->plid] = NULL; 1406 } 1407 spin_unlock(&blkcg->lock); 1408 } 1409 spin_unlock_irq(&q->queue_lock); 1410 ret = -ENOMEM; 1411 goto out; 1412 } 1413 EXPORT_SYMBOL_GPL(blkcg_activate_policy); 1414 1415 /** 1416 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue 1417 * @q: request_queue of interest 1418 * @pol: blkcg policy to deactivate 1419 * 1420 * Deactivate @pol on @q. Follows the same synchronization rules as 1421 * blkcg_activate_policy(). 
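 *
 * A policy normally pairs the two calls from its own per-queue setup and
 * teardown, the way blk_throtl_init()/blk_throtl_exit() do for blk-throttle.
 * A minimal sketch for a hypothetical policy "blkcg_policy_foo":
 *
 *	int blk_foo_init(struct request_queue *q)
 *	{
 *		return blkcg_activate_policy(q, &blkcg_policy_foo);
 *	}
 *
 *	void blk_foo_exit(struct request_queue *q)
 *	{
 *		blkcg_deactivate_policy(q, &blkcg_policy_foo);
 *	}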
1422 */ 1423 void blkcg_deactivate_policy(struct request_queue *q, 1424 const struct blkcg_policy *pol) 1425 { 1426 struct blkcg_gq *blkg; 1427 1428 if (!blkcg_policy_enabled(q, pol)) 1429 return; 1430 1431 if (queue_is_mq(q)) 1432 blk_mq_freeze_queue(q); 1433 1434 spin_lock_irq(&q->queue_lock); 1435 1436 __clear_bit(pol->plid, q->blkcg_pols); 1437 1438 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1439 struct blkcg *blkcg = blkg->blkcg; 1440 1441 spin_lock(&blkcg->lock); 1442 if (blkg->pd[pol->plid]) { 1443 if (pol->pd_offline_fn) 1444 pol->pd_offline_fn(blkg->pd[pol->plid]); 1445 pol->pd_free_fn(blkg->pd[pol->plid]); 1446 blkg->pd[pol->plid] = NULL; 1447 } 1448 spin_unlock(&blkcg->lock); 1449 } 1450 1451 spin_unlock_irq(&q->queue_lock); 1452 1453 if (queue_is_mq(q)) 1454 blk_mq_unfreeze_queue(q); 1455 } 1456 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); 1457 1458 /** 1459 * blkcg_policy_register - register a blkcg policy 1460 * @pol: blkcg policy to register 1461 * 1462 * Register @pol with blkcg core. Might sleep and @pol may be modified on 1463 * successful registration. Returns 0 on success and -errno on failure. 1464 */ 1465 int blkcg_policy_register(struct blkcg_policy *pol) 1466 { 1467 struct blkcg *blkcg; 1468 int i, ret; 1469 1470 mutex_lock(&blkcg_pol_register_mutex); 1471 mutex_lock(&blkcg_pol_mutex); 1472 1473 /* find an empty slot */ 1474 ret = -ENOSPC; 1475 for (i = 0; i < BLKCG_MAX_POLS; i++) 1476 if (!blkcg_policy[i]) 1477 break; 1478 if (i >= BLKCG_MAX_POLS) { 1479 pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); 1480 goto err_unlock; 1481 } 1482 1483 /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ 1484 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || 1485 (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) 1486 goto err_unlock; 1487 1488 /* register @pol */ 1489 pol->plid = i; 1490 blkcg_policy[pol->plid] = pol; 1491 1492 /* allocate and install cpd's */ 1493 if (pol->cpd_alloc_fn) { 1494 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1495 struct blkcg_policy_data *cpd; 1496 1497 cpd = pol->cpd_alloc_fn(GFP_KERNEL); 1498 if (!cpd) 1499 goto err_free_cpds; 1500 1501 blkcg->cpd[pol->plid] = cpd; 1502 cpd->blkcg = blkcg; 1503 cpd->plid = pol->plid; 1504 if (pol->cpd_init_fn) 1505 pol->cpd_init_fn(cpd); 1506 } 1507 } 1508 1509 mutex_unlock(&blkcg_pol_mutex); 1510 1511 /* everything is in place, add intf files for the new policy */ 1512 if (pol->dfl_cftypes) 1513 WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, 1514 pol->dfl_cftypes)); 1515 if (pol->legacy_cftypes) 1516 WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, 1517 pol->legacy_cftypes)); 1518 mutex_unlock(&blkcg_pol_register_mutex); 1519 return 0; 1520 1521 err_free_cpds: 1522 if (pol->cpd_free_fn) { 1523 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1524 if (blkcg->cpd[pol->plid]) { 1525 pol->cpd_free_fn(blkcg->cpd[pol->plid]); 1526 blkcg->cpd[pol->plid] = NULL; 1527 } 1528 } 1529 } 1530 blkcg_policy[pol->plid] = NULL; 1531 err_unlock: 1532 mutex_unlock(&blkcg_pol_mutex); 1533 mutex_unlock(&blkcg_pol_register_mutex); 1534 return ret; 1535 } 1536 EXPORT_SYMBOL_GPL(blkcg_policy_register); 1537 1538 /** 1539 * blkcg_policy_unregister - unregister a blkcg policy 1540 * @pol: blkcg policy to unregister 1541 * 1542 * Undo blkcg_policy_register(@pol). Might sleep. 
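 * A modular policy typically calls this from its module/elevator exit path,
 * once the policy can no longer be activated on any queue (bfq, for
 * instance, unregisters blkcg_policy_bfq when the scheduler is unloaded).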
1543 */ 1544 void blkcg_policy_unregister(struct blkcg_policy *pol) 1545 { 1546 struct blkcg *blkcg; 1547 1548 mutex_lock(&blkcg_pol_register_mutex); 1549 1550 if (WARN_ON(blkcg_policy[pol->plid] != pol)) 1551 goto out_unlock; 1552 1553 /* kill the intf files first */ 1554 if (pol->dfl_cftypes) 1555 cgroup_rm_cftypes(pol->dfl_cftypes); 1556 if (pol->legacy_cftypes) 1557 cgroup_rm_cftypes(pol->legacy_cftypes); 1558 1559 /* remove cpds and unregister */ 1560 mutex_lock(&blkcg_pol_mutex); 1561 1562 if (pol->cpd_free_fn) { 1563 list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { 1564 if (blkcg->cpd[pol->plid]) { 1565 pol->cpd_free_fn(blkcg->cpd[pol->plid]); 1566 blkcg->cpd[pol->plid] = NULL; 1567 } 1568 } 1569 } 1570 blkcg_policy[pol->plid] = NULL; 1571 1572 mutex_unlock(&blkcg_pol_mutex); 1573 out_unlock: 1574 mutex_unlock(&blkcg_pol_register_mutex); 1575 } 1576 EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1577 1578 bool __blkcg_punt_bio_submit(struct bio *bio) 1579 { 1580 struct blkcg_gq *blkg = bio->bi_blkg; 1581 1582 /* consume the flag first */ 1583 bio->bi_opf &= ~REQ_CGROUP_PUNT; 1584 1585 /* never bounce for the root cgroup */ 1586 if (!blkg->parent) 1587 return false; 1588 1589 spin_lock_bh(&blkg->async_bio_lock); 1590 bio_list_add(&blkg->async_bios, bio); 1591 spin_unlock_bh(&blkg->async_bio_lock); 1592 1593 queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); 1594 return true; 1595 } 1596 1597 /* 1598 * Scale the accumulated delay based on how long it has been since we updated 1599 * the delay. We only call this when we are adding delay, in case it's been a 1600 * while since we added delay, and when we are checking to see if we need to 1601 * delay a task, to account for any delays that may have occurred. 1602 */ 1603 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) 1604 { 1605 u64 old = atomic64_read(&blkg->delay_start); 1606 1607 /* negative use_delay means no scaling, see blkcg_set_delay() */ 1608 if (atomic_read(&blkg->use_delay) < 0) 1609 return; 1610 1611 /* 1612 * We only want to scale down every second. The idea here is that we 1613 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain 1614 * time window. We only want to throttle tasks for recent delay that 1615 * has occurred, in 1 second time windows since that's the maximum 1616 * things can be throttled. We save the current delay window in 1617 * blkg->last_delay so we know what amount is still left to be charged 1618 * to the blkg from this point onward. blkg->last_use keeps track of 1619 * the use_delay counter. The idea is if we're unthrottling the blkg we 1620 * are ok with whatever is happening now, and we can take away more of 1621 * the accumulated delay as we've already throttled enough that 1622 * everybody is happy with their IO latencies. 1623 */ 1624 if (time_before64(old + NSEC_PER_SEC, now) && 1625 atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { 1626 u64 cur = atomic64_read(&blkg->delay_nsec); 1627 u64 sub = min_t(u64, blkg->last_delay, now - old); 1628 int cur_use = atomic_read(&blkg->use_delay); 1629 1630 /* 1631 * We've been unthrottled, subtract a larger chunk of our 1632 * accumulated delay. 1633 */ 1634 if (cur_use < blkg->last_use) 1635 sub = max_t(u64, sub, blkg->last_delay >> 1); 1636 1637 /* 1638 * This shouldn't happen, but handle it anyway. Our delay_nsec 1639 * should only ever be growing except here where we subtract out 1640 * min(last_delay, 1 second), but lord knows bugs happen and I'd 1641 * rather not end up with negative numbers. 
1642 */ 1643 if (unlikely(cur < sub)) { 1644 atomic64_set(&blkg->delay_nsec, 0); 1645 blkg->last_delay = 0; 1646 } else { 1647 atomic64_sub(sub, &blkg->delay_nsec); 1648 blkg->last_delay = cur - sub; 1649 } 1650 blkg->last_use = cur_use; 1651 } 1652 } 1653 1654 /* 1655 * This is called when we want to actually walk up the hierarchy and check to 1656 * see if we need to throttle, and then actually throttle if there is some 1657 * accumulated delay. This should only be called upon return to user space so 1658 * we're not holding some lock that would induce a priority inversion. 1659 */ 1660 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) 1661 { 1662 unsigned long pflags; 1663 bool clamp; 1664 u64 now = ktime_to_ns(ktime_get()); 1665 u64 exp; 1666 u64 delay_nsec = 0; 1667 int tok; 1668 1669 while (blkg->parent) { 1670 int use_delay = atomic_read(&blkg->use_delay); 1671 1672 if (use_delay) { 1673 u64 this_delay; 1674 1675 blkcg_scale_delay(blkg, now); 1676 this_delay = atomic64_read(&blkg->delay_nsec); 1677 if (this_delay > delay_nsec) { 1678 delay_nsec = this_delay; 1679 clamp = use_delay > 0; 1680 } 1681 } 1682 blkg = blkg->parent; 1683 } 1684 1685 if (!delay_nsec) 1686 return; 1687 1688 /* 1689 * Let's not sleep for all eternity if we've amassed a huge delay. 1690 * Swapping or metadata IO can accumulate 10's of seconds worth of 1691 * delay, and we want userspace to be able to do _something_ so cap the 1692 * delays at 0.25s. If there's 10's of seconds worth of delay then the 1693 * tasks will be delayed for 0.25 second for every syscall. If 1694 * blkcg_set_delay() was used as indicated by negative use_delay, the 1695 * caller is responsible for regulating the range. 1696 */ 1697 if (clamp) 1698 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); 1699 1700 if (use_memdelay) 1701 psi_memstall_enter(&pflags); 1702 1703 exp = ktime_add_ns(now, delay_nsec); 1704 tok = io_schedule_prepare(); 1705 do { 1706 __set_current_state(TASK_KILLABLE); 1707 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) 1708 break; 1709 } while (!fatal_signal_pending(current)); 1710 io_schedule_finish(tok); 1711 1712 if (use_memdelay) 1713 psi_memstall_leave(&pflags); 1714 } 1715 1716 /** 1717 * blkcg_maybe_throttle_current - throttle the current task if it has been marked 1718 * 1719 * This is only called if we've been marked with set_notify_resume(). Obviously 1720 * we can be set_notify_resume() for reasons other than blkcg throttling, so we 1721 * check to see if current->throttle_queue is set and if not this doesn't do 1722 * anything. This should only ever be called by the resume code, it's not meant 1723 * to be called by people willy-nilly as it will actually do the work to 1724 * throttle the task if it is setup for throttling. 
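 *
 * The actual check runs from the resume_user_mode_work() path when the task
 * heads back to user space (see <linux/resume_user_mode.h>, included above).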
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct cgroup_subsys_state *css;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	css = kthread_blkcg();
	if (css)
		blkcg = css_to_blkcg(css);
	else
		blkcg = css_to_blkcg(task_css(current, io_cgrp_id));

	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task. We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point. This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall. You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once. If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_queue != q) {
		if (!blk_get_queue(q))
			return;

		if (current->throttle_queue)
			blk_put_queue(current->throttle_queue);
		current->throttle_queue = q;
	}

	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation. This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid. This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
1826 */ 1827 static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, 1828 struct cgroup_subsys_state *css) 1829 { 1830 struct blkcg_gq *blkg, *ret_blkg = NULL; 1831 1832 rcu_read_lock(); 1833 blkg = blkg_lookup_create(css_to_blkcg(css), 1834 bdev_get_queue(bio->bi_bdev)); 1835 while (blkg) { 1836 if (blkg_tryget(blkg)) { 1837 ret_blkg = blkg; 1838 break; 1839 } 1840 blkg = blkg->parent; 1841 } 1842 rcu_read_unlock(); 1843 1844 return ret_blkg; 1845 } 1846 1847 /** 1848 * bio_associate_blkg_from_css - associate a bio with a specified css 1849 * @bio: target bio 1850 * @css: target css 1851 * 1852 * Associate @bio with the blkg found by combining the css's blkg and the 1853 * request_queue of the @bio. An association failure is handled by walking up 1854 * the blkg tree. Therefore, the blkg associated can be anything between @blkg 1855 * and q->root_blkg. This situation only happens when a cgroup is dying and 1856 * then the remaining bios will spill to the closest alive blkg. 1857 * 1858 * A reference will be taken on the blkg and will be released when @bio is 1859 * freed. 1860 */ 1861 void bio_associate_blkg_from_css(struct bio *bio, 1862 struct cgroup_subsys_state *css) 1863 { 1864 if (bio->bi_blkg) 1865 blkg_put(bio->bi_blkg); 1866 1867 if (css && css->parent) { 1868 bio->bi_blkg = blkg_tryget_closest(bio, css); 1869 } else { 1870 blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); 1871 bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; 1872 } 1873 } 1874 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); 1875 1876 /** 1877 * bio_associate_blkg - associate a bio with a blkg 1878 * @bio: target bio 1879 * 1880 * Associate @bio with the blkg found from the bio's css and request_queue. 1881 * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is 1882 * already associated, the css is reused and association redone as the 1883 * request_queue may have changed. 1884 */ 1885 void bio_associate_blkg(struct bio *bio) 1886 { 1887 struct cgroup_subsys_state *css; 1888 1889 rcu_read_lock(); 1890 1891 if (bio->bi_blkg) 1892 css = &bio_blkcg(bio)->css; 1893 else 1894 css = blkcg_css(); 1895 1896 bio_associate_blkg_from_css(bio, css); 1897 1898 rcu_read_unlock(); 1899 } 1900 EXPORT_SYMBOL_GPL(bio_associate_blkg); 1901 1902 /** 1903 * bio_clone_blkg_association - clone blkg association from src to dst bio 1904 * @dst: destination bio 1905 * @src: source bio 1906 */ 1907 void bio_clone_blkg_association(struct bio *dst, struct bio *src) 1908 { 1909 if (src->bi_blkg) { 1910 if (dst->bi_blkg) 1911 blkg_put(dst->bi_blkg); 1912 blkg_get(src->bi_blkg); 1913 dst->bi_blkg = src->bi_blkg; 1914 } 1915 } 1916 EXPORT_SYMBOL_GPL(bio_clone_blkg_association); 1917 1918 static int blk_cgroup_io_type(struct bio *bio) 1919 { 1920 if (op_is_discard(bio->bi_opf)) 1921 return BLKG_IOSTAT_DISCARD; 1922 if (op_is_write(bio->bi_opf)) 1923 return BLKG_IOSTAT_WRITE; 1924 return BLKG_IOSTAT_READ; 1925 } 1926 1927 void blk_cgroup_bio_start(struct bio *bio) 1928 { 1929 int rwd = blk_cgroup_io_type(bio), cpu; 1930 struct blkg_iostat_set *bis; 1931 unsigned long flags; 1932 1933 cpu = get_cpu(); 1934 bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); 1935 flags = u64_stats_update_begin_irqsave(&bis->sync); 1936 1937 /* 1938 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split 1939 * bio and we would have already accounted for the size of the bio. 
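 * Such a split bio therefore only bumps the io count below; its bytes were
 * already charged when the original bio first passed through here.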
1940 */ 1941 if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { 1942 bio_set_flag(bio, BIO_CGROUP_ACCT); 1943 bis->cur.bytes[rwd] += bio->bi_iter.bi_size; 1944 } 1945 bis->cur.ios[rwd]++; 1946 1947 u64_stats_update_end_irqrestore(&bis->sync, flags); 1948 if (cgroup_subsys_on_dfl(io_cgrp_subsys)) 1949 cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); 1950 put_cpu(); 1951 } 1952 1953 static int __init blkcg_init(void) 1954 { 1955 blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", 1956 WQ_MEM_RECLAIM | WQ_FREEZABLE | 1957 WQ_UNBOUND | WQ_SYSFS, 0); 1958 if (!blkcg_punt_bio_wq) 1959 return -ENOMEM; 1960 return 0; 1961 } 1962 subsys_initcall(blkcg_init); 1963 1964 module_param(blkcg_debug_stats, bool, 0644); 1965 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); 1966
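/*
 * Putting the pieces above together: a minimal sketch (not part of this
 * file) of how a blkcg policy plugs into this framework. Everything named
 * "foo" below is hypothetical; real users are blk-throttle, blk-iolatency,
 * blk-ioprio and bfq. A policy embeds struct blkg_policy_data in its
 * per-(cgroup, queue) structure, supplies pd_alloc_fn()/pd_free_fn(), and
 * registers itself with blkcg_policy_register(). A cftype write handler
 * uses blkg_conf_prep()/blkg_conf_finish() to resolve "MAJ:MIN value"
 * input to the right blkg.
 *
 *	struct foo_grp {
 *		struct blkg_policy_data pd;	// embedded, used by the core
 *		u64 limit;
 *	};
 *
 *	static struct foo_grp *pd_to_foo(struct blkg_policy_data *pd)
 *	{
 *		return pd ? container_of(pd, struct foo_grp, pd) : NULL;
 *	}
 *
 *	static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp,
 *			struct request_queue *q, struct blkcg *blkcg)
 *	{
 *		struct foo_grp *fg = kzalloc_node(sizeof(*fg), gfp, q->node);
 *
 *		return fg ? &fg->pd : NULL;
 *	}
 *
 *	static void foo_pd_free(struct blkg_policy_data *pd)
 *	{
 *		kfree(pd_to_foo(pd));
 *	}
 *
 *	static struct blkcg_policy blkcg_policy_foo = {
 *		.pd_alloc_fn	= foo_pd_alloc,
 *		.pd_free_fn	= foo_pd_free,
 *	};
 *
 *	static ssize_t foo_limit_write(struct kernfs_open_file *of, char *buf,
 *				       size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		u64 v;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
 *		if (ret)
 *			return ret;
 *
 *		ret = -EINVAL;
 *		if (sscanf(ctx.body, "%llu", &v) == 1) {
 *			pd_to_foo(ctx.blkg->pd[blkcg_policy_foo.plid])->limit = v;
 *			ret = 0;
 *		}
 *
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 *
 *	// registration, e.g. from an initcall or module_init of the policy
 *	ret = blkcg_policy_register(&blkcg_policy_foo);
 *	...
 *	blkcg_policy_unregister(&blkcg_policy_foo);
 *
 * The write handler would be reachable through a cftype listed in
 * blkcg_policy_foo.dfl_cftypes or .legacy_cftypes. Activation on a
 * particular queue is then done with blkcg_activate_policy(q,
 * &blkcg_policy_foo) as described above, and per-blkg values can be
 * printed with blkcg_print_blkgs() together with __blkg_prfill_u64().
 */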