// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);	/* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;

#define BLKG_DESTROY_BATCH_SIZE	64

/**
 * blkcg_css - find the current css
 *
 * Find the css associated with either the kthread or the current task.
 * This may return a dying css, so it is up to the caller to use tryget logic
 * to confirm it is alive and well.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
	struct cgroup_subsys_state *css;

	css = kthread_blkcg();
	if (css)
		return css;
	return task_css(current, io_cgrp_id);
}

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

static void blkg_free_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     free_work);
	int i;

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkg->pd[i])
			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);

	if (blkg->q)
		blk_put_queue(blkg->q);
	free_percpu(blkg->iostat_cpu);
	percpu_ref_exit(&blkg->refcnt);
	kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	if (!blkg)
		return;

	/*
	 * Both ->pd_free_fn() and the request queue's release handler may
	 * sleep, so free @blkg by scheduling a work function.
	 */
	INIT_WORK(&blkg->free_work, blkg_free_workfn);
	schedule_work(&blkg->free_work);
}

static void __blkg_release(struct rcu_head *rcu)
{
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

	WARN_ON(!bio_list_empty(&blkg->async_bios));

	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);
	blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

	call_rcu(&blkg->rcu_head, __blkg_release);
}

static void blkg_async_bio_workfn(struct work_struct *work)
{
	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
					     async_bio_work);
	struct bio_list bios = BIO_EMPTY_LIST;
	struct bio *bio;
	struct blk_plug plug;
	bool need_plug = false;

	/* as long as there are pending bios, @blkg can't go away */
	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_merge(&bios, &blkg->async_bios);
	bio_list_init(&blkg->async_bios);
	spin_unlock_bh(&blkg->async_bio_lock);

	/* start plug only when bio_list contains at least 2 bios */
	if (bios.head && bios.head->bi_next) {
		need_plug = true;
		blk_start_plug(&plug);
	}
	while ((bio = bio_list_pop(&bios)))
		submit_bio(bio);
	if (need_plug)
		blk_finish_plug(&plug);
}

/**
 * bio_blkcg_css - return the blkcg CSS associated with a bio
 * @bio: target bio
 *
 * This returns the CSS for the blkcg associated with a bio, or %NULL if not
 * associated.  Callers are expected to either handle %NULL or know association
 * has been done prior to calling this.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
	if (!bio || !bio->bi_blkg)
		return NULL;
	return &bio->bi_blkg->blkcg->css;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Return the parent blkcg of @blkcg.  Can be called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
	return css_to_blkcg(blkcg->css.parent);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i, cpu;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
		goto err_free;

	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
	if (!blkg->iostat_cpu)
		goto err_free;

	if (!blk_get_queue(q))
		goto err_free;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	spin_lock_init(&blkg->async_bio_lock);
	bio_list_init(&blkg->async_bios);
	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	u64_stats_init(&blkg->iostat.sync);
	for_each_possible_cpu(cpu)
		u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(&q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
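
/*
 * Usage sketch (illustrative only): blkg_lookup() in blk-cgroup.h checks
 * blkcg->blkg_hint first and falls back to blkg_lookup_slowpath() above.
 * A typical read-side caller, similar to blkcg_maybe_throttle_current()
 * below, looks roughly like this, assuming @q is a request_queue the
 * caller already holds a reference on:
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg && blkg_tryget(blkg)) {
 *		... use per-blkg state such as stats or delay counters ...
 *		blkg_put(blkg);
 *	}
 *	rcu_read_unlock();
 *
 * Passing update_hint == true to blkg_lookup_slowpath() additionally
 * requires q->queue_lock, as asserted above.
 */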

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	lockdep_assert_held(&q->queue_lock);

	/* request_queue is dying, do not create/recreate a blkg */
	if (blk_queue_dying(q)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* blkg holds a reference to blkcg */
	if (!css_tryget_online(&blkcg->css)) {
		ret = -ENODEV;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			ret = -ENODEV;
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_init_fn)
			pol->pd_init_fn(blkg->pd[i]);
	}

	/* insert */
	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg->pd[i]);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @q->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
					   struct request_queue *q)
{
	struct blkcg_gq *blkg;
	unsigned long flags;

	WARN_ON_ONCE(!rcu_read_lock_held());

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	spin_lock_irqsave(&q->queue_lock, flags);
	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		goto found;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.  Returns the closest
	 * blkg to the intended blkg should blkg_create() fail.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);
		struct blkcg_gq *ret_blkg = q->root_blkg;

		while (parent) {
			blkg = __blkg_lookup(parent, q, false);
			if (blkg) {
				/* remember closest blkg */
				ret_blkg = blkg;
				break;
			}
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (IS_ERR(blkg)) {
			blkg = ret_blkg;
			break;
		}
		if (pos == blkcg)
			break;
	}

found:
	spin_unlock_irqrestore(&q->queue_lock, flags);
	return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(&blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg->pd[i]);
	}

	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting lookup hint to and clearing it from @blkg are done
	 * under queue_lock.  If it's not pointing to @blkg now, it never
	 * will.  Hint assignment itself can race safely.
	 */
	if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	percpu_ref_kill(&blkg->refcnt);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;
	int count = BLKG_DESTROY_BATCH_SIZE;

restart:
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);

		/*
		 * in order to avoid holding the spin lock for too long, release
		 * it when a batch of blkgs are destroyed.
		 */
		if (!(--count)) {
			count = BLKG_DESTROY_BATCH_SIZE;
			spin_unlock_irq(&q->queue_lock);
			cond_resched();
			goto restart;
		}
	}

	q->root_blkg = NULL;
	spin_unlock_irq(&q->queue_lock);
}

static int blkcg_reset_stats(struct cgroup_subsys_state *css,
			     struct cftype *cftype, u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;
	int i, cpu;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for_each_possible_cpu(cpu) {
			struct blkg_iostat_set *bis =
				per_cpu_ptr(blkg->iostat_cpu, cpu);
			memset(bis, 0, sizeof(*bis));
		}
		memset(&blkg->iostat, 0, sizeof(blkg->iostat));

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg->pd[i]);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
		return NULL;
	return bdi_dev_name(blkg->q->disk->bdi);
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
					  const struct blkcg_policy *pol,
					  struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(&q->queue_lock);

	if (!blkcg_policy_enabled(q, pol))
		return ERR_PTR(-EOPNOTSUPP);
	return __blkg_lookup(blkcg, q, true /* update_hint */);
}

/**
 * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @inputp: input string pointer
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
 * from @input and get and return the matching bdev.  *@inputp is
 * updated to point past the device node prefix.  Returns an ERR_PTR()
 * value on error.
 *
 * Use this function iff blkg_conf_prep() can't be used for some reason.
 */
struct block_device *blkcg_conf_open_bdev(char **inputp)
{
	char *input = *inputp;
	unsigned int major, minor;
	struct block_device *bdev;
	int key_len;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return ERR_PTR(-EINVAL);

	input += key_len;
	if (!isspace(*input))
		return ERR_PTR(-EINVAL);
	input = skip_spaces(input);

	bdev = blkdev_get_no_open(MKDEV(major, minor));
	if (!bdev)
		return ERR_PTR(-ENODEV);
	if (bdev_is_partition(bdev)) {
		blkdev_put_no_open(bdev);
		return ERR_PTR(-ENODEV);
	}

	*inputp = input;
	return bdev;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
 * part of @input following MAJ:MIN.  This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{
	struct block_device *bdev;
	struct request_queue *q;
	struct blkcg_gq *blkg;
	int ret;

	bdev = blkcg_conf_open_bdev(&input);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	q = bdev_get_queue(bdev);

	/*
	 * blkcg_deactivate_policy() requires the queue to be frozen; grab
	 * q_usage_counter to prevent racing with blkcg_deactivate_policy().
	 */
	ret = blk_queue_enter(q, 0);
	if (ret)
		goto fail;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);

	blkg = blkg_lookup_check(blkcg, pol, q);
	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto fail_unlock;
	}

	if (blkg)
		goto success;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent;
		struct blkcg_gq *new_blkg;

		parent = blkcg_parent(blkcg);
		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
		spin_unlock_irq(&q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		if (radix_tree_preload(GFP_KERNEL)) {
			blkg_free(new_blkg);
			ret = -ENOMEM;
			goto fail_exit_queue;
		}

		rcu_read_lock();
		spin_lock_irq(&q->queue_lock);

		blkg = blkg_lookup_check(pos, pol, q);
		if (IS_ERR(blkg)) {
			ret = PTR_ERR(blkg);
			blkg_free(new_blkg);
			goto fail_preloaded;
		}

		if (blkg) {
			blkg_free(new_blkg);
		} else {
			blkg = blkg_create(pos, q, new_blkg);
			if (IS_ERR(blkg)) {
				ret = PTR_ERR(blkg);
				goto fail_preloaded;
			}
		}

		radix_tree_preload_end();

		if (pos == blkcg)
			goto success;
	}
success:
	blk_queue_exit(q);
	ctx->bdev = bdev;
	ctx->blkg = blkg;
	ctx->body = input;
	return 0;

fail_preloaded:
	radix_tree_preload_end();
fail_unlock:
	spin_unlock_irq(&q->queue_lock);
	rcu_read_unlock();
fail_exit_queue:
	blk_queue_exit(q);
fail:
	blkdev_put_no_open(bdev);
	/*
	 * If queue was bypassing, we should retry.  Do so after a
	 * short msleep().  It isn't strictly necessary but queue
	 * can be bypassing for some time and it's always nice to
	 * avoid busy looping.
	 */
	if (ret == -EBUSY) {
		msleep(10);
		ret = restart_syscall();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
	rcu_read_unlock();
	blkdev_put_no_open(ctx->bdev);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
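
/*
 * Usage sketch (illustrative only, not taken from an in-tree policy): a
 * policy's cftype write handler typically wraps the prep/finish pair like
 * this, where "mypol_policy" and the body parsing are hypothetical:
 *
 *	static ssize_t mypol_set_limit(struct kernfs_open_file *of,
 *				       char *buf, size_t nbytes, loff_t off)
 *	{
 *		struct blkcg *blkcg = css_to_blkcg(of_css(of));
 *		struct blkg_conf_ctx ctx;
 *		int ret;
 *
 *		ret = blkg_conf_prep(blkcg, &mypol_policy, buf, &ctx);
 *		if (ret)
 *			return ret;
 *		ret = ...parse ctx.body and update ctx.blkg's policy data...;
 *		blkg_conf_finish(&ctx);
 *		return ret ?: nbytes;
 *	}
 *
 * The RCU read lock and queue_lock are held between the two calls, so the
 * update itself must not sleep.
 */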

static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] = src->bytes[i];
		dst->ios[i] = src->ios[i];
	}
}

static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
	int i;

	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct blkcg_gq *blkg;

	/* Root-level stats are sourced from system-wide IO stats */
	if (!cgroup_parent(css->cgroup))
		return;

	rcu_read_lock();

	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		struct blkcg_gq *parent = blkg->parent;
		struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
		struct blkg_iostat cur, delta;
		unsigned long flags;
		unsigned int seq;

		/* fetch the current per-cpu values */
		do {
			seq = u64_stats_fetch_begin(&bisc->sync);
			blkg_iostat_set(&cur, &bisc->cur);
		} while (u64_stats_fetch_retry(&bisc->sync, seq));

		/* propagate percpu delta to global */
		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&delta, &cur);
		blkg_iostat_sub(&delta, &bisc->last);
		blkg_iostat_add(&blkg->iostat.cur, &delta);
		blkg_iostat_add(&bisc->last, &delta);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);

		/* propagate global delta to parent (unless that's root) */
		if (parent && parent->parent) {
			flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
			blkg_iostat_set(&delta, &blkg->iostat.cur);
			blkg_iostat_sub(&delta, &blkg->iostat.last);
			blkg_iostat_add(&parent->iostat.cur, &delta);
			blkg_iostat_add(&blkg->iostat.last, &delta);
			u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
		}
	}

	rcu_read_unlock();
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
	while ((dev = class_dev_iter_next(&iter))) {
		struct block_device *bdev = dev_to_bdev(dev);
		struct blkcg_gq *blkg =
			blk_queue_root_blkg(bdev_get_queue(bdev));
		struct blkg_iostat tmp;
		int cpu;
		unsigned long flags;

		memset(&tmp, 0, sizeof(tmp));
		for_each_possible_cpu(cpu) {
			struct disk_stats *cpu_dkstats;

			cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
			tmp.ios[BLKG_IOSTAT_READ] +=
				cpu_dkstats->ios[STAT_READ];
			tmp.ios[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->ios[STAT_WRITE];
			tmp.ios[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->ios[STAT_DISCARD];
			/* convert sectors to bytes */
			tmp.bytes[BLKG_IOSTAT_READ] +=
				cpu_dkstats->sectors[STAT_READ] << 9;
			tmp.bytes[BLKG_IOSTAT_WRITE] +=
				cpu_dkstats->sectors[STAT_WRITE] << 9;
			tmp.bytes[BLKG_IOSTAT_DISCARD] +=
				cpu_dkstats->sectors[STAT_DISCARD] << 9;
		}

		flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
		blkg_iostat_set(&blkg->iostat.cur, &tmp);
		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
	}
}

static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
	struct blkg_iostat_set *bis = &blkg->iostat;
	u64 rbytes, wbytes, rios, wios, dbytes, dios;
	const char *dname;
	unsigned seq;
	int i;

	if (!blkg->online)
		return;

	dname = blkg_dev_name(blkg);
	if (!dname)
		return;

	seq_printf(s, "%s ", dname);

	do {
		seq = u64_stats_fetch_begin(&bis->sync);

		rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
		wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
		dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
		rios = bis->cur.ios[BLKG_IOSTAT_READ];
		wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
		dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
	} while (u64_stats_fetch_retry(&bis->sync, seq));

	if (rbytes || wbytes || rios || wios) {
		seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
			   rbytes, wbytes, rios, wios,
			   dbytes, dios);
	}

	if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
		seq_printf(s, " use_delay=%d delay_nsec=%llu",
			   atomic_read(&blkg->use_delay),
			   atomic64_read(&blkg->delay_nsec));
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (!blkg->pd[i] || !pol->pd_stat_fn)
			continue;

		pol->pd_stat_fn(blkg->pd[i], s);
	}

	seq_puts(s, "\n");
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct blkcg_gq *blkg;

	if (!seq_css(sf)->parent)
		blkcg_fill_root_iostats();
	else
		cgroup_rstat_flush(blkcg->css.cgroup);

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(&blkg->q->queue_lock);
		blkcg_print_one_stat(blkg, sf);
		spin_unlock_irq(&blkg->q->queue_lock);
	}
	rcu_read_unlock();
	return 0;
}

static struct cftype blkcg_files[] = {
	{
		.name = "stat",
		.seq_show = blkcg_print_stat,
	},
	{ }	/* terminate */
};

static struct cftype blkcg_legacy_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};
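
/*
 * Illustrative sketch of the "io.stat" output produced by blkcg_print_stat()
 * and blkcg_print_one_stat() above; the device numbers and counts are of
 * course made up:
 *
 *	$ cat /sys/fs/cgroup/<group>/io.stat
 *	8:0 rbytes=4096 wbytes=12288 rios=1 wios=3 dbytes=0 dios=0
 *
 * Policies with a pd_stat_fn() (and, when blkcg_debug_stats is enabled, the
 * use_delay/delay_nsec debug fields) append further key=value pairs to each
 * per-device line.
 */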

#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
	return &css_to_blkcg(css)->cgwb_list;
}
#endif

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
	might_sleep();

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (need_resched() || !spin_trylock(&q->queue_lock)) {
			/*
			 * Given that the system can accumulate a huge number
			 * of blkgs in pathological cases, check to see if we
			 * need to reschedule to avoid softlockup.
			 */
			spin_unlock_irq(&blkcg->lock);
			cond_resched();
			spin_lock_irq(&blkcg->lock);
			continue;
		}

		blkg_destroy(blkg);
		spin_unlock(&q->queue_lock);
	}

	spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
	refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * When this count goes to zero, all active cgwbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
	struct blkcg *blkcg = css_to_blkcg(blkcg_css);

	do {
		if (!refcount_dec_and_test(&blkcg->online_pin))
			break;
		blkcg_destroy_blkgs(blkcg);
		blkcg = blkcg_parent(blkcg);
	} while (blkcg);
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
	/* this prevents anyone from attaching or migrating to this blkcg */
	wb_blkcg_offline(css);

	/* put the base online pin allowing step 2 to be triggered */
	blkcg_unpin_online(css);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	int i;

	mutex_lock(&blkcg_pol_mutex);

	list_del(&blkcg->all_blkcgs_node);

	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	mutex_unlock(&blkcg_pol_mutex);

	kfree(blkcg);
}

static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct blkcg *blkcg;
	struct cgroup_subsys_state *ret;
	int i;

	mutex_lock(&blkcg_pol_mutex);

	if (!parent_css) {
		blkcg = &blkcg_root;
	} else {
		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg) {
			ret = ERR_PTR(-ENOMEM);
			goto unlock;
		}
	}

	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg_policy_data *cpd;

		/*
		 * If the policy hasn't been attached yet, wait for it
		 * to be attached before doing anything else. Otherwise,
		 * check if the policy requires any specific per-cgroup
		 * data: if it does, allocate and initialize it.
		 */
		if (!pol || !pol->cpd_alloc_fn)
			continue;

		cpd = pol->cpd_alloc_fn(GFP_KERNEL);
		if (!cpd) {
			ret = ERR_PTR(-ENOMEM);
			goto free_pd_blkcg;
		}
		blkcg->cpd[i] = cpd;
		cpd->blkcg = blkcg;
		cpd->plid = i;
		if (pol->cpd_init_fn)
			pol->cpd_init_fn(cpd);
	}

	spin_lock_init(&blkcg->lock);
	refcount_set(&blkcg->online_pin, 1);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
	INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

	mutex_unlock(&blkcg_pol_mutex);
	return &blkcg->css;

free_pd_blkcg:
	for (i--; i >= 0; i--)
		if (blkcg->cpd[i])
			blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}

static int blkcg_css_online(struct cgroup_subsys_state *css)
{
	struct blkcg *parent = blkcg_parent(css_to_blkcg(css));

	/*
	 * blkcg_pin_online() is used to delay blkcg offline so that blkgs
	 * don't go offline while cgwbs are still active on them.  Pin the
	 * parent so that offline always happens towards the root.
	 */
	if (parent)
		blkcg_pin_online(&parent->css);
	return 0;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue().  Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	struct blkcg_gq *new_blkg, *blkg;
	bool preloaded;
	int ret;

	INIT_LIST_HEAD(&q->blkg_list);

	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/* Make sure the root blkg exists. */
	/* spin_lock_irq can serve as RCU read-side critical section. */
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(&blkcg_root, q, new_blkg);
	if (IS_ERR(blkg))
		goto err_unlock;
	q->root_blkg = blkg;
	spin_unlock_irq(&q->queue_lock);

	if (preloaded)
		radix_tree_preload_end();

	ret = blk_ioprio_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_throtl_init(q);
	if (ret)
		goto err_destroy_all;

	ret = blk_iolatency_init(q);
	if (ret) {
		blk_throtl_exit(q);
		goto err_destroy_all;
	}

	return 0;

err_destroy_all:
	blkg_destroy_all(q);
	return ret;
err_unlock:
	spin_unlock_irq(&q->queue_lock);
	if (preloaded)
		radix_tree_preload_end();
	return PTR_ERR(blkg);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_exit_queue().  Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	blkg_destroy_all(q);
	blk_throtl_exit(q);
}

static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
	int i;

	mutex_lock(&blkcg_pol_mutex);

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkcg *blkcg;

		if (!pol || !pol->cpd_bind_fn)
			continue;

		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
			if (blkcg->cpd[pol->plid])
				pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
	}
	mutex_unlock(&blkcg_pol_mutex);
}

static void blkcg_exit(struct task_struct *tsk)
{
	if (tsk->throttle_queue)
		blk_put_queue(tsk->throttle_queue);
	tsk->throttle_queue = NULL;
}

struct cgroup_subsys io_cgrp_subsys = {
	.css_alloc = blkcg_css_alloc,
	.css_online = blkcg_css_online,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.css_rstat_flush = blkcg_rstat_flush,
	.bind = blkcg_bind,
	.dfl_cftypes = blkcg_files,
	.legacy_cftypes = blkcg_legacy_files,
	.legacy_name = "blkio",
	.exit = blkcg_exit,
#ifdef CONFIG_MEMCG
	/*
	 * This ensures that, if available, memcg is automatically enabled
	 * together on the default hierarchy so that the owner cgroup can
	 * be retrieved from writeback pages.
	 */
	.depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registerations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	struct blkg_policy_data *pd_prealloc = NULL;
	struct blkcg_gq *blkg, *pinned_blkg = NULL;
	int ret;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);
retry:
	spin_lock_irq(&q->queue_lock);

	/* blkg_list is pushed at the head, reverse walk to allocate parents first */
	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
		struct blkg_policy_data *pd;

		if (blkg->pd[pol->plid])
			continue;

		/* If prealloc matches, use it; otherwise try GFP_NOWAIT */
		if (blkg == pinned_blkg) {
			pd = pd_prealloc;
			pd_prealloc = NULL;
		} else {
			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
					      blkg->blkcg);
		}

		if (!pd) {
			/*
			 * GFP_NOWAIT failed.  Free the existing one and
			 * prealloc for @blkg w/ GFP_KERNEL.
			 */
			if (pinned_blkg)
				blkg_put(pinned_blkg);
			blkg_get(blkg);
			pinned_blkg = blkg;

			spin_unlock_irq(&q->queue_lock);

			if (pd_prealloc)
				pol->pd_free_fn(pd_prealloc);
			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
						       blkg->blkcg);
			if (pd_prealloc)
				goto retry;
			else
				goto enomem;
		}

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
	}

	/* all allocated, init in the same order */
	if (pol->pd_init_fn)
		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
			pol->pd_init_fn(blkg->pd[pol->plid]);

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;

	spin_unlock_irq(&q->queue_lock);
out:
	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
	if (pinned_blkg)
		blkg_put(pinned_blkg);
	if (pd_prealloc)
		pol->pd_free_fn(pd_prealloc);
	return ret;

enomem:
	/* alloc failed, nothing's initialized yet, free everything */
	spin_lock_irq(&q->queue_lock);
	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}
	spin_unlock_irq(&q->queue_lock);
	ret = -ENOMEM;
	goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	if (queue_is_mq(q))
		blk_mq_freeze_queue(q);

	spin_lock_irq(&q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		if (blkg->pd[pol->plid]) {
			if (pol->pd_offline_fn)
				pol->pd_offline_fn(blkg->pd[pol->plid]);
			pol->pd_free_fn(blkg->pd[pol->plid]);
			blkg->pd[pol->plid] = NULL;
		}
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(&q->queue_lock);

	if (queue_is_mq(q))
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;
	int i, ret;

	mutex_lock(&blkcg_pol_register_mutex);
	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS) {
		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
		goto err_unlock;
	}

	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		goto err_unlock;

	/* register @pol */
	pol->plid = i;
	blkcg_policy[pol->plid] = pol;

	/* allocate and install cpd's */
	if (pol->cpd_alloc_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			struct blkcg_policy_data *cpd;

			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
			if (!cpd)
				goto err_free_cpds;

			blkcg->cpd[pol->plid] = cpd;
			cpd->blkcg = blkcg;
			cpd->plid = pol->plid;
			if (pol->cpd_init_fn)
				pol->cpd_init_fn(cpd);
		}
	}

	mutex_unlock(&blkcg_pol_mutex);

	/* everything is in place, add intf files for the new policy */
	if (pol->dfl_cftypes)
		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
					       pol->dfl_cftypes));
	if (pol->legacy_cftypes)
		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
						  pol->legacy_cftypes));
	mutex_unlock(&blkcg_pol_register_mutex);
	return 0;

err_free_cpds:
	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;
err_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	mutex_unlock(&blkcg_pol_register_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);
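
/*
 * Illustrative sketch of how a policy plugs into the hooks above; the
 * "mypol" names are hypothetical, and real policies (see blk-throttle.c or
 * blk-iolatency.c) supply their own pd/cpd wrapper structs and callbacks:
 *
 *	static struct blkcg_policy blkcg_policy_mypol = {
 *		.pd_alloc_fn	= mypol_pd_alloc,
 *		.pd_init_fn	= mypol_pd_init,
 *		.pd_offline_fn	= mypol_pd_offline,
 *		.pd_free_fn	= mypol_pd_free,
 *	};
 *
 * blkcg_policy_register(&blkcg_policy_mypol) reserves a plid slot and adds
 * the policy's cftypes; blkcg_activate_policy(q, &blkcg_policy_mypol) then
 * attaches per-blkg policy data on a specific queue, and the matching
 * blkcg_deactivate_policy()/blkcg_policy_unregister() undo the two steps.
 */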

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	struct blkcg *blkcg;

	mutex_lock(&blkcg_pol_register_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->dfl_cftypes)
		cgroup_rm_cftypes(pol->dfl_cftypes);
	if (pol->legacy_cftypes)
		cgroup_rm_cftypes(pol->legacy_cftypes);

	/* remove cpds and unregister */
	mutex_lock(&blkcg_pol_mutex);

	if (pol->cpd_free_fn) {
		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
			if (blkcg->cpd[pol->plid]) {
				pol->cpd_free_fn(blkcg->cpd[pol->plid]);
				blkcg->cpd[pol->plid] = NULL;
			}
		}
	}
	blkcg_policy[pol->plid] = NULL;

	mutex_unlock(&blkcg_pol_mutex);
out_unlock:
	mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

bool __blkcg_punt_bio_submit(struct bio *bio)
{
	struct blkcg_gq *blkg = bio->bi_blkg;

	/* consume the flag first */
	bio->bi_opf &= ~REQ_CGROUP_PUNT;

	/* never bounce for the root cgroup */
	if (!blkg->parent)
		return false;

	spin_lock_bh(&blkg->async_bio_lock);
	bio_list_add(&blkg->async_bios, bio);
	spin_unlock_bh(&blkg->async_bio_lock);

	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	return true;
}

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
	u64 old = atomic64_read(&blkg->delay_start);

	/* negative use_delay means no scaling, see blkcg_set_delay() */
	if (atomic_read(&blkg->use_delay) < 0)
		return;

	/*
	 * We only want to scale down every second.  The idea here is that we
	 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
	 * time window.  We only want to throttle tasks for recent delay that
	 * has occurred, in 1 second time windows since that's the maximum
	 * things can be throttled.  We save the current delay window in
	 * blkg->last_delay so we know what amount is still left to be charged
	 * to the blkg from this point onward.  blkg->last_use keeps track of
	 * the use_delay counter.  The idea is if we're unthrottling the blkg we
	 * are ok with whatever is happening now, and we can take away more of
	 * the accumulated delay as we've already throttled enough that
	 * everybody is happy with their IO latencies.
	 */
	if (time_before64(old + NSEC_PER_SEC, now) &&
	    atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
		u64 cur = atomic64_read(&blkg->delay_nsec);
		u64 sub = min_t(u64, blkg->last_delay, now - old);
		int cur_use = atomic_read(&blkg->use_delay);

		/*
		 * We've been unthrottled, subtract a larger chunk of our
		 * accumulated delay.
		 */
		if (cur_use < blkg->last_use)
			sub = max_t(u64, sub, blkg->last_delay >> 1);

		/*
		 * This shouldn't happen, but handle it anyway.  Our delay_nsec
		 * should only ever be growing except here where we subtract out
		 * min(last_delay, 1 second), but lord knows bugs happen and I'd
		 * rather not end up with negative numbers.
		 */
		if (unlikely(cur < sub)) {
			atomic64_set(&blkg->delay_nsec, 0);
			blkg->last_delay = 0;
		} else {
			atomic64_sub(sub, &blkg->delay_nsec);
			blkg->last_delay = cur - sub;
		}
		blkg->last_use = cur_use;
	}
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
	unsigned long pflags;
	bool clamp;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
	int tok;

	while (blkg->parent) {
		int use_delay = atomic_read(&blkg->use_delay);

		if (use_delay) {
			u64 this_delay;

			blkcg_scale_delay(blkg, now);
			this_delay = atomic64_read(&blkg->delay_nsec);
			if (this_delay > delay_nsec) {
				delay_nsec = this_delay;
				clamp = use_delay > 0;
			}
		}
		blkg = blkg->parent;
	}

	if (!delay_nsec)
		return;

	/*
	 * Let's not sleep for all eternity if we've amassed a huge delay.
	 * Swapping or metadata IO can accumulate 10's of seconds worth of
	 * delay, and we want userspace to be able to do _something_ so cap the
	 * delays at 0.25s.  If there's 10's of seconds worth of delay then the
	 * tasks will be delayed for 0.25 second for every syscall.  If
	 * blkcg_set_delay() was used as indicated by negative use_delay, the
	 * caller is responsible for regulating the range.
	 */
	if (clamp)
		delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

	if (use_memdelay)
		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
	do {
		__set_current_state(TASK_KILLABLE);
		if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
			break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);

	if (use_memdelay)
		psi_memstall_leave(&pflags);
}
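
/*
 * Illustrative sketch (not taken verbatim from any in-tree policy) of how an
 * IO controller feeds this machinery: once it decides the current task's
 * cgroup should be slowed down by "lat_ns" nanoseconds, it charges the blkg
 * and arms the return-to-userspace hook:
 *
 *	blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), lat_ns);
 *	blkcg_schedule_throttle(blkg->q, use_memdelay);
 *
 * The task then sleeps in blkcg_maybe_throttle_current() on its way back to
 * user space rather than inside the IO submission path.
 */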

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_queue is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
	struct request_queue *q = current->throttle_queue;
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool use_memdelay = current->use_memdelay;

	if (!q)
		return;

	current->throttle_queue = NULL;
	current->use_memdelay = false;

	rcu_read_lock();
	blkcg = css_to_blkcg(blkcg_css());
	if (!blkcg)
		goto out;
	blkg = blkg_lookup(blkcg, q);
	if (!blkg)
		goto out;
	if (!blkg_tryget(blkg))
		goto out;
	rcu_read_unlock();

	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
	blkg_put(blkg);
	blk_put_queue(q);
	return;
out:
	rcu_read_unlock();
	blk_put_queue(q);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @q: the request queue IO was submitted on
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a request_queue at that point.  This sets
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	if (unlikely(current->flags & PF_KTHREAD))
		return;

	if (current->throttle_queue != q) {
		if (!blk_get_queue(q))
			return;

		if (current->throttle_queue)
			blk_put_queue(current->throttle_queue);
		current->throttle_queue = q;
	}

	if (use_memdelay)
		current->use_memdelay = use_memdelay;
	set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
	if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
		return;
	blkcg_scale_delay(blkg, now);
	atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
						   struct cgroup_subsys_state *css)
{
	struct blkcg_gq *blkg, *ret_blkg = NULL;

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css),
				  bdev_get_queue(bio->bi_bdev));
	while (blkg) {
		if (blkg_tryget(blkg)) {
			ret_blkg = blkg;
			break;
		}
		blkg = blkg->parent;
	}
	rcu_read_unlock();

	return ret_blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between @blkg
 * and q->root_blkg.  This situation only happens when a cgroup is dying and
 * then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
				 struct cgroup_subsys_state *css)
{
	if (bio->bi_blkg)
		blkg_put(bio->bi_blkg);

	if (css && css->parent) {
		bio->bi_blkg = blkg_tryget_closest(bio, css);
	} else {
		blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
		bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, blkg_lookup_create() creates the blkg.  If a blkg is
 * already associated, the css is reused and association redone as the
 * request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();

	if (bio->bi_blkg)
		css = bio_blkcg_css(bio);
	else
		css = blkcg_css();

	bio_associate_blkg_from_css(bio, css);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
	if (src->bi_blkg) {
		if (dst->bi_blkg)
			blkg_put(dst->bi_blkg);
		blkg_get(src->bi_blkg);
		dst->bi_blkg = src->bi_blkg;
	}
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

static int blk_cgroup_io_type(struct bio *bio)
{
	if (op_is_discard(bio->bi_opf))
		return BLKG_IOSTAT_DISCARD;
	if (op_is_write(bio->bi_opf))
		return BLKG_IOSTAT_WRITE;
	return BLKG_IOSTAT_READ;
}

void blk_cgroup_bio_start(struct bio *bio)
{
	int rwd = blk_cgroup_io_type(bio), cpu;
	struct blkg_iostat_set *bis;
	unsigned long flags;

	cpu = get_cpu();
	bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
	flags = u64_stats_update_begin_irqsave(&bis->sync);

	/*
	 * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
	 * bio and we would have already accounted for the size of the bio.
	 */
	if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
		bio_set_flag(bio, BIO_CGROUP_ACCT);
		bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
	}
	bis->cur.ios[rwd]++;

	u64_stats_update_end_irqrestore(&bis->sync, flags);
	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
		cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
	put_cpu();
}

bool blk_cgroup_congested(void)
{
	struct cgroup_subsys_state *css;
	bool ret = false;

	rcu_read_lock();
	for (css = blkcg_css(); css; css = css->parent) {
		if (atomic_read(&css->cgroup->congestion_count)) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

static int __init blkcg_init(void)
{
	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
					    WQ_UNBOUND | WQ_SYSFS, 0);
	if (!blkcg_punt_bio_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");