/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
EXPORT_SYMBOL_GPL(blkcg_root);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint);

/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_cgrp: cgroup used as the iteration cursor
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip a
 * subtree.
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp),  \
					      (p_blkg)->q, false)))

static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
{
	return pol && test_bit(pol->plid, q->blkcg_pols);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd = blkg->pd[i];

		if (!pd)
			continue;

		if (pol && pol->pd_exit_fn)
			pol->pd_exit_fn(blkg);

		kfree(pd);
	}

	blk_exit_rl(&blkg->rl);
	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @q.
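 *
 * Returns the new blkg on success, %NULL on failure.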
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
				   gfp_t gfp_mask)
{
	struct blkcg_gq *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;

	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
	if (blkcg != &blkcg_root) {
		if (blk_init_rl(&blkg->rl, q, gfp_mask))
			goto err_free;
		blkg->rl.blkg = blkg;
	}

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];
		struct blkg_policy_data *pd;

		if (!blkcg_policy_enabled(q, pol))
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
		if (!pd)
			goto err_free;

		blkg->pd[i] = pd;
		pd->blkg = blkg;
		pd->plid = i;

		/* invoke per-policy init */
		if (pol->pd_init_fn)
			pol->pd_init_fn(blkg);
	}

	return blkg;

err_free:
	blkg_free(blkg);
	return NULL;
}

/**
 * __blkg_lookup - internal version of blkg_lookup()
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 * @update_hint: whether to update the lookup hint with the result or not
 *
 * This is the internal version and shouldn't be used by policy
 * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
 * @q's bypass state.  If @update_hint is %true, the caller should be
 * holding @q->queue_lock and the lookup hint is updated on success.
 */
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
				      struct request_queue *q, bool update_hint)
{
	struct blkcg_gq *blkg;

	blkg = rcu_dereference(blkcg->blkg_hint);
	if (blkg && blkg->q == q)
		return blkg;

	/*
	 * Hint didn't match.  Look up from the radix tree.  Note that the
	 * hint can only be updated under queue_lock as otherwise @blkg
	 * could have already been removed from blkg_tree.  The caller is
	 * responsible for grabbing queue_lock if @update_hint.
	 */
	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
	if (blkg && blkg->q == q) {
		if (update_hint) {
			lockdep_assert_held(q->queue_lock);
			rcu_assign_pointer(blkcg->blkg_hint, blkg);
		}
		return blkg;
	}

	return NULL;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  This function should be called
 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 * - see blk_queue_bypass_start() for details.
 */
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (unlikely(blk_queue_bypass(q)))
		return NULL;
	return __blkg_lookup(blkcg, q, false);
}
EXPORT_SYMBOL_GPL(blkg_lookup);

/*
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
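 *
 * Returns the new blkg on success, an ERR_PTR() value on failure.  Must be
 * called under the RCU read lock and @q->queue_lock.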
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
				    struct request_queue *q,
				    struct blkcg_gq *new_blkg)
{
	struct blkcg_gq *blkg;
	int i, ret;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css)) {
		ret = -EINVAL;
		goto err_free_blkg;
	}

	/* allocate */
	if (!new_blkg) {
		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
		if (unlikely(!new_blkg)) {
			ret = -ENOMEM;
			goto err_put_css;
		}
	}
	blkg = new_blkg;

	/* link parent and insert */
	if (blkcg_parent(blkcg)) {
		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
		if (WARN_ON_ONCE(!blkg->parent)) {
			blkg = ERR_PTR(-EINVAL);
			goto err_put_css;
		}
		blkg_get(blkg->parent);
	}

	spin_lock(&blkcg->lock);
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	if (likely(!ret)) {
		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
		list_add(&blkg->q_node, &q->blkg_list);

		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkg->pd[i] && pol->pd_online_fn)
				pol->pd_online_fn(blkg);
		}
	}
	blkg->online = true;
	spin_unlock(&blkcg->lock);

	if (!ret)
		return blkg;

	/* @blkg failed to be fully initialized, use the usual release path */
	blkg_put(blkg);
	return ERR_PTR(ret);

err_put_css:
	css_put(&blkcg->css);
err_free_blkg:
	blkg_free(new_blkg);
	return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkgs have access to the parent blkg.  This function
 * should be called under RCU read lock and @q->queue_lock.
 *
 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
 * dead and bypassing, returns ERR_PTR(-EBUSY).
 */
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
				    struct request_queue *q)
{
	struct blkcg_gq *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)))
		return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);

	blkg = __blkg_lookup(blkcg, q, true);
	if (blkg)
		return blkg;

	/*
	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
	 * non-root blkgs have access to their parents.
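	 * Each iteration creates the blkg for the highest ancestor that
	 * doesn't have one yet, so the loop ends once the blkg for @blkcg
	 * itself has been created or blkg_create() fails.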
	 */
	while (true) {
		struct blkcg *pos = blkcg;
		struct blkcg *parent = blkcg_parent(blkcg);

		while (parent && !__blkg_lookup(parent, q, false)) {
			pos = parent;
			parent = blkcg_parent(parent);
		}

		blkg = blkg_create(pos, q, NULL);
		if (pos == blkcg || IS_ERR(blkg))
			return blkg;
	}
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);

static void blkg_destroy(struct blkcg_gq *blkg)
{
	struct blkcg *blkcg = blkg->blkcg;
	int i;

	lockdep_assert_held(blkg->q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something is wrong if we are trying to remove the same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		struct blkcg_policy *pol = blkcg_policy[i];

		if (blkg->pd[i] && pol->pd_offline_fn)
			pol->pd_offline_fn(blkg);
	}
	blkg->online = false;

	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	/*
	 * Both setting the lookup hint to and clearing it from @blkg are
	 * done under queue_lock.  If it's not pointing to @blkg now, it
	 * never will.  Hint assignment itself can race safely.
	 */
	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
		rcu_assign_pointer(blkcg->blkg_hint, NULL);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, the group can be destroyed.
	 */
	blkg_put(blkg);
}

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 *
 * Destroy all blkgs associated with @q.
 */
static void blkg_destroy_all(struct request_queue *q)
{
	struct blkcg_gq *blkg, *n;

	lockdep_assert_held(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkcg *blkcg = blkg->blkcg;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	/*
	 * The root blkg has been destroyed.  Just clear the pointers since
	 * root_rl does not take a reference on the root blkg.
	 */
	q->root_blkg = NULL;
	q->root_rl.blkg = NULL;
}

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
}

void __blkg_release(struct blkcg_gq *blkg)
{
	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
		blkg_put(blkg->parent);

	/*
	 * A group is freed in an RCU manner.  But having an RCU lock does
	 * not mean that one can access all the fields of blkg and assume
	 * these are valid.  For example, don't try to follow throtl_data
	 * and request queue links.
	 *
	 * Having a reference to blkg under an RCU read lock allows access
	 * only to values local to groups, like group stats and group rate
	 * limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);

/*
 * The "next" function used by blk_queue_for_each_rl().  It's a bit tricky
 * because the root blkg uses @q->root_rl instead of its own rl.
 */
struct request_list *__blk_queue_next_rl(struct request_list *rl,
					 struct request_queue *q)
{
	struct list_head *ent;
	struct blkcg_gq *blkg;

	/*
	 * Determine the current blkg list_head.
	 * The first entry is root_rl which is off @q->blkg_list and mapped
	 * to the head.
	 */
	if (rl == &q->root_rl) {
		ent = &q->blkg_list;
		/* There are no more block groups, hence no request lists */
		if (list_empty(ent))
			return NULL;
	} else {
		blkg = container_of(rl, struct blkcg_gq, rl);
		ent = &blkg->q_node;
	}

	/* walk to the next list_head, skip root blkcg */
	ent = ent->next;
	if (ent == &q->root_blkg->q_node)
		ent = ent->next;
	if (ent == &q->blkg_list)
		return NULL;

	blkg = container_of(ent, struct blkcg_gq, q_node);
	return &blkg->rl;
}

static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
			     u64 val)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
	struct blkcg_gq *blkg;
	int i;

	mutex_lock(&blkcg_pol_mutex);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates.  This is a debug feature which shouldn't exist
	 * anyway.  If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		for (i = 0; i < BLKCG_MAX_POLS; i++) {
			struct blkcg_policy *pol = blkcg_policy[i];

			if (blkcg_policy_enabled(blkg->q, pol) &&
			    pol->pd_reset_stats_fn)
				pol->pd_reset_stats_fn(blkg);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	mutex_unlock(&blkcg_pol_mutex);
	return 0;
}

static const char *blkg_dev_name(struct blkcg_gq *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: whether to print the sum of the @prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data, with the matching queue lock held.  If
 * @show_total is %true, the sum of the return values from @prfill is
 * printed with a "Total" label at the end.
 *
 * This is to be used to construct print functions for the
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       const struct blkcg_policy *pol, int data,
		       bool show_total)
{
	struct blkcg_gq *blkg;
	u64 total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
		spin_lock_irq(blkg->q->queue_lock);
		if (blkcg_policy_enabled(blkg->q, pol))
			total += prfill(sf, blkg->pd[pol->plid], data);
		spin_unlock_irq(blkg->q->queue_lock);
	}
	rcu_read_unlock();

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
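 *
 * Returns @v.  If @pd has no associated device, nothing is printed and 0
 * is returned.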
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);

/**
 * blkg_prfill_stat - prfill callback for blkg_stat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * prfill callback for printing a blkg_stat.
 */
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
}
EXPORT_SYMBOL_GPL(blkg_prfill_stat);

/**
 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * prfill callback for printing a blkg_rwstat.
 */
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
		       int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);

/**
 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
 * @pd: policy private data of interest
 * @off: offset to the blkg_stat in @pd
 *
 * Collect the blkg_stat specified by @off from @pd and all its online
 * descendants and return the sum.  The caller must be holding the queue
 * lock for online tests.
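 *
 * A typical caller passes the offset of a blkg_stat embedded in its own
 * policy data, e.g. (sketch with a hypothetical policy data struct):
 *
 *	sum = blkg_stat_recursive_sum(pd, offsetof(struct my_pd, my_stat));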
 */
u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
{
	struct blkcg_policy *pol = blkcg_policy[pd->plid];
	struct blkcg_gq *pos_blkg;
	struct cgroup *pos_cgrp;
	u64 sum;

	lockdep_assert_held(pd->blkg->q->queue_lock);

	sum = blkg_stat_read((void *)pd + off);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
		struct blkg_stat *stat = (void *)pos_pd + off;

		if (pos_blkg->online)
			sum += blkg_stat_read(stat);
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);

/**
 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
 * @pd: policy private data of interest
 * @off: offset to the blkg_rwstat in @pd
 *
 * Collect the blkg_rwstat specified by @off from @pd and all its online
 * descendants and return the sum.  The caller must be holding the queue
 * lock for online tests.
 */
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
					     int off)
{
	struct blkcg_policy *pol = blkcg_policy[pd->plid];
	struct blkcg_gq *pos_blkg;
	struct cgroup *pos_cgrp;
	struct blkg_rwstat sum;
	int i;

	lockdep_assert_held(pd->blkg->q->queue_lock);

	sum = blkg_rwstat_read((void *)pd + off);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
		struct blkg_rwstat tmp;

		if (!pos_blkg->online)
			continue;

		tmp = blkg_rwstat_read(rwstat);

		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			sum.cnt[i] += tmp.cnt[i];
	}
	rcu_read_unlock();

	return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse a per-blkg config update from @input and initialize @ctx with the
 * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value.  This function returns with the RCU read lock and queue lock held
 * and must be paired with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
		   const char *input, struct blkg_conf_ctx *ctx)
	__acquires(rcu) __acquires(disk->queue->queue_lock)
{
	struct gendisk *disk;
	struct blkcg_gq *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();
	spin_lock_irq(disk->queue->queue_lock);

	if (blkcg_policy_enabled(disk->queue, pol))
		blkg = blkg_lookup_create(blkcg, disk->queue);
	else
		blkg = ERR_PTR(-EINVAL);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		spin_unlock_irq(disk->queue->queue_lock);
		put_disk(disk);
		/*
		 * If the queue was bypassing, we should retry.  Do so after
		 * a short msleep().  It isn't strictly necessary but the
		 * queue can be bypassing for some time and it's always nice
		 * to avoid busy looping.
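		 * restart_syscall() below makes the write be retried from
		 * the top in that case.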
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after a per-blkg config update.  This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
{
	spin_unlock_irq(ctx->disk->queue->queue_lock);
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

struct cftype blkcg_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkcg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and is
 * responsible for shooting down all blkgs associated with @cgroup.  blkgs
 * should be removed while holding both q and blkcg locks.  As the blkcg
 * lock is nested inside the q lock, this function performs reverse double
 * lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_css_offline(struct cgroup *cgroup)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
						    struct blkcg_gq, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
}

static void blkcg_css_free(struct cgroup *cgroup)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);

	if (blkcg != &blkcg_root)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkcg *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkcg_root;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node().  Responsible for initializing the
 * blkcg part of the new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	might_sleep();

	return blk_throtl_init(q);
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue().  Responsible for draining the blkcg part.
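 *
 * The caller must be holding @q->queue_lock.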
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue().  Responsible for exiting the blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkg_destroy_all(q);
	spin_unlock_irq(q->queue_lock);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures.  For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.css_alloc = blkcg_css_alloc,
	.css_offline = blkcg_css_offline,
	.css_free = blkcg_css_free,
	.can_attach = blkcg_can_attach,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkcg_files,
	.module = THIS_MODULE,

	/*
	 * blkio subsystem is utterly broken in terms of hierarchy support.
	 * It treats all cgroups equally regardless of where they're
	 * located in the hierarchy - all cgroups are treated as if they're
	 * right below the root.  Fix it and remove the following.
	 */
	.broken_hierarchy = true,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @q bypassed, so nobody would be accessing blkgs
 * from the IO path.  Update of each blkg is protected by both queue and
 * blkcg locks so that holding either lock and testing blkcg_policy_enabled()
 * is always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct request_queue *q,
			  const struct blkcg_policy *pol)
{
	LIST_HEAD(pds);
	struct blkcg_gq *blkg, *new_blkg;
	struct blkg_policy_data *pd, *n;
	int cnt = 0, ret;
	bool preloaded;

	if (blkcg_policy_enabled(q, pol))
		return 0;

	/* preallocations for root blkg */
	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
	if (!new_blkg)
		return -ENOMEM;

	blk_queue_bypass_start(q);

	preloaded = !radix_tree_preload(GFP_KERNEL);

	/*
	 * Make sure the root blkg exists and count the existing blkgs.  As
	 * @q is bypassing at this point, blkg_lookup_create() can't be
	 * used.  Open code it.
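	 * The radix tree was preloaded above so that the insertion done by
	 * blkg_create() doesn't need to allocate while holding the queue
	 * lock.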
	 */
	spin_lock_irq(q->queue_lock);

	rcu_read_lock();
	blkg = __blkg_lookup(&blkcg_root, q, false);
	if (blkg)
		blkg_free(new_blkg);
	else
		blkg = blkg_create(&blkcg_root, q, new_blkg);
	rcu_read_unlock();

	if (preloaded)
		radix_tree_preload_end();

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		goto out_unlock;
	}
	q->root_blkg = blkg;
	q->root_rl.blkg = blkg;

	list_for_each_entry(blkg, &q->blkg_list, q_node)
		cnt++;

	spin_unlock_irq(q->queue_lock);

	/* allocate policy_data for all existing blkgs */
	while (cnt--) {
		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
		if (!pd) {
			ret = -ENOMEM;
			goto out_free;
		}
		list_add_tail(&pd->alloc_node, &pds);
	}

	/*
	 * Install the allocated pds.  With @q bypassing, no new blkg
	 * should have been created while the queue lock was dropped.
	 */
	spin_lock_irq(q->queue_lock);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		if (WARN_ON(list_empty(&pds))) {
			/* umm... this shouldn't happen, just abort */
			ret = -ENOMEM;
			goto out_unlock;
		}
		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
		list_del_init(&pd->alloc_node);

		/* grab blkcg lock too while installing @pd on @blkg */
		spin_lock(&blkg->blkcg->lock);

		blkg->pd[pol->plid] = pd;
		pd->blkg = blkg;
		pd->plid = pol->plid;
		pol->pd_init_fn(blkg);

		spin_unlock(&blkg->blkcg->lock);
	}

	__set_bit(pol->plid, q->blkcg_pols);
	ret = 0;
out_unlock:
	spin_unlock_irq(q->queue_lock);
out_free:
	blk_queue_bypass_end(q);
	list_for_each_entry_safe(pd, n, &pds, alloc_node)
		kfree(pd);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 * @q: request_queue of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @q.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct request_queue *q,
			     const struct blkcg_policy *pol)
{
	struct blkcg_gq *blkg;

	if (!blkcg_policy_enabled(q, pol))
		return;

	blk_queue_bypass_start(q);
	spin_lock_irq(q->queue_lock);

	__clear_bit(pol->plid, q->blkcg_pols);

	/* if no policy is left, no need for blkgs - shoot them down */
	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
		blkg_destroy_all(q);

	list_for_each_entry(blkg, &q->blkg_list, q_node) {
		/* grab blkcg lock too while removing @pd from @blkg */
		spin_lock(&blkg->blkcg->lock);

		if (pol->pd_offline_fn)
			pol->pd_offline_fn(blkg);
		if (pol->pd_exit_fn)
			pol->pd_exit_fn(blkg);

		kfree(blkg->pd[pol->plid]);
		blkg->pd[pol->plid] = NULL;

		spin_unlock(&blkg->blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
	blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
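 *
 * A policy typically registers itself from its module init, e.g. (sketch
 * with hypothetical policy and callback names):
 *
 *	static struct blkcg_policy my_policy = {
 *		.pd_size	= sizeof(struct my_pd),
 *		.cftypes	= my_files,
 *		.pd_init_fn	= my_pd_init_fn,
 *		.pd_exit_fn	= my_pd_exit_fn,
 *	};
 *
 *	ret = blkcg_policy_register(&my_policy);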
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
	int i, ret;

	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
		return -EINVAL;

	mutex_lock(&blkcg_pol_mutex);

	/* find an empty slot */
	ret = -ENOSPC;
	for (i = 0; i < BLKCG_MAX_POLS; i++)
		if (!blkcg_policy[i])
			break;
	if (i >= BLKCG_MAX_POLS)
		goto out_unlock;

	/* register and update blkgs */
	pol->plid = i;
	blkcg_policy[i] = pol;

	/* everything is in place, add intf files for the new policy */
	if (pol->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
	ret = 0;
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
	mutex_lock(&blkcg_pol_mutex);

	if (WARN_ON(blkcg_policy[pol->plid] != pol))
		goto out_unlock;

	/* kill the intf files first */
	if (pol->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);

	/* unregister and update blkgs */
	blkcg_policy[pol->plid] = NULL;
out_unlock:
	mutex_unlock(&blkcg_pol_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);