// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

/* Name returned by bdi_dev_name() when a bdi has no registered device */
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
/* Last id handed out by bdi_register_va(); incremented under bdi_lock */
static u64 bdi_id_cursor;
/* Registered bdis keyed by bdi->id, see bdi_lookup_rb_node() */
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

/*
 * Snapshot of writeback state shown through debugfs.  The list counters
 * (nr_dirty, nr_io, nr_more_io, nr_dirty_time) are numbers of inodes; the
 * remaining fields are passed through K() before being printed with a kB
 * unit.  dirty_thresh is an input set by the caller, everything else is
 * accumulated by collect_wb_stats().
 */
struct wb_stats {
	unsigned long nr_dirty;
	unsigned long nr_io;
	unsigned long nr_more_io;
	unsigned long nr_dirty_time;
	unsigned long nr_writeback;
	unsigned long nr_reclaimable;
	unsigned long nr_dirtied;
	unsigned long nr_written;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
};

/* Top-level "bdi" debugfs directory; per-bdi directories are created below it */
static struct dentry *bdi_debug_root;

/* Create the top-level "bdi" debugfs directory. */
static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

/*
 * Add the state of @wb to @stats.
 *
 * The inodes on the b_dirty, b_io, b_more_io and b_dirty_time lists are
 * counted under wb->list_lock (b_dirty_time entries only when I_DIRTY_TIME
 * is set), then the wb counters and the threshold derived from
 * stats->dirty_thresh are added.  Every field is accumulated rather than
 * assigned, so the caller must initialise @stats and may call this for
 * several wbs in a row.
 */
static void collect_wb_stats(struct wb_stats *stats,
			     struct bdi_writeback *wb)
{
	struct inode *inode;

	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		stats->nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		stats->nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		stats->nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			stats->nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
	stats->nr_written += wb_stat(wb, WB_WRITTEN);
	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Sum the stats of every wb on bdi->wb_list into @stats.  The list is walked
 * under rcu_read_lock(); a wb whose reference cannot be taken with
 * wb_tryget() is skipped.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(stats, wb);
		wb_put(wb);
	}
	rcu_read_unlock();
}
#else
/* Without cgroup writeback only the embedded bdi->wb is collected. */
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	collect_wb_stats(stats, &bdi->wb);
}
#endif

/*
 * seq_file show handler for the per-bdi "stats" debugfs file: prints the
 * stats summed over the bdi's wbs together with the global dirty and
 * background thresholds, the bdi's total write bandwidth, whether the bdi is
 * linked on bdi_list, and the state word of the embedded wb.
 */
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct wb_stats stats;
	unsigned long tot_bw;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	/* collect_wb_stats() only accumulates, so start from zero */
	memset(&stats, 0, sizeof(stats));
	stats.dirty_thresh = dirty_thresh;
	bdi_collect_stats(bdi, &stats);
	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

	seq_printf(m,
		   "BdiWriteback: %10lu kB\n"
		   "BdiReclaimable: %10lu kB\n"
		   "BdiDirtyThresh: %10lu kB\n"
		   "DirtyThresh: %10lu kB\n"
		   "BackgroundThresh: %10lu kB\n"
		   "BdiDirtied: %10lu kB\n"
		   "BdiWritten: %10lu kB\n"
		   "BdiWriteBandwidth: %10lu kBps\n"
		   "b_dirty: %10lu\n"
		   "b_io: %10lu\n"
		   "b_more_io: %10lu\n"
		   "b_dirty_time: %10lu\n"
		   "bdi_list: %10u\n"
		   "state: %10lx\n",
		   K(stats.nr_writeback),
		   K(stats.nr_reclaimable),
		   K(stats.wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   K(stats.nr_dirtied),
		   K(stats.nr_written),
		   K(tot_bw),
		   stats.nr_dirty,
		   stats.nr_io,
		   stats.nr_more_io,
		   stats.nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

/*
 * Print one record of the "wb_stats" debugfs file for @wb from the already
 * collected @stats.  The first field is the inode number of the wb's memcg
 * cgroup, or the constant 1 when cgroup writeback is compiled out.
 */
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
			  struct wb_stats *stats)
{

	seq_printf(m,
		   "WbCgIno: %10lu\n"
		   "WbWriteback: %10lu kB\n"
		   "WbReclaimable: %10lu kB\n"
		   "WbDirtyThresh: %10lu kB\n"
		   "WbDirtied: %10lu kB\n"
		   "WbWritten: %10lu kB\n"
		   "WbWriteBandwidth: %10lu kBps\n"
		   "b_dirty: %10lu\n"
		   "b_io: %10lu\n"
		   "b_more_io: %10lu\n"
		   "b_dirty_time: %10lu\n"
		   "state: %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
		   cgroup_ino(wb->memcg_css->cgroup),
#else
		   1ul,
#endif
		   K(stats->nr_writeback),
		   K(stats->nr_reclaimable),
		   K(stats->wb_thresh),
		   K(stats->nr_dirtied),
		   K(stats->nr_written),
		   K(wb->avg_write_bandwidth),
		   stats->nr_dirty,
		   stats->nr_io,
		   stats->nr_more_io,
		   stats->nr_dirty_time,
		   wb->state);
}

/*
 * seq_file show handler for the per-bdi "wb_stats" debugfs file: emits one
 * wb_stats_show() record for every wb on bdi->wb_list on which a reference
 * could be taken.
 */
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct bdi_writeback *wb;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		/* fresh, zeroed stats per wb: each record is independent */
		struct wb_stats stats = { .dirty_thresh = dirty_thresh };

		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(&stats, wb);

		/*
		 * Calculate thresh of wb in writeback cgroup which is min of
		 * thresh in global domain and thresh in cgroup domain. Drop
		 * rcu lock because cgwb_calc_thresh may sleep in
		 * cgroup_rstat_flush. We can do so here because we have a ref.
		 */
		if (mem_cgroup_wb_domain(wb)) {
			rcu_read_unlock();
			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
			rcu_read_lock();
		}

		wb_stats_show(m, wb, &stats);

		wb_put(wb);
	}
	rcu_read_unlock();

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

/*
 * Create the debugfs directory @name below bdi_debug_root and populate it
 * with the read-only (0444) "stats" and "wb_stats" files, both of which get
 * @bdi as their private data.
 */
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
			    &cgwb_debug_stats_fops);
}

/* Remove the bdi's debugfs directory together with the files inside it. */
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else /* CONFIG_DEBUG_FS */
/* debugfs support compiled out: the hooks are no-ops */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif /* CONFIG_DEBUG_FS */

/*
 * sysfs store for read_ahead_kb: parse a decimal value given in kB and save
 * it in bdi->ra_pages after shifting it right by (PAGE_SHIFT - 10), i.e.
 * converting kB to pages.  Returns @count, or the kstrtoul() error.
 */
static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

/*
 * BDI_SHOW(name, expr) - generate name##_show(), which prints @expr (cast to
 * long long, evaluated with a local "bdi" in scope) followed by a newline,
 * and declare the read-write device attribute for @name.  Every use below
 * directly follows the definition of the matching name##_store() handler.
 */
#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

/*
 * sysfs store for min_ratio: parse an unsigned decimal and hand it to
 * bdi_set_min_ratio().  Returns @count on success, otherwise the parse error
 * or the error returned by the setter.
 */
static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
/* shown divided by BDI_RATIO_SCALE; min_ratio_fine shows the raw value */
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store for min_ratio_fine: like min_ratio_store() but the value is
 * passed to bdi_set_min_ratio_no_scale().
 */
static ssize_t min_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

/*
 * sysfs store for max_ratio: parse an unsigned decimal and hand it to
 * bdi_set_max_ratio().  Returns @count on success, otherwise the parse error
 * or the error returned by the setter.
 */
static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
/* shown divided by BDI_RATIO_SCALE; max_ratio_fine shows the raw value */
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store for max_ratio_fine: like max_ratio_store() but the value is
 * passed to bdi_set_max_ratio_no_scale().
 */
static ssize_t max_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

/* sysfs show for min_bytes: prints the value of bdi_get_min_bytes(). */
static ssize_t min_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

static ssize_t min_bytes_store(struct device *dev, 375 struct device_attribute *attr, const char *buf, size_t count) 376 { 377 struct backing_dev_info *bdi = dev_get_drvdata(dev); 378 u64 bytes; 379 ssize_t ret; 380 381 ret = kstrtoull(buf, 10, &bytes); 382 if (ret < 0) 383 return ret; 384 385 ret = bdi_set_min_bytes(bdi, bytes); 386 if (!ret) 387 ret = count; 388 389 return ret; 390 } 391 static DEVICE_ATTR_RW(min_bytes); 392 393 static ssize_t max_bytes_show(struct device *dev, 394 struct device_attribute *attr, 395 char *buf) 396 { 397 struct backing_dev_info *bdi = dev_get_drvdata(dev); 398 399 return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); 400 } 401 402 static ssize_t max_bytes_store(struct device *dev, 403 struct device_attribute *attr, const char *buf, size_t count) 404 { 405 struct backing_dev_info *bdi = dev_get_drvdata(dev); 406 u64 bytes; 407 ssize_t ret; 408 409 ret = kstrtoull(buf, 10, &bytes); 410 if (ret < 0) 411 return ret; 412 413 ret = bdi_set_max_bytes(bdi, bytes); 414 if (!ret) 415 ret = count; 416 417 return ret; 418 } 419 static DEVICE_ATTR_RW(max_bytes); 420 421 static ssize_t stable_pages_required_show(struct device *dev, 422 struct device_attribute *attr, 423 char *buf) 424 { 425 dev_warn_once(dev, 426 "the stable_pages_required attribute has been removed. 
Use the stable_writes queue attribute instead.\n"); 427 return sysfs_emit(buf, "%d\n", 0); 428 } 429 static DEVICE_ATTR_RO(stable_pages_required); 430 431 static ssize_t strict_limit_store(struct device *dev, 432 struct device_attribute *attr, const char *buf, size_t count) 433 { 434 struct backing_dev_info *bdi = dev_get_drvdata(dev); 435 unsigned int strict_limit; 436 ssize_t ret; 437 438 ret = kstrtouint(buf, 10, &strict_limit); 439 if (ret < 0) 440 return ret; 441 442 ret = bdi_set_strict_limit(bdi, strict_limit); 443 if (!ret) 444 ret = count; 445 446 return ret; 447 } 448 449 static ssize_t strict_limit_show(struct device *dev, 450 struct device_attribute *attr, char *buf) 451 { 452 struct backing_dev_info *bdi = dev_get_drvdata(dev); 453 454 return sysfs_emit(buf, "%d\n", 455 !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); 456 } 457 static DEVICE_ATTR_RW(strict_limit); 458 459 static struct attribute *bdi_dev_attrs[] = { 460 &dev_attr_read_ahead_kb.attr, 461 &dev_attr_min_ratio.attr, 462 &dev_attr_min_ratio_fine.attr, 463 &dev_attr_max_ratio.attr, 464 &dev_attr_max_ratio_fine.attr, 465 &dev_attr_min_bytes.attr, 466 &dev_attr_max_bytes.attr, 467 &dev_attr_stable_pages_required.attr, 468 &dev_attr_strict_limit.attr, 469 NULL, 470 }; 471 ATTRIBUTE_GROUPS(bdi_dev); 472 473 static const struct class bdi_class = { 474 .name = "bdi", 475 .dev_groups = bdi_dev_groups, 476 }; 477 478 static __init int bdi_class_init(void) 479 { 480 int ret; 481 482 ret = class_register(&bdi_class); 483 if (ret) 484 return ret; 485 486 bdi_debug_init(); 487 488 return 0; 489 } 490 postcore_initcall(bdi_class_init); 491 492 static int __init default_bdi_init(void) 493 { 494 bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | 495 WQ_SYSFS, 0); 496 if (!bdi_wq) 497 return -ENOMEM; 498 return 0; 499 } 500 subsys_initcall(default_bdi_init); 501 502 static void wb_update_bandwidth_workfn(struct work_struct *work) 503 { 504 struct bdi_writeback *wb = 
container_of(to_delayed_work(work), 505 struct bdi_writeback, bw_dwork); 506 507 wb_update_bandwidth(wb); 508 } 509 510 /* 511 * Initial write bandwidth: 100 MB/s 512 */ 513 #define INIT_BW (100 << (20 - PAGE_SHIFT)) 514 515 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, 516 gfp_t gfp) 517 { 518 int err; 519 520 memset(wb, 0, sizeof(*wb)); 521 522 wb->bdi = bdi; 523 wb->last_old_flush = jiffies; 524 INIT_LIST_HEAD(&wb->b_dirty); 525 INIT_LIST_HEAD(&wb->b_io); 526 INIT_LIST_HEAD(&wb->b_more_io); 527 INIT_LIST_HEAD(&wb->b_dirty_time); 528 spin_lock_init(&wb->list_lock); 529 530 atomic_set(&wb->writeback_inodes, 0); 531 wb->bw_time_stamp = jiffies; 532 wb->balanced_dirty_ratelimit = INIT_BW; 533 wb->dirty_ratelimit = INIT_BW; 534 wb->write_bandwidth = INIT_BW; 535 wb->avg_write_bandwidth = INIT_BW; 536 537 spin_lock_init(&wb->work_lock); 538 INIT_LIST_HEAD(&wb->work_list); 539 INIT_DELAYED_WORK(&wb->dwork, wb_workfn); 540 INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); 541 542 err = fprop_local_init_percpu(&wb->completions, gfp); 543 if (err) 544 return err; 545 546 err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); 547 if (err) 548 fprop_local_destroy_percpu(&wb->completions); 549 550 return err; 551 } 552 553 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); 554 555 /* 556 * Remove bdi from the global list and shutdown any threads we have running 557 */ 558 static void wb_shutdown(struct bdi_writeback *wb) 559 { 560 /* Make sure nobody queues further work */ 561 spin_lock_irq(&wb->work_lock); 562 if (!test_and_clear_bit(WB_registered, &wb->state)) { 563 spin_unlock_irq(&wb->work_lock); 564 return; 565 } 566 spin_unlock_irq(&wb->work_lock); 567 568 cgwb_remove_from_bdi_list(wb); 569 /* 570 * Drain work list and shutdown the delayed_work. !WB_registered 571 * tells wb_workfn() that @wb is dying and its work_list needs to 572 * be drained no matter what. 
573 */ 574 mod_delayed_work(bdi_wq, &wb->dwork, 0); 575 flush_delayed_work(&wb->dwork); 576 WARN_ON(!list_empty(&wb->work_list)); 577 flush_delayed_work(&wb->bw_dwork); 578 } 579 580 static void wb_exit(struct bdi_writeback *wb) 581 { 582 WARN_ON(delayed_work_pending(&wb->dwork)); 583 percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); 584 fprop_local_destroy_percpu(&wb->completions); 585 } 586 587 #ifdef CONFIG_CGROUP_WRITEBACK 588 589 #include <linux/memcontrol.h> 590 591 /* 592 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and 593 * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 594 */ 595 static DEFINE_SPINLOCK(cgwb_lock); 596 static struct workqueue_struct *cgwb_release_wq; 597 598 static LIST_HEAD(offline_cgwbs); 599 static void cleanup_offline_cgwbs_workfn(struct work_struct *work); 600 static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); 601 602 static void cgwb_free_rcu(struct rcu_head *rcu_head) 603 { 604 struct bdi_writeback *wb = container_of(rcu_head, 605 struct bdi_writeback, rcu); 606 607 percpu_ref_exit(&wb->refcnt); 608 kfree(wb); 609 } 610 611 static void cgwb_release_workfn(struct work_struct *work) 612 { 613 struct bdi_writeback *wb = container_of(work, struct bdi_writeback, 614 release_work); 615 struct backing_dev_info *bdi = wb->bdi; 616 617 mutex_lock(&wb->bdi->cgwb_release_mutex); 618 wb_shutdown(wb); 619 620 css_put(wb->memcg_css); 621 css_put(wb->blkcg_css); 622 mutex_unlock(&wb->bdi->cgwb_release_mutex); 623 624 /* triggers blkg destruction if no online users left */ 625 blkcg_unpin_online(wb->blkcg_css); 626 627 fprop_local_destroy_percpu(&wb->memcg_completions); 628 629 spin_lock_irq(&cgwb_lock); 630 list_del(&wb->offline_node); 631 spin_unlock_irq(&cgwb_lock); 632 633 wb_exit(wb); 634 bdi_put(bdi); 635 WARN_ON_ONCE(!list_empty(&wb->b_attached)); 636 call_rcu(&wb->rcu, cgwb_free_rcu); 637 } 638 639 static void cgwb_release(struct percpu_ref *refcnt) 640 { 641 struct 
bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, 642 refcnt); 643 queue_work(cgwb_release_wq, &wb->release_work); 644 } 645 646 static void cgwb_kill(struct bdi_writeback *wb) 647 { 648 lockdep_assert_held(&cgwb_lock); 649 650 WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); 651 list_del(&wb->memcg_node); 652 list_del(&wb->blkcg_node); 653 list_add(&wb->offline_node, &offline_cgwbs); 654 percpu_ref_kill(&wb->refcnt); 655 } 656 657 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) 658 { 659 spin_lock_irq(&cgwb_lock); 660 list_del_rcu(&wb->bdi_node); 661 spin_unlock_irq(&cgwb_lock); 662 } 663 664 static int cgwb_create(struct backing_dev_info *bdi, 665 struct cgroup_subsys_state *memcg_css, gfp_t gfp) 666 { 667 struct mem_cgroup *memcg; 668 struct cgroup_subsys_state *blkcg_css; 669 struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; 670 struct bdi_writeback *wb; 671 unsigned long flags; 672 int ret = 0; 673 674 memcg = mem_cgroup_from_css(memcg_css); 675 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); 676 memcg_cgwb_list = &memcg->cgwb_list; 677 blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); 678 679 /* look up again under lock and discard on blkcg mismatch */ 680 spin_lock_irqsave(&cgwb_lock, flags); 681 wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); 682 if (wb && wb->blkcg_css != blkcg_css) { 683 cgwb_kill(wb); 684 wb = NULL; 685 } 686 spin_unlock_irqrestore(&cgwb_lock, flags); 687 if (wb) 688 goto out_put; 689 690 /* need to create a new one */ 691 wb = kmalloc(sizeof(*wb), gfp); 692 if (!wb) { 693 ret = -ENOMEM; 694 goto out_put; 695 } 696 697 ret = wb_init(wb, bdi, gfp); 698 if (ret) 699 goto err_free; 700 701 ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); 702 if (ret) 703 goto err_wb_exit; 704 705 ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); 706 if (ret) 707 goto err_ref_exit; 708 709 wb->memcg_css = memcg_css; 710 wb->blkcg_css = blkcg_css; 711 
INIT_LIST_HEAD(&wb->b_attached); 712 INIT_WORK(&wb->release_work, cgwb_release_workfn); 713 set_bit(WB_registered, &wb->state); 714 bdi_get(bdi); 715 716 /* 717 * The root wb determines the registered state of the whole bdi and 718 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate 719 * whether they're still online. Don't link @wb if any is dead. 720 * See wb_memcg_offline() and wb_blkcg_offline(). 721 */ 722 ret = -ENODEV; 723 spin_lock_irqsave(&cgwb_lock, flags); 724 if (test_bit(WB_registered, &bdi->wb.state) && 725 blkcg_cgwb_list->next && memcg_cgwb_list->next) { 726 /* we might have raced another instance of this function */ 727 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); 728 if (!ret) { 729 list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); 730 list_add(&wb->memcg_node, memcg_cgwb_list); 731 list_add(&wb->blkcg_node, blkcg_cgwb_list); 732 blkcg_pin_online(blkcg_css); 733 css_get(memcg_css); 734 css_get(blkcg_css); 735 } 736 } 737 spin_unlock_irqrestore(&cgwb_lock, flags); 738 if (ret) { 739 if (ret == -EEXIST) 740 ret = 0; 741 goto err_fprop_exit; 742 } 743 goto out_put; 744 745 err_fprop_exit: 746 bdi_put(bdi); 747 fprop_local_destroy_percpu(&wb->memcg_completions); 748 err_ref_exit: 749 percpu_ref_exit(&wb->refcnt); 750 err_wb_exit: 751 wb_exit(wb); 752 err_free: 753 kfree(wb); 754 out_put: 755 css_put(blkcg_css); 756 return ret; 757 } 758 759 /** 760 * wb_get_lookup - get wb for a given memcg 761 * @bdi: target bdi 762 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) 763 * 764 * Try to get the wb for @memcg_css on @bdi. The returned wb has its 765 * refcount incremented. 766 * 767 * This function uses css_get() on @memcg_css and thus expects its refcnt 768 * to be positive on invocation. IOW, rcu_read_lock() protection on 769 * @memcg_css isn't enough. try_get it before calling this function. 770 * 771 * A wb is keyed by its associated memcg. 
As blkcg implicitly enables 772 * memcg on the default hierarchy, memcg association is guaranteed to be 773 * more specific (equal or descendant to the associated blkcg) and thus can 774 * identify both the memcg and blkcg associations. 775 * 776 * Because the blkcg associated with a memcg may change as blkcg is enabled 777 * and disabled closer to root in the hierarchy, each wb keeps track of 778 * both the memcg and blkcg associated with it and verifies the blkcg on 779 * each lookup. On mismatch, the existing wb is discarded and a new one is 780 * created. 781 */ 782 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, 783 struct cgroup_subsys_state *memcg_css) 784 { 785 struct bdi_writeback *wb; 786 787 if (!memcg_css->parent) 788 return &bdi->wb; 789 790 rcu_read_lock(); 791 wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); 792 if (wb) { 793 struct cgroup_subsys_state *blkcg_css; 794 795 /* see whether the blkcg association has changed */ 796 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); 797 if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) 798 wb = NULL; 799 css_put(blkcg_css); 800 } 801 rcu_read_unlock(); 802 803 return wb; 804 } 805 806 /** 807 * wb_get_create - get wb for a given memcg, create if necessary 808 * @bdi: target bdi 809 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) 810 * @gfp: allocation mask to use 811 * 812 * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to 813 * create one. See wb_get_lookup() for more details. 
814 */ 815 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, 816 struct cgroup_subsys_state *memcg_css, 817 gfp_t gfp) 818 { 819 struct bdi_writeback *wb; 820 821 might_alloc(gfp); 822 823 do { 824 wb = wb_get_lookup(bdi, memcg_css); 825 } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); 826 827 return wb; 828 } 829 830 static int cgwb_bdi_init(struct backing_dev_info *bdi) 831 { 832 int ret; 833 834 INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); 835 mutex_init(&bdi->cgwb_release_mutex); 836 init_rwsem(&bdi->wb_switch_rwsem); 837 838 ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); 839 if (!ret) { 840 bdi->wb.memcg_css = &root_mem_cgroup->css; 841 bdi->wb.blkcg_css = blkcg_root_css; 842 } 843 return ret; 844 } 845 846 static void cgwb_bdi_unregister(struct backing_dev_info *bdi) 847 { 848 struct radix_tree_iter iter; 849 void **slot; 850 struct bdi_writeback *wb; 851 852 WARN_ON(test_bit(WB_registered, &bdi->wb.state)); 853 854 spin_lock_irq(&cgwb_lock); 855 radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) 856 cgwb_kill(*slot); 857 spin_unlock_irq(&cgwb_lock); 858 859 mutex_lock(&bdi->cgwb_release_mutex); 860 spin_lock_irq(&cgwb_lock); 861 while (!list_empty(&bdi->wb_list)) { 862 wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, 863 bdi_node); 864 spin_unlock_irq(&cgwb_lock); 865 wb_shutdown(wb); 866 spin_lock_irq(&cgwb_lock); 867 } 868 spin_unlock_irq(&cgwb_lock); 869 mutex_unlock(&bdi->cgwb_release_mutex); 870 } 871 872 /* 873 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs 874 * 875 * Try to release dying cgwbs by switching attached inodes to the nearest 876 * living ancestor's writeback. Processed wbs are placed at the end 877 * of the list to guarantee the forward progress. 
878 */ 879 static void cleanup_offline_cgwbs_workfn(struct work_struct *work) 880 { 881 struct bdi_writeback *wb; 882 LIST_HEAD(processed); 883 884 spin_lock_irq(&cgwb_lock); 885 886 while (!list_empty(&offline_cgwbs)) { 887 wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, 888 offline_node); 889 list_move(&wb->offline_node, &processed); 890 891 /* 892 * If wb is dirty, cleaning up the writeback by switching 893 * attached inodes will result in an effective removal of any 894 * bandwidth restrictions, which isn't the goal. Instead, 895 * it can be postponed until the next time, when all io 896 * will be likely completed. If in the meantime some inodes 897 * will get re-dirtied, they should be eventually switched to 898 * a new cgwb. 899 */ 900 if (wb_has_dirty_io(wb)) 901 continue; 902 903 if (!wb_tryget(wb)) 904 continue; 905 906 spin_unlock_irq(&cgwb_lock); 907 while (cleanup_offline_cgwb(wb)) 908 cond_resched(); 909 spin_lock_irq(&cgwb_lock); 910 911 wb_put(wb); 912 } 913 914 if (!list_empty(&processed)) 915 list_splice_tail(&processed, &offline_cgwbs); 916 917 spin_unlock_irq(&cgwb_lock); 918 } 919 920 /** 921 * wb_memcg_offline - kill all wb's associated with a memcg being offlined 922 * @memcg: memcg being offlined 923 * 924 * Also prevents creation of any new wb's associated with @memcg. 925 */ 926 void wb_memcg_offline(struct mem_cgroup *memcg) 927 { 928 struct list_head *memcg_cgwb_list = &memcg->cgwb_list; 929 struct bdi_writeback *wb, *next; 930 931 spin_lock_irq(&cgwb_lock); 932 list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) 933 cgwb_kill(wb); 934 memcg_cgwb_list->next = NULL; /* prevent new wb's */ 935 spin_unlock_irq(&cgwb_lock); 936 937 queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); 938 } 939 940 /** 941 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined 942 * @css: blkcg being offlined 943 * 944 * Also prevents creation of any new wb's associated with @blkcg. 
945 */ 946 void wb_blkcg_offline(struct cgroup_subsys_state *css) 947 { 948 struct bdi_writeback *wb, *next; 949 struct list_head *list = blkcg_get_cgwb_list(css); 950 951 spin_lock_irq(&cgwb_lock); 952 list_for_each_entry_safe(wb, next, list, blkcg_node) 953 cgwb_kill(wb); 954 list->next = NULL; /* prevent new wb's */ 955 spin_unlock_irq(&cgwb_lock); 956 } 957 958 static void cgwb_bdi_register(struct backing_dev_info *bdi) 959 { 960 spin_lock_irq(&cgwb_lock); 961 list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); 962 spin_unlock_irq(&cgwb_lock); 963 } 964 965 static int __init cgwb_init(void) 966 { 967 /* 968 * There can be many concurrent release work items overwhelming 969 * system_wq. Put them in a separate wq and limit concurrency. 970 * There's no point in executing many of these in parallel. 971 */ 972 cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); 973 if (!cgwb_release_wq) 974 return -ENOMEM; 975 976 return 0; 977 } 978 subsys_initcall(cgwb_init); 979 980 #else /* CONFIG_CGROUP_WRITEBACK */ 981 982 static int cgwb_bdi_init(struct backing_dev_info *bdi) 983 { 984 return wb_init(&bdi->wb, bdi, GFP_KERNEL); 985 } 986 987 static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } 988 989 static void cgwb_bdi_register(struct backing_dev_info *bdi) 990 { 991 list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); 992 } 993 994 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) 995 { 996 list_del_rcu(&wb->bdi_node); 997 } 998 999 #endif /* CONFIG_CGROUP_WRITEBACK */ 1000 1001 int bdi_init(struct backing_dev_info *bdi) 1002 { 1003 bdi->dev = NULL; 1004 1005 kref_init(&bdi->refcnt); 1006 bdi->min_ratio = 0; 1007 bdi->max_ratio = 100 * BDI_RATIO_SCALE; 1008 bdi->max_prop_frac = FPROP_FRAC_BASE; 1009 INIT_LIST_HEAD(&bdi->bdi_list); 1010 INIT_LIST_HEAD(&bdi->wb_list); 1011 init_waitqueue_head(&bdi->wb_waitq); 1012 bdi->last_bdp_sleep = jiffies; 1013 1014 return cgwb_bdi_init(bdi); 1015 } 1016 1017 struct backing_dev_info *bdi_alloc(int 
node_id) 1018 { 1019 struct backing_dev_info *bdi; 1020 1021 bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); 1022 if (!bdi) 1023 return NULL; 1024 1025 if (bdi_init(bdi)) { 1026 kfree(bdi); 1027 return NULL; 1028 } 1029 bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; 1030 bdi->ra_pages = VM_READAHEAD_PAGES; 1031 bdi->io_pages = VM_READAHEAD_PAGES; 1032 timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); 1033 return bdi; 1034 } 1035 EXPORT_SYMBOL(bdi_alloc); 1036 1037 static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) 1038 { 1039 struct rb_node **p = &bdi_tree.rb_node; 1040 struct rb_node *parent = NULL; 1041 struct backing_dev_info *bdi; 1042 1043 lockdep_assert_held(&bdi_lock); 1044 1045 while (*p) { 1046 parent = *p; 1047 bdi = rb_entry(parent, struct backing_dev_info, rb_node); 1048 1049 if (bdi->id > id) 1050 p = &(*p)->rb_left; 1051 else if (bdi->id < id) 1052 p = &(*p)->rb_right; 1053 else 1054 break; 1055 } 1056 1057 if (parentp) 1058 *parentp = parent; 1059 return p; 1060 } 1061 1062 /** 1063 * bdi_get_by_id - lookup and get bdi from its id 1064 * @id: bdi id to lookup 1065 * 1066 * Find bdi matching @id and get it. Returns NULL if the matching bdi 1067 * doesn't exist or is already unregistered. 
1068 */ 1069 struct backing_dev_info *bdi_get_by_id(u64 id) 1070 { 1071 struct backing_dev_info *bdi = NULL; 1072 struct rb_node **p; 1073 1074 spin_lock_bh(&bdi_lock); 1075 p = bdi_lookup_rb_node(id, NULL); 1076 if (*p) { 1077 bdi = rb_entry(*p, struct backing_dev_info, rb_node); 1078 bdi_get(bdi); 1079 } 1080 spin_unlock_bh(&bdi_lock); 1081 1082 return bdi; 1083 } 1084 1085 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) 1086 { 1087 struct device *dev; 1088 struct rb_node *parent, **p; 1089 1090 if (bdi->dev) /* The driver needs to use separate queues per device */ 1091 return 0; 1092 1093 vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); 1094 dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); 1095 if (IS_ERR(dev)) 1096 return PTR_ERR(dev); 1097 1098 cgwb_bdi_register(bdi); 1099 bdi->dev = dev; 1100 1101 bdi_debug_register(bdi, dev_name(dev)); 1102 set_bit(WB_registered, &bdi->wb.state); 1103 1104 spin_lock_bh(&bdi_lock); 1105 1106 bdi->id = ++bdi_id_cursor; 1107 1108 p = bdi_lookup_rb_node(bdi->id, &parent); 1109 rb_link_node(&bdi->rb_node, parent, p); 1110 rb_insert_color(&bdi->rb_node, &bdi_tree); 1111 1112 list_add_tail_rcu(&bdi->bdi_list, &bdi_list); 1113 1114 spin_unlock_bh(&bdi_lock); 1115 1116 trace_writeback_bdi_register(bdi); 1117 return 0; 1118 } 1119 1120 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) 
1121 { 1122 va_list args; 1123 int ret; 1124 1125 va_start(args, fmt); 1126 ret = bdi_register_va(bdi, fmt, args); 1127 va_end(args); 1128 return ret; 1129 } 1130 EXPORT_SYMBOL(bdi_register); 1131 1132 void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) 1133 { 1134 WARN_ON_ONCE(bdi->owner); 1135 bdi->owner = owner; 1136 get_device(owner); 1137 } 1138 1139 /* 1140 * Remove bdi from bdi_list, and ensure that it is no longer visible 1141 */ 1142 static void bdi_remove_from_list(struct backing_dev_info *bdi) 1143 { 1144 spin_lock_bh(&bdi_lock); 1145 rb_erase(&bdi->rb_node, &bdi_tree); 1146 list_del_rcu(&bdi->bdi_list); 1147 spin_unlock_bh(&bdi_lock); 1148 1149 synchronize_rcu_expedited(); 1150 } 1151 1152 void bdi_unregister(struct backing_dev_info *bdi) 1153 { 1154 del_timer_sync(&bdi->laptop_mode_wb_timer); 1155 1156 /* make sure nobody finds us on the bdi_list anymore */ 1157 bdi_remove_from_list(bdi); 1158 wb_shutdown(&bdi->wb); 1159 cgwb_bdi_unregister(bdi); 1160 1161 /* 1162 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to 1163 * update the global bdi_min_ratio. 
1164 */ 1165 if (bdi->min_ratio) 1166 bdi_set_min_ratio(bdi, 0); 1167 1168 if (bdi->dev) { 1169 bdi_debug_unregister(bdi); 1170 device_unregister(bdi->dev); 1171 bdi->dev = NULL; 1172 } 1173 1174 if (bdi->owner) { 1175 put_device(bdi->owner); 1176 bdi->owner = NULL; 1177 } 1178 } 1179 EXPORT_SYMBOL(bdi_unregister); 1180 1181 static void release_bdi(struct kref *ref) 1182 { 1183 struct backing_dev_info *bdi = 1184 container_of(ref, struct backing_dev_info, refcnt); 1185 1186 WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); 1187 WARN_ON_ONCE(bdi->dev); 1188 wb_exit(&bdi->wb); 1189 kfree(bdi); 1190 } 1191 1192 void bdi_put(struct backing_dev_info *bdi) 1193 { 1194 kref_put(&bdi->refcnt, release_bdi); 1195 } 1196 EXPORT_SYMBOL(bdi_put); 1197 1198 struct backing_dev_info *inode_to_bdi(struct inode *inode) 1199 { 1200 struct super_block *sb; 1201 1202 if (!inode) 1203 return &noop_backing_dev_info; 1204 1205 sb = inode->i_sb; 1206 #ifdef CONFIG_BLOCK 1207 if (sb_is_blkdev_sb(sb)) 1208 return I_BDEV(inode)->bd_disk->bdi; 1209 #endif 1210 return sb->s_bdi; 1211 } 1212 EXPORT_SYMBOL(inode_to_bdi); 1213 1214 const char *bdi_dev_name(struct backing_dev_info *bdi) 1215 { 1216 if (!bdi || !bdi->dev) 1217 return bdi_unknown_name; 1218 return bdi->dev_name; 1219 } 1220 EXPORT_SYMBOL_GPL(bdi_dev_name); 1221