1 // SPDX-License-Identifier: GPL-2.0-only 2 3 #include <linux/blkdev.h> 4 #include <linux/wait.h> 5 #include <linux/rbtree.h> 6 #include <linux/kthread.h> 7 #include <linux/backing-dev.h> 8 #include <linux/blk-cgroup.h> 9 #include <linux/freezer.h> 10 #include <linux/fs.h> 11 #include <linux/pagemap.h> 12 #include <linux/mm.h> 13 #include <linux/sched/mm.h> 14 #include <linux/sched.h> 15 #include <linux/module.h> 16 #include <linux/writeback.h> 17 #include <linux/device.h> 18 #include <trace/events/writeback.h> 19 #include "internal.h" 20 21 struct backing_dev_info noop_backing_dev_info; 22 EXPORT_SYMBOL_GPL(noop_backing_dev_info); 23 24 static const char *bdi_unknown_name = "(unknown)"; 25 26 /* 27 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU 28 * reader side locking. 29 */ 30 DEFINE_SPINLOCK(bdi_lock); 31 static u64 bdi_id_cursor; 32 static struct rb_root bdi_tree = RB_ROOT; 33 LIST_HEAD(bdi_list); 34 35 /* bdi_wq serves all asynchronous writeback tasks */ 36 struct workqueue_struct *bdi_wq; 37 38 #ifdef CONFIG_DEBUG_FS 39 #include <linux/debugfs.h> 40 #include <linux/seq_file.h> 41 42 struct wb_stats { 43 unsigned long nr_dirty; 44 unsigned long nr_io; 45 unsigned long nr_more_io; 46 unsigned long nr_dirty_time; 47 unsigned long nr_writeback; 48 unsigned long nr_reclaimable; 49 unsigned long nr_dirtied; 50 unsigned long nr_written; 51 unsigned long dirty_thresh; 52 unsigned long wb_thresh; 53 }; 54 55 static struct dentry *bdi_debug_root; 56 57 static void bdi_debug_init(void) 58 { 59 bdi_debug_root = debugfs_create_dir("bdi", NULL); 60 } 61 62 static void collect_wb_stats(struct wb_stats *stats, 63 struct bdi_writeback *wb) 64 { 65 struct inode *inode; 66 67 spin_lock(&wb->list_lock); 68 list_for_each_entry(inode, &wb->b_dirty, i_io_list) 69 stats->nr_dirty++; 70 list_for_each_entry(inode, &wb->b_io, i_io_list) 71 stats->nr_io++; 72 list_for_each_entry(inode, &wb->b_more_io, i_io_list) 73 stats->nr_more_io++; 74 
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) 75 if (inode->i_state & I_DIRTY_TIME) 76 stats->nr_dirty_time++; 77 spin_unlock(&wb->list_lock); 78 79 stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); 80 stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); 81 stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); 82 stats->nr_written += wb_stat(wb, WB_WRITTEN); 83 stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); 84 } 85 86 #ifdef CONFIG_CGROUP_WRITEBACK 87 static void bdi_collect_stats(struct backing_dev_info *bdi, 88 struct wb_stats *stats) 89 { 90 struct bdi_writeback *wb; 91 92 rcu_read_lock(); 93 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { 94 if (!wb_tryget(wb)) 95 continue; 96 97 collect_wb_stats(stats, wb); 98 wb_put(wb); 99 } 100 rcu_read_unlock(); 101 } 102 #else 103 static void bdi_collect_stats(struct backing_dev_info *bdi, 104 struct wb_stats *stats) 105 { 106 collect_wb_stats(stats, &bdi->wb); 107 } 108 #endif 109 110 static int bdi_debug_stats_show(struct seq_file *m, void *v) 111 { 112 struct backing_dev_info *bdi = m->private; 113 unsigned long background_thresh; 114 unsigned long dirty_thresh; 115 struct wb_stats stats; 116 unsigned long tot_bw; 117 118 global_dirty_limits(&background_thresh, &dirty_thresh); 119 120 memset(&stats, 0, sizeof(stats)); 121 stats.dirty_thresh = dirty_thresh; 122 bdi_collect_stats(bdi, &stats); 123 tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); 124 125 seq_printf(m, 126 "BdiWriteback: %10lu kB\n" 127 "BdiReclaimable: %10lu kB\n" 128 "BdiDirtyThresh: %10lu kB\n" 129 "DirtyThresh: %10lu kB\n" 130 "BackgroundThresh: %10lu kB\n" 131 "BdiDirtied: %10lu kB\n" 132 "BdiWritten: %10lu kB\n" 133 "BdiWriteBandwidth: %10lu kBps\n" 134 "b_dirty: %10lu\n" 135 "b_io: %10lu\n" 136 "b_more_io: %10lu\n" 137 "b_dirty_time: %10lu\n" 138 "bdi_list: %10u\n" 139 "state: %10lx\n", 140 K(stats.nr_writeback), 141 K(stats.nr_reclaimable), 142 K(stats.wb_thresh), 143 K(dirty_thresh), 144 K(background_thresh), 145 
K(stats.nr_dirtied), 146 K(stats.nr_written), 147 K(tot_bw), 148 stats.nr_dirty, 149 stats.nr_io, 150 stats.nr_more_io, 151 stats.nr_dirty_time, 152 !list_empty(&bdi->bdi_list), bdi->wb.state); 153 154 return 0; 155 } 156 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); 157 158 static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, 159 struct wb_stats *stats) 160 { 161 162 seq_printf(m, 163 "WbCgIno: %10lu\n" 164 "WbWriteback: %10lu kB\n" 165 "WbReclaimable: %10lu kB\n" 166 "WbDirtyThresh: %10lu kB\n" 167 "WbDirtied: %10lu kB\n" 168 "WbWritten: %10lu kB\n" 169 "WbWriteBandwidth: %10lu kBps\n" 170 "b_dirty: %10lu\n" 171 "b_io: %10lu\n" 172 "b_more_io: %10lu\n" 173 "b_dirty_time: %10lu\n" 174 "state: %10lx\n\n", 175 #ifdef CONFIG_CGROUP_WRITEBACK 176 cgroup_ino(wb->memcg_css->cgroup), 177 #else 178 1ul, 179 #endif 180 K(stats->nr_writeback), 181 K(stats->nr_reclaimable), 182 K(stats->wb_thresh), 183 K(stats->nr_dirtied), 184 K(stats->nr_written), 185 K(wb->avg_write_bandwidth), 186 stats->nr_dirty, 187 stats->nr_io, 188 stats->nr_more_io, 189 stats->nr_dirty_time, 190 wb->state); 191 } 192 193 static int cgwb_debug_stats_show(struct seq_file *m, void *v) 194 { 195 struct backing_dev_info *bdi = m->private; 196 unsigned long background_thresh; 197 unsigned long dirty_thresh; 198 struct bdi_writeback *wb; 199 200 global_dirty_limits(&background_thresh, &dirty_thresh); 201 202 rcu_read_lock(); 203 list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { 204 struct wb_stats stats = { .dirty_thresh = dirty_thresh }; 205 206 if (!wb_tryget(wb)) 207 continue; 208 209 collect_wb_stats(&stats, wb); 210 211 /* 212 * Calculate thresh of wb in writeback cgroup which is min of 213 * thresh in global domain and thresh in cgroup domain. Drop 214 * rcu lock because cgwb_calc_thresh may sleep in 215 * cgroup_rstat_flush. We can do so here because we have a ref. 
216 */ 217 if (mem_cgroup_wb_domain(wb)) { 218 rcu_read_unlock(); 219 stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); 220 rcu_read_lock(); 221 } 222 223 wb_stats_show(m, wb, &stats); 224 225 wb_put(wb); 226 } 227 rcu_read_unlock(); 228 229 return 0; 230 } 231 DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); 232 233 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) 234 { 235 bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); 236 237 debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, 238 &bdi_debug_stats_fops); 239 debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, 240 &cgwb_debug_stats_fops); 241 } 242 243 static void bdi_debug_unregister(struct backing_dev_info *bdi) 244 { 245 debugfs_remove_recursive(bdi->debug_dir); 246 } 247 #else /* CONFIG_DEBUG_FS */ 248 static inline void bdi_debug_init(void) 249 { 250 } 251 static inline void bdi_debug_register(struct backing_dev_info *bdi, 252 const char *name) 253 { 254 } 255 static inline void bdi_debug_unregister(struct backing_dev_info *bdi) 256 { 257 } 258 #endif /* CONFIG_DEBUG_FS */ 259 260 static ssize_t read_ahead_kb_store(struct device *dev, 261 struct device_attribute *attr, 262 const char *buf, size_t count) 263 { 264 struct backing_dev_info *bdi = dev_get_drvdata(dev); 265 unsigned long read_ahead_kb; 266 ssize_t ret; 267 268 ret = kstrtoul(buf, 10, &read_ahead_kb); 269 if (ret < 0) 270 return ret; 271 272 bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); 273 274 return count; 275 } 276 277 #define BDI_SHOW(name, expr) \ 278 static ssize_t name##_show(struct device *dev, \ 279 struct device_attribute *attr, char *buf) \ 280 { \ 281 struct backing_dev_info *bdi = dev_get_drvdata(dev); \ 282 \ 283 return sysfs_emit(buf, "%lld\n", (long long)expr); \ 284 } \ 285 static DEVICE_ATTR_RW(name); 286 287 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) 288 289 static ssize_t min_ratio_store(struct device *dev, 290 struct device_attribute *attr, const char *buf, size_t 
count) 291 { 292 struct backing_dev_info *bdi = dev_get_drvdata(dev); 293 unsigned int ratio; 294 ssize_t ret; 295 296 ret = kstrtouint(buf, 10, &ratio); 297 if (ret < 0) 298 return ret; 299 300 ret = bdi_set_min_ratio(bdi, ratio); 301 if (!ret) 302 ret = count; 303 304 return ret; 305 } 306 BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) 307 308 static ssize_t min_ratio_fine_store(struct device *dev, 309 struct device_attribute *attr, const char *buf, size_t count) 310 { 311 struct backing_dev_info *bdi = dev_get_drvdata(dev); 312 unsigned int ratio; 313 ssize_t ret; 314 315 ret = kstrtouint(buf, 10, &ratio); 316 if (ret < 0) 317 return ret; 318 319 ret = bdi_set_min_ratio_no_scale(bdi, ratio); 320 if (!ret) 321 ret = count; 322 323 return ret; 324 } 325 BDI_SHOW(min_ratio_fine, bdi->min_ratio) 326 327 static ssize_t max_ratio_store(struct device *dev, 328 struct device_attribute *attr, const char *buf, size_t count) 329 { 330 struct backing_dev_info *bdi = dev_get_drvdata(dev); 331 unsigned int ratio; 332 ssize_t ret; 333 334 ret = kstrtouint(buf, 10, &ratio); 335 if (ret < 0) 336 return ret; 337 338 ret = bdi_set_max_ratio(bdi, ratio); 339 if (!ret) 340 ret = count; 341 342 return ret; 343 } 344 BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) 345 346 static ssize_t max_ratio_fine_store(struct device *dev, 347 struct device_attribute *attr, const char *buf, size_t count) 348 { 349 struct backing_dev_info *bdi = dev_get_drvdata(dev); 350 unsigned int ratio; 351 ssize_t ret; 352 353 ret = kstrtouint(buf, 10, &ratio); 354 if (ret < 0) 355 return ret; 356 357 ret = bdi_set_max_ratio_no_scale(bdi, ratio); 358 if (!ret) 359 ret = count; 360 361 return ret; 362 } 363 BDI_SHOW(max_ratio_fine, bdi->max_ratio) 364 365 static ssize_t min_bytes_show(struct device *dev, 366 struct device_attribute *attr, 367 char *buf) 368 { 369 struct backing_dev_info *bdi = dev_get_drvdata(dev); 370 371 return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); 372 } 373 374 
/*
 * sysfs store for min_bytes: parse an unsigned 64-bit decimal and pass it
 * to bdi_set_min_bytes(). Returns @count on success, otherwise the
 * negative error from parsing or from the setter.
 */
static ssize_t min_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(min_bytes);

/* sysfs show for max_bytes: prints the value of bdi_get_max_bytes(). */
static ssize_t max_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

/*
 * sysfs store for max_bytes: parse an unsigned 64-bit decimal and pass it
 * to bdi_set_max_bytes(). Returns @count on success, otherwise the
 * negative error from parsing or from the setter.
 */
static ssize_t max_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(max_bytes);

/*
 * Legacy read-only attribute: warns once that it has been removed and
 * always reports 0.
 */
static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

/*
 * sysfs store for strict_limit: parse an unsigned decimal and pass it to
 * bdi_set_strict_limit(). Returns @count on success, otherwise the
 * negative error from parsing or from the setter.
 */
static ssize_t strict_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int strict_limit;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &strict_limit);
	if (ret < 0)
		return ret;

	ret = bdi_set_strict_limit(bdi, strict_limit);
	if (!ret)
		ret = count;

	return ret;
}

/* sysfs show for strict_limit: 1 if BDI_CAP_STRICTLIMIT is set, else 0. */
static ssize_t strict_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%d\n",
			!!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
}
static DEVICE_ATTR_RW(strict_limit);

/* Attributes attached to every device of the "bdi" class. */
static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_min_ratio_fine.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_max_ratio_fine.attr,
	&dev_attr_min_bytes.attr,
	&dev_attr_max_bytes.attr,
	&dev_attr_stable_pages_required.attr,
	&dev_attr_strict_limit.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static const struct class bdi_class = {
	.name		= "bdi",
	.dev_groups	= bdi_dev_groups,
};

/*
 * Register the "bdi" device class and then set up the debugfs root.
 * Returns the class_register() error, or 0.
 */
static __init int bdi_class_init(void)
{
	int ret;

	ret = class_register(&bdi_class);
	if (ret)
		return ret;

	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

/* Allocate the "writeback" workqueue stored in bdi_wq; -ENOMEM on failure. */
static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(default_bdi_init);

/* Delayed-work handler for wb->bw_dwork: calls wb_update_bandwidth(). */
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))

/*
 * wb_init - initialise a bdi_writeback for @bdi
 *
 * Zeroes *@wb, then sets up its lists, locks, bandwidth fields (all
 * seeded with INIT_BW), delayed work items and per-cpu state using @gfp.
 * Returns 0 on success or the error from the per-cpu initialisers; if the
 * stat counters fail to initialise, the completions are destroyed again.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
	if (err)
		fprop_local_destroy_percpu(&wb->completions);

	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Unregister @wb: clear WB_registered under wb->work_lock (returning
 * early if it was already clear), remove @wb from its bdi's wb_list and
 * flush both of its delayed work items.
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_irq(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_irq(&wb->work_lock);
		return;
	}
	spin_unlock_irq(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work.  !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

/*
 * Release the per-cpu state set up by wb_init(). Warns if wb->dwork is
 * still pending.
 */
static void wb_exit(struct bdi_writeback *wb)
{
	WARN_ON(delayed_work_pending(&wb->dwork));
	percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

/* RCU callback queued by cgwb_release_workfn(): frees the wb itself. */
static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
	struct bdi_writeback *wb = container_of(rcu_head,
			struct bdi_writeback, rcu);

	percpu_ref_exit(&wb->refcnt);
	kfree(wb);
}

/*
 * Work handler queued by cgwb_release(): shuts @wb down under the bdi's
 * cgwb_release_mutex, drops the css references and the blkcg online pin,
 * unlinks it from offline_cgwbs, tears down its per-cpu state, drops the
 * bdi reference and finally frees the wb via call_rcu().
 */
static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	/* cache the bdi: it is still needed for bdi_put() after wb_exit() */
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(wb->blkcg_css);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	WARN_ON_ONCE(work_pending(&wb->switch_work));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

/*
 * percpu_ref release callback for wb->refcnt: defers the actual teardown
 * to cgwb_release_workfn() on cgwb_release_wq.
 */
static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

/*
 * Unlink @wb from the bdi's cgwb_tree and from the memcg/blkcg lists, put
 * it on offline_cgwbs and kill its percpu ref. Caller must hold cgwb_lock.
 */
static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

/* Remove @wb from its bdi's RCU-walked wb_list under cgwb_lock. */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

/*
 * cgwb_create - create and link a wb for @memcg_css on @bdi
 *
 * Returns 0 if a matching wb already exists, if a new one was linked, or
 * if the radix tree insert reported -EEXIST (treated as success). Returns
 * -ENOMEM or another negative error if allocation/initialisation fails,
 * and -ENODEV if the bdi is not registered or either cgwb list has been
 * marked dead. The blkcg css reference taken by cgroup_get_e_css() is
 * dropped on every exit path.
 */
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
	init_llist_head(&wb->switch_wbs_ctxs);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online.  Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		/* lost the race: the new wb is discarded, but report success */
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

	/* unwind in reverse order of setup */
err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	/* a css without a parent maps to the bdi's embedded wb */
	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	/* retry the lookup for as long as creation keeps reporting success */
	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}

/*
 * Initialise the cgroup-writeback parts of @bdi and its embedded root wb.
 * Returns the result of wb_init().
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
		INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
		init_llist_head(&bdi->wb.switch_wbs_ctxs);
	}
	return ret;
}

/*
 * Kill every cgwb in @bdi's cgwb_tree, then shut down each wb still on
 * bdi->wb_list. Warns if the root wb is still marked WB_registered.
 */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		/* cgwb_lock is dropped around wb_shutdown() and retaken after */
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes will result in an effective removal of any
		 * bandwidth restrictions, which isn't the goal.  Instead,
		 * it can be postponed until the next time, when all io
		 * will be likely completed.  If in the meantime some inodes
		 * will get re-dirtied, they should be eventually switched to
		 * a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		/* drop the lock while switching inodes; the ref pins @wb */
		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
	struct bdi_writeback *wb, *next;
	struct list_head *list = blkcg_get_cgwb_list(css);

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, list, blkcg_node)
		cgwb_kill(wb);
	list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

/* Add the bdi's embedded root wb to bdi->wb_list under cgwb_lock. */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

/* Allocate cgwb_release_wq; returns -ENOMEM on failure. */
static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq.  Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

/* Without cgroup writeback only the embedded wb needs initialising. */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

/* Add the embedded wb to bdi->wb_list (no cgwb_lock in this configuration). */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

/* Remove @wb from its bdi's wb_list. */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

/*
 * bdi_init - initialise the generic fields of @bdi
 *
 * Sets the refcount, ratio defaults, list heads and wait queue, then
 * initialises the writeback state via cgwb_bdi_init(), whose return value
 * is passed through.
 */
int bdi_init(struct backing_dev_info *bdi)
{
	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100 * BDI_RATIO_SCALE;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);
	bdi->last_bdp_sleep = jiffies;

	return cgwb_bdi_init(bdi);
}

/*
 * bdi_alloc - allocate and initialise a backing_dev_info on @node_id
 *
 * Returns the new bdi with default capabilities, readahead/io page counts
 * and the laptop-mode timer set up, or NULL if allocation or bdi_init()
 * fails.
 */
struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);

/*
 * Walk bdi_tree looking for @id. Returns the link slot: *slot is the
 * matching node if found, otherwise NULL at the position where a node
 * with @id would be inserted. If @parentp is non-NULL it receives the
 * last node visited. Caller must hold bdi_lock.
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

/*
 * bdi_register_va - register @bdi under a printf-style name
 *
 * Formats the name into bdi->dev_name, creates the class device, links
 * the root wb, registers debugfs, marks the root wb WB_registered and,
 * under bdi_lock, assigns the next id and inserts @bdi into bdi_tree and
 * bdi_list. Returns 0 on success (including when @bdi already has a
 * device) or the error from device_create().
 */
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

/* Variadic wrapper around bdi_register_va(). */
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

/*
 * Record @owner as the bdi's owning device and take a reference on it.
 * Warns once if an owner was already set.
 */
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

/*
 * bdi_unregister - undo bdi_register()
 *
 * Stops the laptop-mode timer, removes @bdi from the global tree/list,
 * shuts down the root wb and any cgwbs, resets a non-zero min_ratio, and
 * releases the class device and the owner reference if present.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
	timer_delete_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	/*
	 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
	 * update the global bdi_min_ratio.
	 */
	if (bdi->min_ratio)
		bdi_set_min_ratio(bdi, 0);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

/*
 * kref release callback: tears down the root wb and frees @bdi. Warns if
 * the bdi is still registered or still has a device.
 */
static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
			container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

/* Drop a reference on @bdi; release_bdi() runs when the last one goes. */
void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

/*
 * inode_to_bdi - return the backing_dev_info for @inode
 *
 * A NULL @inode maps to noop_backing_dev_info. With CONFIG_BLOCK, inodes
 * on the block-device superblock use their disk's bdi; all others use the
 * superblock's s_bdi.
 */
struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

/*
 * Return @bdi's device name, or "(unknown)" if @bdi is NULL or has no
 * registered device.
 */
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);