// SPDX-License-Identifier: GPL-2.0-only

#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#define K(x) ((x) << (PAGE_SHIFT - 10))

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	wb_thresh = wb_calc_thresh(wb, dirty_thresh);

	seq_printf(m,
		   "BdiWriteback: %10lu kB\n"
		   "BdiReclaimable: %10lu kB\n"
		   "BdiDirtyThresh: %10lu kB\n"
		   "DirtyThresh: %10lu kB\n"
		   "BackgroundThresh: %10lu kB\n"
		   "BdiDirtied: %10lu kB\n"
		   "BdiWritten: %10lu kB\n"
		   "BdiWriteBandwidth: %10lu kBps\n"
		   "b_dirty: %10lu\n"
		   "b_io: %10lu\n"
		   "b_more_io: %10lu\n"
		   "b_dirty_time: %10lu\n"
		   "bdi_list: %10u\n"
		   "state: %10lx\n",
		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
		   K(wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
		   (unsigned long) K(wb->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif
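
/*
 * Note on units (editorial addition): K() above converts a page count to
 * KiB (x << (PAGE_SHIFT - 10) == x * PAGE_SIZE / 1024), and the sysfs
 * store below performs the inverse conversion from KiB to pages. For
 * example, with 4 KiB pages (PAGE_SHIFT == 12) a read_ahead_kb value of
 * 128 becomes 128 >> 2 == 32 ra_pages.
 */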

static ssize_t read_ahead_kb_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int bdi_init(struct backing_dev_info *bdi);

static int __init default_bdi_init(void)
{
	int err;

	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;

	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_bh(&wb->work_lock);
}

static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))
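
/*
 * Editorial note: INIT_BW is expressed in pages per second, i.e.
 * (100 << 20) bytes/s divided by PAGE_SIZE. With 4 KiB pages this is
 * 100 << 8 == 25600 pages/s.
 */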

static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int i, err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
	wb->dirty_sleep = jiffies;

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, gfp);
		if (err)
			goto out_destroy_stat;
	}

	return 0;

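	/*
	 * Partial-failure unwind: destroy only the percpu counters that
	 * were successfully initialised in the loop above, then release
	 * the completions state set up earlier.
	 */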
out_destroy_stat:
	while (i--)
		percpu_counter_destroy(&wb->stat[i]);
	fprop_local_destroy_percpu(&wb->completions);
	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_bh(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_bh(&wb->work_lock);
		return;
	}
	spin_unlock_bh(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work. !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

static void wb_exit(struct bdi_writeback *wb)
{
	int i;

	WARN_ON(delayed_work_pending(&wb->dwork));

	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
		percpu_counter_destroy(&wb->stat[i]);

	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(blkcg);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	percpu_ref_exit(&wb->refcnt);
	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	kfree_rcu(wb, rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}
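
/*
 * Editorial summary of cgwb_create() below: allocate and install the
 * cgroup writeback struct (wb) for @memcg_css on @bdi. Invoked from
 * wb_get_create() when wb_get_lookup() finds no usable wb. On success
 * the new wb is inserted into bdi->cgwb_tree and linked on the bdi,
 * memcg and blkcg lists; returns 0 (also when it loses a creation race)
 * or a negative errno on failure.
 */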
static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct blkcg *blkcg;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	blkcg = css_to_blkcg(blkcg_css);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = &blkcg->cgwb_list;

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online. Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi. The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation. IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough. try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg. As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal to or a descendant of the associated blkcg) and thus
 * can identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup. On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
 * create one. See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	if (!memcg_css->parent)
		return &bdi->wb;

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}
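
/*
 * Editorial usage sketch (assumes the caller already holds a reference on
 * the memcg css, as required by wb_get_lookup() above): look up or create
 * the wb and fall back to the bdi-embedded root wb if creation fails:
 *
 *	struct bdi_writeback *wb;
 *
 *	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *	if (!wb)
 *		wb = &bdi->wb;
 *	... use wb, then drop the reference taken by the lookup ...
 *	wb_put(wb);
 */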

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
	}
	return ret;
}
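
/*
 * Editorial summary of cgwb_bdi_unregister() below: kill and drain all
 * cgwbs hanging off @bdi. Each wb is first removed from the radix tree
 * under cgwb_lock, then bdi->wb_list is walked and every remaining wb is
 * shut down with the lock dropped around wb_shutdown(), which sleeps.
 * bdi->cgwb_release_mutex serialises this against cgwb_release_workfn().
 */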
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes would effectively remove any bandwidth
		 * restrictions on it, which isn't the goal. Instead, the
		 * cleanup can be postponed until the next pass, by which
		 * time the I/O will likely have completed. If in the
		 * meantime some inodes get re-dirtied, they should
		 * eventually be switched to a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @blkcg: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct blkcg *blkcg)
{
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
		cgwb_kill(wb);
	blkcg->cgwb_list.next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq. Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

static int bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);

	ret = cgwb_bdi_init(bdi);

	return ret;
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
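
/*
 * Editorial summary of bdi_lookup_rb_node() below: find the position of
 * @id in bdi_tree. Returns the rb link slot where a bdi with @id is (or
 * would be) attached; if @parentp is non-NULL it is set to the prospective
 * parent node for use with rb_link_node(). Caller must hold bdi_lock.
 */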
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it. Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	del_timer_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
		container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);

static wait_queue_head_t congestion_wqh[2] = {
	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
static atomic_t nr_wb_congested[2];
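
/*
 * Editorial note: congestion_wqh[] holds one wait queue per direction
 * (indexed by @sync), and nr_wb_congested[] counts how many wb's currently
 * have the corresponding congested bit set. set_bdi_congested() and
 * clear_bdi_congested() flip the bit on the bdi's root wb and keep the
 * counter in sync; congestion_wait() sleeps on the matching wait queue.
 */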

void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	wait_queue_head_t *wqh = &congestion_wqh[sync];
	enum wb_congested_state bit;

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (test_and_clear_bit(bit, &bdi->wb.congested))
		atomic_dec(&nr_wb_congested[sync]);
	smp_mb__after_atomic();
	if (waitqueue_active(wqh))
		wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);

void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	enum wb_congested_state bit;

	bit = sync ? WB_sync_congested : WB_async_congested;
	if (!test_and_set_bit(bit, &bdi->wb.congested))
		atomic_inc(&nr_wb_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);

/**
 * congestion_wait - wait for a backing_dev to become uncongested
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion. If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
long congestion_wait(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(congestion_wait);