1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* memcontrol.c - Memory Controller 3 * 4 * Copyright IBM Corporation, 2007 5 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 6 * 7 * Copyright 2007 OpenVZ SWsoft Inc 8 * Author: Pavel Emelianov <xemul@openvz.org> 9 * 10 * Memory thresholds 11 * Copyright (C) 2009 Nokia Corporation 12 * Author: Kirill A. Shutemov 13 * 14 * Kernel Memory Controller 15 * Copyright (C) 2012 Parallels Inc. and Google Inc. 16 * Authors: Glauber Costa and Suleiman Souhlal 17 * 18 * Native page reclaim 19 * Charge lifetime sanitation 20 * Lockless page tracking & accounting 21 * Unified hierarchy configuration model 22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 23 * 24 * Per memcg lru locking 25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi 26 */ 27 28 #include <linux/page_counter.h> 29 #include <linux/memcontrol.h> 30 #include <linux/cgroup.h> 31 #include <linux/pagewalk.h> 32 #include <linux/sched/mm.h> 33 #include <linux/shmem_fs.h> 34 #include <linux/hugetlb.h> 35 #include <linux/pagemap.h> 36 #include <linux/vm_event_item.h> 37 #include <linux/smp.h> 38 #include <linux/page-flags.h> 39 #include <linux/backing-dev.h> 40 #include <linux/bit_spinlock.h> 41 #include <linux/rcupdate.h> 42 #include <linux/limits.h> 43 #include <linux/export.h> 44 #include <linux/mutex.h> 45 #include <linux/rbtree.h> 46 #include <linux/slab.h> 47 #include <linux/swap.h> 48 #include <linux/swapops.h> 49 #include <linux/spinlock.h> 50 #include <linux/eventfd.h> 51 #include <linux/poll.h> 52 #include <linux/sort.h> 53 #include <linux/fs.h> 54 #include <linux/seq_file.h> 55 #include <linux/vmpressure.h> 56 #include <linux/mm_inline.h> 57 #include <linux/swap_cgroup.h> 58 #include <linux/cpu.h> 59 #include <linux/oom.h> 60 #include <linux/lockdep.h> 61 #include <linux/file.h> 62 #include <linux/tracehook.h> 63 #include <linux/psi.h> 64 #include <linux/seq_buf.h> 65 #include "internal.h" 66 #include <net/sock.h> 67 #include <net/ip.h> 68 #include "slab.h" 69 70 #include <linux/uaccess.h> 71 72 #include <trace/events/vmscan.h> 73 74 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 75 EXPORT_SYMBOL(memory_cgrp_subsys); 76 77 struct mem_cgroup *root_mem_cgroup __read_mostly; 78 79 /* Active memory cgroup to use from an interrupt context */ 80 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); 81 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); 82 83 /* Socket memory accounting disabled? */ 84 static bool cgroup_memory_nosocket __ro_after_init; 85 86 /* Kernel memory accounting disabled? 
 */
bool cgroup_memory_nokmem __ro_after_init;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
        struct rb_root rb_root;
        struct rb_node *rb_rightmost;
        spinlock_t lock;
};

struct mem_cgroup_tree {
        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
        struct list_head list;
        struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
        /*
         * memcg which the event belongs to.
         */
        struct mem_cgroup *memcg;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these is stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * register_event() callback will be used to add a new userspace
         * waiter for changes related to this event. Use eventfd_signal()
         * on eventfd to send notification to userspace.
         */
        int (*register_event)(struct mem_cgroup *memcg,
                              struct eventfd_ctx *eventfd, const char *args);
        /*
         * unregister_event() callback will be called when userspace closes
         * the eventfd or when the cgroup is removed. This callback must be
         * set if you want to provide notification functionality.
         */
        void (*unregister_event)(struct mem_cgroup *memcg,
                                 struct eventfd_ctx *eventfd);
        /*
         * All fields below are needed to unregister the event when
         * userspace closes the eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_entry_t wait;
        struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
        spinlock_t lock; /* for from, to */
        struct mm_struct *mm;
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
        struct task_struct *moving_task;	/* a task moving charges */
        wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */

#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
        _MEM,
        _MEMSWAP,
        _OOM_TYPE,
        _KMEM,
        _TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
        for (iter = mem_cgroup_iter(root, NULL, NULL);	\
             iter != NULL;				\
             iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
        for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
             iter != NULL;				\
             iter = mem_cgroup_iter(NULL, iter, NULL))

static inline bool task_is_dying(void)
{
        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
                (current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
        if (!memcg)
                memcg = root_mem_cgroup;
        return &memcg->vmpressure;
}

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
        return container_of(vmpr, struct mem_cgroup, vmpressure);
}

#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;

bool mem_cgroup_kmem_disabled(void)
{
        return cgroup_memory_nokmem;
}

static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
                                      unsigned int nr_pages);

static void obj_cgroup_release(struct percpu_ref *ref)
{
        struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
        unsigned int nr_bytes;
        unsigned int nr_pages;
        unsigned long flags;

        /*
         * At this point all allocated objects are freed, and
         * objcg->nr_charged_bytes can't have an arbitrary byte value.
         * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
         *
         * The following sequence can lead to it:
         * 1) CPU0: objcg == stock->cached_objcg
         * 2) CPU1: we do a small allocation (e.g. 92 bytes),
         *          PAGE_SIZE bytes are charged
         * 3) CPU1: a process from another memcg is allocating something,
         *          the stock is flushed,
         *          objcg->nr_charged_bytes = PAGE_SIZE - 92
         * 4) CPU0: we release this object,
         *          92 bytes are added to stock->nr_bytes
         * 5) CPU0: stock is flushed,
         *          92 bytes are added to objcg->nr_charged_bytes
         *
         * As a result, nr_charged_bytes == PAGE_SIZE.
         * This page will be uncharged in obj_cgroup_release().
293 */ 294 nr_bytes = atomic_read(&objcg->nr_charged_bytes); 295 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); 296 nr_pages = nr_bytes >> PAGE_SHIFT; 297 298 if (nr_pages) 299 obj_cgroup_uncharge_pages(objcg, nr_pages); 300 301 spin_lock_irqsave(&css_set_lock, flags); 302 list_del(&objcg->list); 303 spin_unlock_irqrestore(&css_set_lock, flags); 304 305 percpu_ref_exit(ref); 306 kfree_rcu(objcg, rcu); 307 } 308 309 static struct obj_cgroup *obj_cgroup_alloc(void) 310 { 311 struct obj_cgroup *objcg; 312 int ret; 313 314 objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); 315 if (!objcg) 316 return NULL; 317 318 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, 319 GFP_KERNEL); 320 if (ret) { 321 kfree(objcg); 322 return NULL; 323 } 324 INIT_LIST_HEAD(&objcg->list); 325 return objcg; 326 } 327 328 static void memcg_reparent_objcgs(struct mem_cgroup *memcg, 329 struct mem_cgroup *parent) 330 { 331 struct obj_cgroup *objcg, *iter; 332 333 objcg = rcu_replace_pointer(memcg->objcg, NULL, true); 334 335 spin_lock_irq(&css_set_lock); 336 337 /* 1) Ready to reparent active objcg. */ 338 list_add(&objcg->list, &memcg->objcg_list); 339 /* 2) Reparent active objcg and already reparented objcgs to parent. */ 340 list_for_each_entry(iter, &memcg->objcg_list, list) 341 WRITE_ONCE(iter->memcg, parent); 342 /* 3) Move already reparented objcgs to the parent's list */ 343 list_splice(&memcg->objcg_list, &parent->objcg_list); 344 345 spin_unlock_irq(&css_set_lock); 346 347 percpu_ref_kill(&objcg->refcnt); 348 } 349 350 /* 351 * This will be used as a shrinker list's index. 352 * The main reason for not using cgroup id for this: 353 * this works better in sparse environments, where we have a lot of memcgs, 354 * but only a few kmem-limited. Or also, if we have, for instance, 200 355 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 356 * 200 entry array for that. 357 * 358 * The current size of the caches array is stored in memcg_nr_cache_ids. It 359 * will double each time we have to increase it. 360 */ 361 static DEFINE_IDA(memcg_cache_ida); 362 int memcg_nr_cache_ids; 363 364 /* Protects memcg_nr_cache_ids */ 365 static DECLARE_RWSEM(memcg_cache_ids_sem); 366 367 void memcg_get_cache_ids(void) 368 { 369 down_read(&memcg_cache_ids_sem); 370 } 371 372 void memcg_put_cache_ids(void) 373 { 374 up_read(&memcg_cache_ids_sem); 375 } 376 377 /* 378 * MIN_SIZE is different than 1, because we would like to avoid going through 379 * the alloc/free process all the time. In a small machine, 4 kmem-limited 380 * cgroups is a reasonable guess. In the future, it could be a parameter or 381 * tunable, but that is strictly not necessary. 382 * 383 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 384 * this constant directly from cgroup, but it is understandable that this is 385 * better kept as an internal representation in cgroup.c. In any case, the 386 * cgrp_id space is not getting any smaller, and we don't have to necessarily 387 * increase ours as well if it increases. 388 */ 389 #define MEMCG_CACHES_MIN_SIZE 4 390 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 391 392 /* 393 * A lot of the calls to the cache allocation functions are expected to be 394 * inlined by the compiler. 
Since the calls to memcg_slab_pre_alloc_hook() are 395 * conditional to this static branch, we'll have to allow modules that does 396 * kmem_cache_alloc and the such to see this symbol as well 397 */ 398 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); 399 EXPORT_SYMBOL(memcg_kmem_enabled_key); 400 #endif 401 402 /** 403 * mem_cgroup_css_from_page - css of the memcg associated with a page 404 * @page: page of interest 405 * 406 * If memcg is bound to the default hierarchy, css of the memcg associated 407 * with @page is returned. The returned css remains associated with @page 408 * until it is released. 409 * 410 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 411 * is returned. 412 */ 413 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 414 { 415 struct mem_cgroup *memcg; 416 417 memcg = page_memcg(page); 418 419 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 420 memcg = root_mem_cgroup; 421 422 return &memcg->css; 423 } 424 425 /** 426 * page_cgroup_ino - return inode number of the memcg a page is charged to 427 * @page: the page 428 * 429 * Look up the closest online ancestor of the memory cgroup @page is charged to 430 * and return its inode number or 0 if @page is not charged to any cgroup. It 431 * is safe to call this function without holding a reference to @page. 432 * 433 * Note, this function is inherently racy, because there is nothing to prevent 434 * the cgroup inode from getting torn down and potentially reallocated a moment 435 * after page_cgroup_ino() returns, so it only should be used by callers that 436 * do not care (such as procfs interfaces). 437 */ 438 ino_t page_cgroup_ino(struct page *page) 439 { 440 struct mem_cgroup *memcg; 441 unsigned long ino = 0; 442 443 rcu_read_lock(); 444 memcg = page_memcg_check(page); 445 446 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 447 memcg = parent_mem_cgroup(memcg); 448 if (memcg) 449 ino = cgroup_ino(memcg->css.cgroup); 450 rcu_read_unlock(); 451 return ino; 452 } 453 454 static struct mem_cgroup_per_node * 455 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) 456 { 457 int nid = page_to_nid(page); 458 459 return memcg->nodeinfo[nid]; 460 } 461 462 static struct mem_cgroup_tree_per_node * 463 soft_limit_tree_node(int nid) 464 { 465 return soft_limit_tree.rb_tree_per_node[nid]; 466 } 467 468 static struct mem_cgroup_tree_per_node * 469 soft_limit_tree_from_page(struct page *page) 470 { 471 int nid = page_to_nid(page); 472 473 return soft_limit_tree.rb_tree_per_node[nid]; 474 } 475 476 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 477 struct mem_cgroup_tree_per_node *mctz, 478 unsigned long new_usage_in_excess) 479 { 480 struct rb_node **p = &mctz->rb_root.rb_node; 481 struct rb_node *parent = NULL; 482 struct mem_cgroup_per_node *mz_node; 483 bool rightmost = true; 484 485 if (mz->on_tree) 486 return; 487 488 mz->usage_in_excess = new_usage_in_excess; 489 if (!mz->usage_in_excess) 490 return; 491 while (*p) { 492 parent = *p; 493 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 494 tree_node); 495 if (mz->usage_in_excess < mz_node->usage_in_excess) { 496 p = &(*p)->rb_left; 497 rightmost = false; 498 } else { 499 p = &(*p)->rb_right; 500 } 501 } 502 503 if (rightmost) 504 mctz->rb_rightmost = &mz->tree_node; 505 506 rb_link_node(&mz->tree_node, parent, p); 507 rb_insert_color(&mz->tree_node, &mctz->rb_root); 508 mz->on_tree = true; 509 } 510 511 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node 
*mz, 512 struct mem_cgroup_tree_per_node *mctz) 513 { 514 if (!mz->on_tree) 515 return; 516 517 if (&mz->tree_node == mctz->rb_rightmost) 518 mctz->rb_rightmost = rb_prev(&mz->tree_node); 519 520 rb_erase(&mz->tree_node, &mctz->rb_root); 521 mz->on_tree = false; 522 } 523 524 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 525 struct mem_cgroup_tree_per_node *mctz) 526 { 527 unsigned long flags; 528 529 spin_lock_irqsave(&mctz->lock, flags); 530 __mem_cgroup_remove_exceeded(mz, mctz); 531 spin_unlock_irqrestore(&mctz->lock, flags); 532 } 533 534 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 535 { 536 unsigned long nr_pages = page_counter_read(&memcg->memory); 537 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 538 unsigned long excess = 0; 539 540 if (nr_pages > soft_limit) 541 excess = nr_pages - soft_limit; 542 543 return excess; 544 } 545 546 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 547 { 548 unsigned long excess; 549 struct mem_cgroup_per_node *mz; 550 struct mem_cgroup_tree_per_node *mctz; 551 552 mctz = soft_limit_tree_from_page(page); 553 if (!mctz) 554 return; 555 /* 556 * Necessary to update all ancestors when hierarchy is used. 557 * because their event counter is not touched. 558 */ 559 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 560 mz = mem_cgroup_page_nodeinfo(memcg, page); 561 excess = soft_limit_excess(memcg); 562 /* 563 * We have to update the tree if mz is on RB-tree or 564 * mem is over its softlimit. 565 */ 566 if (excess || mz->on_tree) { 567 unsigned long flags; 568 569 spin_lock_irqsave(&mctz->lock, flags); 570 /* if on-tree, remove it */ 571 if (mz->on_tree) 572 __mem_cgroup_remove_exceeded(mz, mctz); 573 /* 574 * Insert again. mz->usage_in_excess will be updated. 575 * If excess is 0, no tree ops. 576 */ 577 __mem_cgroup_insert_exceeded(mz, mctz, excess); 578 spin_unlock_irqrestore(&mctz->lock, flags); 579 } 580 } 581 } 582 583 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 584 { 585 struct mem_cgroup_tree_per_node *mctz; 586 struct mem_cgroup_per_node *mz; 587 int nid; 588 589 for_each_node(nid) { 590 mz = memcg->nodeinfo[nid]; 591 mctz = soft_limit_tree_node(nid); 592 if (mctz) 593 mem_cgroup_remove_exceeded(mz, mctz); 594 } 595 } 596 597 static struct mem_cgroup_per_node * 598 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 599 { 600 struct mem_cgroup_per_node *mz; 601 602 retry: 603 mz = NULL; 604 if (!mctz->rb_rightmost) 605 goto done; /* Nothing to reclaim from */ 606 607 mz = rb_entry(mctz->rb_rightmost, 608 struct mem_cgroup_per_node, tree_node); 609 /* 610 * Remove the node now but someone else can add it back, 611 * we will to add it back at the end of reclaim to its correct 612 * position in the tree. 613 */ 614 __mem_cgroup_remove_exceeded(mz, mctz); 615 if (!soft_limit_excess(mz->memcg) || 616 !css_tryget(&mz->memcg->css)) 617 goto retry; 618 done: 619 return mz; 620 } 621 622 static struct mem_cgroup_per_node * 623 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 624 { 625 struct mem_cgroup_per_node *mz; 626 627 spin_lock_irq(&mctz->lock); 628 mz = __mem_cgroup_largest_soft_limit_node(mctz); 629 spin_unlock_irq(&mctz->lock); 630 return mz; 631 } 632 633 /* 634 * memcg and lruvec stats flushing 635 * 636 * Many codepaths leading to stats update or read are performance sensitive and 637 * adding stats flushing in such codepaths is not desirable. 
 * So, to optimize flushing, the kernel does the following:
 *
 * 1) Periodically and asynchronously flush the stats every 2 seconds so that
 *    the rstat update tree does not grow unbounded.
 *
 * 2) Flush the stats synchronously on the reader side only when there are
 *    more than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This lets the
 *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates,
 *    but only for 2 seconds due to (1).
 */
static void flush_memcg_stats_dwork(struct work_struct *w);
static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);

static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
{
        cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
        if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
                atomic_inc(&stats_flush_threshold);
}

static void __mem_cgroup_flush_stats(void)
{
        unsigned long flag;

        if (!spin_trylock_irqsave(&stats_flush_lock, flag))
                return;

        cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
        atomic_set(&stats_flush_threshold, 0);
        spin_unlock_irqrestore(&stats_flush_lock, flag);
}

void mem_cgroup_flush_stats(void)
{
        if (atomic_read(&stats_flush_threshold) > num_online_cpus())
                __mem_cgroup_flush_stats();
}

static void flush_memcg_stats_dwork(struct work_struct *w)
{
        mem_cgroup_flush_stats();
        queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
        if (mem_cgroup_disabled())
                return;

        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
        memcg_rstat_updated(memcg);
}

/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
        long x = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
#endif
        return x;
}

void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val)
{
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup *memcg;

        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;

        /* Update memcg */
        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);

        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);

        memcg_rstat_updated(memcg);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup. This
 * function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
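 *
 * A minimal, illustrative call-site sketch (assuming the caller already
 * holds a context in which per-cpu stat updates are safe):
 *
 *	__mod_lruvec_state(lruvec, NR_FILE_DIRTY, 1);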
742 */ 743 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, 744 int val) 745 { 746 /* Update node */ 747 __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); 748 749 /* Update memcg and lruvec */ 750 if (!mem_cgroup_disabled()) 751 __mod_memcg_lruvec_state(lruvec, idx, val); 752 } 753 754 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, 755 int val) 756 { 757 struct page *head = compound_head(page); /* rmap on tail pages */ 758 struct mem_cgroup *memcg; 759 pg_data_t *pgdat = page_pgdat(page); 760 struct lruvec *lruvec; 761 762 rcu_read_lock(); 763 memcg = page_memcg(head); 764 /* Untracked pages have no memcg, no lruvec. Update only the node */ 765 if (!memcg) { 766 rcu_read_unlock(); 767 __mod_node_page_state(pgdat, idx, val); 768 return; 769 } 770 771 lruvec = mem_cgroup_lruvec(memcg, pgdat); 772 __mod_lruvec_state(lruvec, idx, val); 773 rcu_read_unlock(); 774 } 775 EXPORT_SYMBOL(__mod_lruvec_page_state); 776 777 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) 778 { 779 pg_data_t *pgdat = page_pgdat(virt_to_page(p)); 780 struct mem_cgroup *memcg; 781 struct lruvec *lruvec; 782 783 rcu_read_lock(); 784 memcg = mem_cgroup_from_obj(p); 785 786 /* 787 * Untracked pages have no memcg, no lruvec. Update only the 788 * node. If we reparent the slab objects to the root memcg, 789 * when we free the slab object, we need to update the per-memcg 790 * vmstats to keep it correct for the root memcg. 791 */ 792 if (!memcg) { 793 __mod_node_page_state(pgdat, idx, val); 794 } else { 795 lruvec = mem_cgroup_lruvec(memcg, pgdat); 796 __mod_lruvec_state(lruvec, idx, val); 797 } 798 rcu_read_unlock(); 799 } 800 801 /* 802 * mod_objcg_mlstate() may be called with irq enabled, so 803 * mod_memcg_lruvec_state() should be used. 804 */ 805 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, 806 struct pglist_data *pgdat, 807 enum node_stat_item idx, int nr) 808 { 809 struct mem_cgroup *memcg; 810 struct lruvec *lruvec; 811 812 rcu_read_lock(); 813 memcg = obj_cgroup_memcg(objcg); 814 lruvec = mem_cgroup_lruvec(memcg, pgdat); 815 mod_memcg_lruvec_state(lruvec, idx, nr); 816 rcu_read_unlock(); 817 } 818 819 /** 820 * __count_memcg_events - account VM events in a cgroup 821 * @memcg: the memory cgroup 822 * @idx: the event item 823 * @count: the number of events that occurred 824 */ 825 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, 826 unsigned long count) 827 { 828 if (mem_cgroup_disabled()) 829 return; 830 831 __this_cpu_add(memcg->vmstats_percpu->events[idx], count); 832 memcg_rstat_updated(memcg); 833 } 834 835 static unsigned long memcg_events(struct mem_cgroup *memcg, int event) 836 { 837 return READ_ONCE(memcg->vmstats.events[event]); 838 } 839 840 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) 841 { 842 long x = 0; 843 int cpu; 844 845 for_each_possible_cpu(cpu) 846 x += per_cpu(memcg->vmstats_percpu->events[event], cpu); 847 return x; 848 } 849 850 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 851 struct page *page, 852 int nr_pages) 853 { 854 /* pagein of a big page is an event. 
So, ignore page size */ 855 if (nr_pages > 0) 856 __count_memcg_events(memcg, PGPGIN, 1); 857 else { 858 __count_memcg_events(memcg, PGPGOUT, 1); 859 nr_pages = -nr_pages; /* for event */ 860 } 861 862 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); 863 } 864 865 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 866 enum mem_cgroup_events_target target) 867 { 868 unsigned long val, next; 869 870 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); 871 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); 872 /* from time_after() in jiffies.h */ 873 if ((long)(next - val) < 0) { 874 switch (target) { 875 case MEM_CGROUP_TARGET_THRESH: 876 next = val + THRESHOLDS_EVENTS_TARGET; 877 break; 878 case MEM_CGROUP_TARGET_SOFTLIMIT: 879 next = val + SOFTLIMIT_EVENTS_TARGET; 880 break; 881 default: 882 break; 883 } 884 __this_cpu_write(memcg->vmstats_percpu->targets[target], next); 885 return true; 886 } 887 return false; 888 } 889 890 /* 891 * Check events in order. 892 * 893 */ 894 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 895 { 896 /* threshold event is triggered in finer grain than soft limit */ 897 if (unlikely(mem_cgroup_event_ratelimit(memcg, 898 MEM_CGROUP_TARGET_THRESH))) { 899 bool do_softlimit; 900 901 do_softlimit = mem_cgroup_event_ratelimit(memcg, 902 MEM_CGROUP_TARGET_SOFTLIMIT); 903 mem_cgroup_threshold(memcg); 904 if (unlikely(do_softlimit)) 905 mem_cgroup_update_tree(memcg, page); 906 } 907 } 908 909 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 910 { 911 /* 912 * mm_update_next_owner() may clear mm->owner to NULL 913 * if it races with swapoff, page migration, etc. 914 * So this can be called with p == NULL. 915 */ 916 if (unlikely(!p)) 917 return NULL; 918 919 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 920 } 921 EXPORT_SYMBOL(mem_cgroup_from_task); 922 923 static __always_inline struct mem_cgroup *active_memcg(void) 924 { 925 if (!in_task()) 926 return this_cpu_read(int_active_memcg); 927 else 928 return current->active_memcg; 929 } 930 931 /** 932 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. 933 * @mm: mm from which memcg should be extracted. It can be NULL. 934 * 935 * Obtain a reference on mm->memcg and returns it if successful. If mm 936 * is NULL, then the memcg is chosen as follows: 937 * 1) The active memcg, if set. 938 * 2) current->mm->memcg, if available 939 * 3) root memcg 940 * If mem_cgroup is disabled, NULL is returned. 941 */ 942 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 943 { 944 struct mem_cgroup *memcg; 945 946 if (mem_cgroup_disabled()) 947 return NULL; 948 949 /* 950 * Page cache insertions can happen without an 951 * actual mm context, e.g. during disk probing 952 * on boot, loopback IO, acct() writes etc. 953 * 954 * No need to css_get on root memcg as the reference 955 * counting is disabled on the root level in the 956 * cgroup core. See CSS_NO_REF. 
957 */ 958 if (unlikely(!mm)) { 959 memcg = active_memcg(); 960 if (unlikely(memcg)) { 961 /* remote memcg must hold a ref */ 962 css_get(&memcg->css); 963 return memcg; 964 } 965 mm = current->mm; 966 if (unlikely(!mm)) 967 return root_mem_cgroup; 968 } 969 970 rcu_read_lock(); 971 do { 972 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 973 if (unlikely(!memcg)) 974 memcg = root_mem_cgroup; 975 } while (!css_tryget(&memcg->css)); 976 rcu_read_unlock(); 977 return memcg; 978 } 979 EXPORT_SYMBOL(get_mem_cgroup_from_mm); 980 981 static __always_inline bool memcg_kmem_bypass(void) 982 { 983 /* Allow remote memcg charging from any context. */ 984 if (unlikely(active_memcg())) 985 return false; 986 987 /* Memcg to charge can't be determined. */ 988 if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) 989 return true; 990 991 return false; 992 } 993 994 /** 995 * mem_cgroup_iter - iterate over memory cgroup hierarchy 996 * @root: hierarchy root 997 * @prev: previously returned memcg, NULL on first invocation 998 * @reclaim: cookie for shared reclaim walks, NULL for full walks 999 * 1000 * Returns references to children of the hierarchy below @root, or 1001 * @root itself, or %NULL after a full round-trip. 1002 * 1003 * Caller must pass the return value in @prev on subsequent 1004 * invocations for reference counting, or use mem_cgroup_iter_break() 1005 * to cancel a hierarchy walk before the round-trip is complete. 1006 * 1007 * Reclaimers can specify a node in @reclaim to divide up the memcgs 1008 * in the hierarchy among all concurrent reclaimers operating on the 1009 * same node. 1010 */ 1011 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1012 struct mem_cgroup *prev, 1013 struct mem_cgroup_reclaim_cookie *reclaim) 1014 { 1015 struct mem_cgroup_reclaim_iter *iter; 1016 struct cgroup_subsys_state *css = NULL; 1017 struct mem_cgroup *memcg = NULL; 1018 struct mem_cgroup *pos = NULL; 1019 1020 if (mem_cgroup_disabled()) 1021 return NULL; 1022 1023 if (!root) 1024 root = root_mem_cgroup; 1025 1026 if (prev && !reclaim) 1027 pos = prev; 1028 1029 rcu_read_lock(); 1030 1031 if (reclaim) { 1032 struct mem_cgroup_per_node *mz; 1033 1034 mz = root->nodeinfo[reclaim->pgdat->node_id]; 1035 iter = &mz->iter; 1036 1037 if (prev && reclaim->generation != iter->generation) 1038 goto out_unlock; 1039 1040 while (1) { 1041 pos = READ_ONCE(iter->position); 1042 if (!pos || css_tryget(&pos->css)) 1043 break; 1044 /* 1045 * css reference reached zero, so iter->position will 1046 * be cleared by ->css_released. However, we should not 1047 * rely on this happening soon, because ->css_released 1048 * is called from a work queue, and by busy-waiting we 1049 * might block it. So we clear iter->position right 1050 * away. 1051 */ 1052 (void)cmpxchg(&iter->position, pos, NULL); 1053 } 1054 } 1055 1056 if (pos) 1057 css = &pos->css; 1058 1059 for (;;) { 1060 css = css_next_descendant_pre(css, &root->css); 1061 if (!css) { 1062 /* 1063 * Reclaimers share the hierarchy walk, and a 1064 * new one might jump in right at the end of 1065 * the hierarchy - make sure they see at least 1066 * one group and restart from the beginning. 1067 */ 1068 if (!prev) 1069 continue; 1070 break; 1071 } 1072 1073 /* 1074 * Verify the css and acquire a reference. The root 1075 * is provided by the caller, so we know it's alive 1076 * and kicking, and don't take an extra reference. 
1077 */ 1078 memcg = mem_cgroup_from_css(css); 1079 1080 if (css == &root->css) 1081 break; 1082 1083 if (css_tryget(css)) 1084 break; 1085 1086 memcg = NULL; 1087 } 1088 1089 if (reclaim) { 1090 /* 1091 * The position could have already been updated by a competing 1092 * thread, so check that the value hasn't changed since we read 1093 * it to avoid reclaiming from the same cgroup twice. 1094 */ 1095 (void)cmpxchg(&iter->position, pos, memcg); 1096 1097 if (pos) 1098 css_put(&pos->css); 1099 1100 if (!memcg) 1101 iter->generation++; 1102 else if (!prev) 1103 reclaim->generation = iter->generation; 1104 } 1105 1106 out_unlock: 1107 rcu_read_unlock(); 1108 if (prev && prev != root) 1109 css_put(&prev->css); 1110 1111 return memcg; 1112 } 1113 1114 /** 1115 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1116 * @root: hierarchy root 1117 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1118 */ 1119 void mem_cgroup_iter_break(struct mem_cgroup *root, 1120 struct mem_cgroup *prev) 1121 { 1122 if (!root) 1123 root = root_mem_cgroup; 1124 if (prev && prev != root) 1125 css_put(&prev->css); 1126 } 1127 1128 static void __invalidate_reclaim_iterators(struct mem_cgroup *from, 1129 struct mem_cgroup *dead_memcg) 1130 { 1131 struct mem_cgroup_reclaim_iter *iter; 1132 struct mem_cgroup_per_node *mz; 1133 int nid; 1134 1135 for_each_node(nid) { 1136 mz = from->nodeinfo[nid]; 1137 iter = &mz->iter; 1138 cmpxchg(&iter->position, dead_memcg, NULL); 1139 } 1140 } 1141 1142 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 1143 { 1144 struct mem_cgroup *memcg = dead_memcg; 1145 struct mem_cgroup *last; 1146 1147 do { 1148 __invalidate_reclaim_iterators(memcg, dead_memcg); 1149 last = memcg; 1150 } while ((memcg = parent_mem_cgroup(memcg))); 1151 1152 /* 1153 * When cgruop1 non-hierarchy mode is used, 1154 * parent_mem_cgroup() does not walk all the way up to the 1155 * cgroup root (root_mem_cgroup). So we have to handle 1156 * dead_memcg from cgroup root separately. 1157 */ 1158 if (last != root_mem_cgroup) 1159 __invalidate_reclaim_iterators(root_mem_cgroup, 1160 dead_memcg); 1161 } 1162 1163 /** 1164 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy 1165 * @memcg: hierarchy root 1166 * @fn: function to call for each task 1167 * @arg: argument passed to @fn 1168 * 1169 * This function iterates over tasks attached to @memcg or to any of its 1170 * descendants and calls @fn for each task. If @fn returns a non-zero 1171 * value, the function breaks the iteration loop and returns the value. 1172 * Otherwise, it will iterate over all tasks and return 0. 1173 * 1174 * This function must not be called for the root memory cgroup. 
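 *
 * A minimal usage sketch; the callback name below is hypothetical and only
 * illustrates the calling convention (return non-zero to stop the walk):
 *
 *	static int count_one_task(struct task_struct *task, void *arg)
 *	{
 *		(*(unsigned long *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_tasks = 0;
 *
 *	mem_cgroup_scan_tasks(memcg, count_one_task, &nr_tasks);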
1175 */ 1176 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1177 int (*fn)(struct task_struct *, void *), void *arg) 1178 { 1179 struct mem_cgroup *iter; 1180 int ret = 0; 1181 1182 BUG_ON(memcg == root_mem_cgroup); 1183 1184 for_each_mem_cgroup_tree(iter, memcg) { 1185 struct css_task_iter it; 1186 struct task_struct *task; 1187 1188 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 1189 while (!ret && (task = css_task_iter_next(&it))) 1190 ret = fn(task, arg); 1191 css_task_iter_end(&it); 1192 if (ret) { 1193 mem_cgroup_iter_break(memcg, iter); 1194 break; 1195 } 1196 } 1197 return ret; 1198 } 1199 1200 #ifdef CONFIG_DEBUG_VM 1201 void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) 1202 { 1203 struct mem_cgroup *memcg; 1204 1205 if (mem_cgroup_disabled()) 1206 return; 1207 1208 memcg = page_memcg(page); 1209 1210 if (!memcg) 1211 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); 1212 else 1213 VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); 1214 } 1215 #endif 1216 1217 /** 1218 * lock_page_lruvec - lock and return lruvec for a given page. 1219 * @page: the page 1220 * 1221 * These functions are safe to use under any of the following conditions: 1222 * - page locked 1223 * - PageLRU cleared 1224 * - lock_page_memcg() 1225 * - page->_refcount is zero 1226 */ 1227 struct lruvec *lock_page_lruvec(struct page *page) 1228 { 1229 struct lruvec *lruvec; 1230 1231 lruvec = mem_cgroup_page_lruvec(page); 1232 spin_lock(&lruvec->lru_lock); 1233 1234 lruvec_memcg_debug(lruvec, page); 1235 1236 return lruvec; 1237 } 1238 1239 struct lruvec *lock_page_lruvec_irq(struct page *page) 1240 { 1241 struct lruvec *lruvec; 1242 1243 lruvec = mem_cgroup_page_lruvec(page); 1244 spin_lock_irq(&lruvec->lru_lock); 1245 1246 lruvec_memcg_debug(lruvec, page); 1247 1248 return lruvec; 1249 } 1250 1251 struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) 1252 { 1253 struct lruvec *lruvec; 1254 1255 lruvec = mem_cgroup_page_lruvec(page); 1256 spin_lock_irqsave(&lruvec->lru_lock, *flags); 1257 1258 lruvec_memcg_debug(lruvec, page); 1259 1260 return lruvec; 1261 } 1262 1263 /** 1264 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1265 * @lruvec: mem_cgroup per zone lru vector 1266 * @lru: index of lru list the page is sitting on 1267 * @zid: zone id of the accounted pages 1268 * @nr_pages: positive when adding or negative when removing 1269 * 1270 * This function must be called under lru_lock, just before a page is added 1271 * to or just after a page is removed from an lru list (that ordering being 1272 * so as to allow it to check that lru_size 0 is consistent with list_empty). 
1273 */ 1274 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1275 int zid, int nr_pages) 1276 { 1277 struct mem_cgroup_per_node *mz; 1278 unsigned long *lru_size; 1279 long size; 1280 1281 if (mem_cgroup_disabled()) 1282 return; 1283 1284 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1285 lru_size = &mz->lru_zone_size[zid][lru]; 1286 1287 if (nr_pages < 0) 1288 *lru_size += nr_pages; 1289 1290 size = *lru_size; 1291 if (WARN_ONCE(size < 0, 1292 "%s(%p, %d, %d): lru_size %ld\n", 1293 __func__, lruvec, lru, nr_pages, size)) { 1294 VM_BUG_ON(1); 1295 *lru_size = 0; 1296 } 1297 1298 if (nr_pages > 0) 1299 *lru_size += nr_pages; 1300 } 1301 1302 /** 1303 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1304 * @memcg: the memory cgroup 1305 * 1306 * Returns the maximum amount of memory @mem can be charged with, in 1307 * pages. 1308 */ 1309 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1310 { 1311 unsigned long margin = 0; 1312 unsigned long count; 1313 unsigned long limit; 1314 1315 count = page_counter_read(&memcg->memory); 1316 limit = READ_ONCE(memcg->memory.max); 1317 if (count < limit) 1318 margin = limit - count; 1319 1320 if (do_memsw_account()) { 1321 count = page_counter_read(&memcg->memsw); 1322 limit = READ_ONCE(memcg->memsw.max); 1323 if (count < limit) 1324 margin = min(margin, limit - count); 1325 else 1326 margin = 0; 1327 } 1328 1329 return margin; 1330 } 1331 1332 /* 1333 * A routine for checking "mem" is under move_account() or not. 1334 * 1335 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1336 * moving cgroups. This is for waiting at high-memory pressure 1337 * caused by "move". 1338 */ 1339 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1340 { 1341 struct mem_cgroup *from; 1342 struct mem_cgroup *to; 1343 bool ret = false; 1344 /* 1345 * Unlike task_move routines, we access mc.to, mc.from not under 1346 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1347 */ 1348 spin_lock(&mc.lock); 1349 from = mc.from; 1350 to = mc.to; 1351 if (!from) 1352 goto unlock; 1353 1354 ret = mem_cgroup_is_descendant(from, memcg) || 1355 mem_cgroup_is_descendant(to, memcg); 1356 unlock: 1357 spin_unlock(&mc.lock); 1358 return ret; 1359 } 1360 1361 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1362 { 1363 if (mc.moving_task && current != mc.moving_task) { 1364 if (mem_cgroup_under_move(memcg)) { 1365 DEFINE_WAIT(wait); 1366 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1367 /* moving charge context might have finished. 
*/ 1368 if (mc.moving_task) 1369 schedule(); 1370 finish_wait(&mc.waitq, &wait); 1371 return true; 1372 } 1373 } 1374 return false; 1375 } 1376 1377 struct memory_stat { 1378 const char *name; 1379 unsigned int idx; 1380 }; 1381 1382 static const struct memory_stat memory_stats[] = { 1383 { "anon", NR_ANON_MAPPED }, 1384 { "file", NR_FILE_PAGES }, 1385 { "kernel_stack", NR_KERNEL_STACK_KB }, 1386 { "pagetables", NR_PAGETABLE }, 1387 { "percpu", MEMCG_PERCPU_B }, 1388 { "sock", MEMCG_SOCK }, 1389 { "shmem", NR_SHMEM }, 1390 { "file_mapped", NR_FILE_MAPPED }, 1391 { "file_dirty", NR_FILE_DIRTY }, 1392 { "file_writeback", NR_WRITEBACK }, 1393 #ifdef CONFIG_SWAP 1394 { "swapcached", NR_SWAPCACHE }, 1395 #endif 1396 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1397 { "anon_thp", NR_ANON_THPS }, 1398 { "file_thp", NR_FILE_THPS }, 1399 { "shmem_thp", NR_SHMEM_THPS }, 1400 #endif 1401 { "inactive_anon", NR_INACTIVE_ANON }, 1402 { "active_anon", NR_ACTIVE_ANON }, 1403 { "inactive_file", NR_INACTIVE_FILE }, 1404 { "active_file", NR_ACTIVE_FILE }, 1405 { "unevictable", NR_UNEVICTABLE }, 1406 { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, 1407 { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, 1408 1409 /* The memory events */ 1410 { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, 1411 { "workingset_refault_file", WORKINGSET_REFAULT_FILE }, 1412 { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON }, 1413 { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE }, 1414 { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, 1415 { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, 1416 { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, 1417 }; 1418 1419 /* Translate stat items to the correct unit for memory.stat output */ 1420 static int memcg_page_state_unit(int item) 1421 { 1422 switch (item) { 1423 case MEMCG_PERCPU_B: 1424 case NR_SLAB_RECLAIMABLE_B: 1425 case NR_SLAB_UNRECLAIMABLE_B: 1426 case WORKINGSET_REFAULT_ANON: 1427 case WORKINGSET_REFAULT_FILE: 1428 case WORKINGSET_ACTIVATE_ANON: 1429 case WORKINGSET_ACTIVATE_FILE: 1430 case WORKINGSET_RESTORE_ANON: 1431 case WORKINGSET_RESTORE_FILE: 1432 case WORKINGSET_NODERECLAIM: 1433 return 1; 1434 case NR_KERNEL_STACK_KB: 1435 return SZ_1K; 1436 default: 1437 return PAGE_SIZE; 1438 } 1439 } 1440 1441 static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, 1442 int item) 1443 { 1444 return memcg_page_state(memcg, item) * memcg_page_state_unit(item); 1445 } 1446 1447 static char *memory_stat_format(struct mem_cgroup *memcg) 1448 { 1449 struct seq_buf s; 1450 int i; 1451 1452 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); 1453 if (!s.buffer) 1454 return NULL; 1455 1456 /* 1457 * Provide statistics on the state of the memory subsystem as 1458 * well as cumulative event counters that show past behavior. 
1459 * 1460 * This list is ordered following a combination of these gradients: 1461 * 1) generic big picture -> specifics and details 1462 * 2) reflecting userspace activity -> reflecting kernel heuristics 1463 * 1464 * Current memory state: 1465 */ 1466 mem_cgroup_flush_stats(); 1467 1468 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1469 u64 size; 1470 1471 size = memcg_page_state_output(memcg, memory_stats[i].idx); 1472 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); 1473 1474 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 1475 size += memcg_page_state_output(memcg, 1476 NR_SLAB_RECLAIMABLE_B); 1477 seq_buf_printf(&s, "slab %llu\n", size); 1478 } 1479 } 1480 1481 /* Accumulated memory events */ 1482 1483 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), 1484 memcg_events(memcg, PGFAULT)); 1485 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), 1486 memcg_events(memcg, PGMAJFAULT)); 1487 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), 1488 memcg_events(memcg, PGREFILL)); 1489 seq_buf_printf(&s, "pgscan %lu\n", 1490 memcg_events(memcg, PGSCAN_KSWAPD) + 1491 memcg_events(memcg, PGSCAN_DIRECT)); 1492 seq_buf_printf(&s, "pgsteal %lu\n", 1493 memcg_events(memcg, PGSTEAL_KSWAPD) + 1494 memcg_events(memcg, PGSTEAL_DIRECT)); 1495 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), 1496 memcg_events(memcg, PGACTIVATE)); 1497 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), 1498 memcg_events(memcg, PGDEACTIVATE)); 1499 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), 1500 memcg_events(memcg, PGLAZYFREE)); 1501 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), 1502 memcg_events(memcg, PGLAZYFREED)); 1503 1504 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1505 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), 1506 memcg_events(memcg, THP_FAULT_ALLOC)); 1507 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), 1508 memcg_events(memcg, THP_COLLAPSE_ALLOC)); 1509 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1510 1511 /* The above should easily fit into one page */ 1512 WARN_ON_ONCE(seq_buf_has_overflowed(&s)); 1513 1514 return s.buffer; 1515 } 1516 1517 #define K(x) ((x) << (PAGE_SHIFT-10)) 1518 /** 1519 * mem_cgroup_print_oom_context: Print OOM information relevant to 1520 * memory controller. 1521 * @memcg: The memory cgroup that went over limit 1522 * @p: Task that is going to be killed 1523 * 1524 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1525 * enabled 1526 */ 1527 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1528 { 1529 rcu_read_lock(); 1530 1531 if (memcg) { 1532 pr_cont(",oom_memcg="); 1533 pr_cont_cgroup_path(memcg->css.cgroup); 1534 } else 1535 pr_cont(",global_oom"); 1536 if (p) { 1537 pr_cont(",task_memcg="); 1538 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1539 } 1540 rcu_read_unlock(); 1541 } 1542 1543 /** 1544 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1545 * memory controller. 
1546 * @memcg: The memory cgroup that went over limit 1547 */ 1548 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1549 { 1550 char *buf; 1551 1552 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1553 K((u64)page_counter_read(&memcg->memory)), 1554 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 1555 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1556 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 1557 K((u64)page_counter_read(&memcg->swap)), 1558 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 1559 else { 1560 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1561 K((u64)page_counter_read(&memcg->memsw)), 1562 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1563 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1564 K((u64)page_counter_read(&memcg->kmem)), 1565 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1566 } 1567 1568 pr_info("Memory cgroup stats for "); 1569 pr_cont_cgroup_path(memcg->css.cgroup); 1570 pr_cont(":"); 1571 buf = memory_stat_format(memcg); 1572 if (!buf) 1573 return; 1574 pr_info("%s", buf); 1575 kfree(buf); 1576 } 1577 1578 /* 1579 * Return the memory (and swap, if configured) limit for a memcg. 1580 */ 1581 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1582 { 1583 unsigned long max = READ_ONCE(memcg->memory.max); 1584 1585 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 1586 if (mem_cgroup_swappiness(memcg)) 1587 max += min(READ_ONCE(memcg->swap.max), 1588 (unsigned long)total_swap_pages); 1589 } else { /* v1 */ 1590 if (mem_cgroup_swappiness(memcg)) { 1591 /* Calculate swap excess capacity from memsw limit */ 1592 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 1593 1594 max += min(swap, (unsigned long)total_swap_pages); 1595 } 1596 } 1597 return max; 1598 } 1599 1600 unsigned long mem_cgroup_size(struct mem_cgroup *memcg) 1601 { 1602 return page_counter_read(&memcg->memory); 1603 } 1604 1605 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1606 int order) 1607 { 1608 struct oom_control oc = { 1609 .zonelist = NULL, 1610 .nodemask = NULL, 1611 .memcg = memcg, 1612 .gfp_mask = gfp_mask, 1613 .order = order, 1614 }; 1615 bool ret = true; 1616 1617 if (mutex_lock_killable(&oom_lock)) 1618 return true; 1619 1620 if (mem_cgroup_margin(memcg) >= (1 << order)) 1621 goto unlock; 1622 1623 /* 1624 * A few threads which were not waiting at mutex_lock_killable() can 1625 * fail to bail out. Therefore, check again after holding oom_lock. 1626 */ 1627 ret = task_is_dying() || out_of_memory(&oc); 1628 1629 unlock: 1630 mutex_unlock(&oom_lock); 1631 return ret; 1632 } 1633 1634 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1635 pg_data_t *pgdat, 1636 gfp_t gfp_mask, 1637 unsigned long *total_scanned) 1638 { 1639 struct mem_cgroup *victim = NULL; 1640 int total = 0; 1641 int loop = 0; 1642 unsigned long excess; 1643 unsigned long nr_scanned; 1644 struct mem_cgroup_reclaim_cookie reclaim = { 1645 .pgdat = pgdat, 1646 }; 1647 1648 excess = soft_limit_excess(root_memcg); 1649 1650 while (1) { 1651 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1652 if (!victim) { 1653 loop++; 1654 if (loop >= 2) { 1655 /* 1656 * If we have not been able to reclaim 1657 * anything, it might because there are 1658 * no reclaimable pages under this hierarchy 1659 */ 1660 if (!total) 1661 break; 1662 /* 1663 * We want to do more targeted reclaim. 
				 * excess >> 2 is not too excessive, so we
				 * don't reclaim too much, nor too little,
				 * which would keep us coming back to reclaim
				 * from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If it is, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to undo what we set up before reaching the failing
		 * subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows, because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
1763 */ 1764 spin_lock(&memcg_oom_lock); 1765 for_each_mem_cgroup_tree(iter, memcg) 1766 if (iter->under_oom > 0) 1767 iter->under_oom--; 1768 spin_unlock(&memcg_oom_lock); 1769 } 1770 1771 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1772 1773 struct oom_wait_info { 1774 struct mem_cgroup *memcg; 1775 wait_queue_entry_t wait; 1776 }; 1777 1778 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1779 unsigned mode, int sync, void *arg) 1780 { 1781 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1782 struct mem_cgroup *oom_wait_memcg; 1783 struct oom_wait_info *oom_wait_info; 1784 1785 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1786 oom_wait_memcg = oom_wait_info->memcg; 1787 1788 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1789 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1790 return 0; 1791 return autoremove_wake_function(wait, mode, sync, arg); 1792 } 1793 1794 static void memcg_oom_recover(struct mem_cgroup *memcg) 1795 { 1796 /* 1797 * For the following lockless ->under_oom test, the only required 1798 * guarantee is that it must see the state asserted by an OOM when 1799 * this function is called as a result of userland actions 1800 * triggered by the notification of the OOM. This is trivially 1801 * achieved by invoking mem_cgroup_mark_under_oom() before 1802 * triggering notification. 1803 */ 1804 if (memcg && memcg->under_oom) 1805 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1806 } 1807 1808 enum oom_status { 1809 OOM_SUCCESS, 1810 OOM_FAILED, 1811 OOM_ASYNC, 1812 OOM_SKIPPED 1813 }; 1814 1815 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1816 { 1817 enum oom_status ret; 1818 bool locked; 1819 1820 if (order > PAGE_ALLOC_COSTLY_ORDER) 1821 return OOM_SKIPPED; 1822 1823 memcg_memory_event(memcg, MEMCG_OOM); 1824 1825 /* 1826 * We are in the middle of the charge context here, so we 1827 * don't want to block when potentially sitting on a callstack 1828 * that holds all kinds of filesystem and mm locks. 1829 * 1830 * cgroup1 allows disabling the OOM killer and waiting for outside 1831 * handling until the charge can succeed; remember the context and put 1832 * the task to sleep at the end of the page fault when all locks are 1833 * released. 1834 * 1835 * On the other hand, in-kernel OOM killer allows for an async victim 1836 * memory reclaim (oom_reaper) and that means that we are not solely 1837 * relying on the oom victim to make a forward progress and we can 1838 * invoke the oom killer here. 1839 * 1840 * Please note that mem_cgroup_out_of_memory might fail to find a 1841 * victim and then we have to bail out from the charge path. 
1842 */ 1843 if (memcg->oom_kill_disable) { 1844 if (!current->in_user_fault) 1845 return OOM_SKIPPED; 1846 css_get(&memcg->css); 1847 current->memcg_in_oom = memcg; 1848 current->memcg_oom_gfp_mask = mask; 1849 current->memcg_oom_order = order; 1850 1851 return OOM_ASYNC; 1852 } 1853 1854 mem_cgroup_mark_under_oom(memcg); 1855 1856 locked = mem_cgroup_oom_trylock(memcg); 1857 1858 if (locked) 1859 mem_cgroup_oom_notify(memcg); 1860 1861 mem_cgroup_unmark_under_oom(memcg); 1862 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1863 ret = OOM_SUCCESS; 1864 else 1865 ret = OOM_FAILED; 1866 1867 if (locked) 1868 mem_cgroup_oom_unlock(memcg); 1869 1870 return ret; 1871 } 1872 1873 /** 1874 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1875 * @handle: actually kill/wait or just clean up the OOM state 1876 * 1877 * This has to be called at the end of a page fault if the memcg OOM 1878 * handler was enabled. 1879 * 1880 * Memcg supports userspace OOM handling where failed allocations must 1881 * sleep on a waitqueue until the userspace task resolves the 1882 * situation. Sleeping directly in the charge context with all kinds 1883 * of locks held is not a good idea, instead we remember an OOM state 1884 * in the task and mem_cgroup_oom_synchronize() has to be called at 1885 * the end of the page fault to complete the OOM handling. 1886 * 1887 * Returns %true if an ongoing memcg OOM situation was detected and 1888 * completed, %false otherwise. 1889 */ 1890 bool mem_cgroup_oom_synchronize(bool handle) 1891 { 1892 struct mem_cgroup *memcg = current->memcg_in_oom; 1893 struct oom_wait_info owait; 1894 bool locked; 1895 1896 /* OOM is global, do not handle */ 1897 if (!memcg) 1898 return false; 1899 1900 if (!handle) 1901 goto cleanup; 1902 1903 owait.memcg = memcg; 1904 owait.wait.flags = 0; 1905 owait.wait.func = memcg_oom_wake_function; 1906 owait.wait.private = current; 1907 INIT_LIST_HEAD(&owait.wait.entry); 1908 1909 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1910 mem_cgroup_mark_under_oom(memcg); 1911 1912 locked = mem_cgroup_oom_trylock(memcg); 1913 1914 if (locked) 1915 mem_cgroup_oom_notify(memcg); 1916 1917 if (locked && !memcg->oom_kill_disable) { 1918 mem_cgroup_unmark_under_oom(memcg); 1919 finish_wait(&memcg_oom_waitq, &owait.wait); 1920 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, 1921 current->memcg_oom_order); 1922 } else { 1923 schedule(); 1924 mem_cgroup_unmark_under_oom(memcg); 1925 finish_wait(&memcg_oom_waitq, &owait.wait); 1926 } 1927 1928 if (locked) { 1929 mem_cgroup_oom_unlock(memcg); 1930 /* 1931 * There is no guarantee that an OOM-lock contender 1932 * sees the wakeups triggered by the OOM kill 1933 * uncharges. Wake any sleepers explicitly. 1934 */ 1935 memcg_oom_recover(memcg); 1936 } 1937 cleanup: 1938 current->memcg_in_oom = NULL; 1939 css_put(&memcg->css); 1940 return true; 1941 } 1942 1943 /** 1944 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 1945 * @victim: task to be killed by the OOM killer 1946 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 1947 * 1948 * Returns a pointer to a memory cgroup, which has to be cleaned up 1949 * by killing all belonging OOM-killable tasks. 1950 * 1951 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
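 *
 * A minimal usage sketch based only on the contract above (the kill callback
 * passed to mem_cgroup_scan_tasks() is hypothetical):
 *
 *	struct mem_cgroup *group;
 *
 *	group = mem_cgroup_get_oom_group(victim, oom_domain);
 *	if (group) {
 *		mem_cgroup_print_oom_group(group);
 *		mem_cgroup_scan_tasks(group, kill_task_fn, message);
 *		mem_cgroup_put(group);
 *	}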
1952 */ 1953 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 1954 struct mem_cgroup *oom_domain) 1955 { 1956 struct mem_cgroup *oom_group = NULL; 1957 struct mem_cgroup *memcg; 1958 1959 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1960 return NULL; 1961 1962 if (!oom_domain) 1963 oom_domain = root_mem_cgroup; 1964 1965 rcu_read_lock(); 1966 1967 memcg = mem_cgroup_from_task(victim); 1968 if (memcg == root_mem_cgroup) 1969 goto out; 1970 1971 /* 1972 * If the victim task has been asynchronously moved to a different 1973 * memory cgroup, we might end up killing tasks outside oom_domain. 1974 * In this case it's better to ignore memory.group.oom. 1975 */ 1976 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 1977 goto out; 1978 1979 /* 1980 * Traverse the memory cgroup hierarchy from the victim task's 1981 * cgroup up to the OOMing cgroup (or root) to find the 1982 * highest-level memory cgroup with oom.group set. 1983 */ 1984 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 1985 if (memcg->oom_group) 1986 oom_group = memcg; 1987 1988 if (memcg == oom_domain) 1989 break; 1990 } 1991 1992 if (oom_group) 1993 css_get(&oom_group->css); 1994 out: 1995 rcu_read_unlock(); 1996 1997 return oom_group; 1998 } 1999 2000 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2001 { 2002 pr_info("Tasks in "); 2003 pr_cont_cgroup_path(memcg->css.cgroup); 2004 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2005 } 2006 2007 /** 2008 * lock_page_memcg - lock a page and memcg binding 2009 * @page: the page 2010 * 2011 * This function protects unlocked LRU pages from being moved to 2012 * another cgroup. 2013 * 2014 * It ensures lifetime of the locked memcg. Caller is responsible 2015 * for the lifetime of the page. 2016 */ 2017 void lock_page_memcg(struct page *page) 2018 { 2019 struct page *head = compound_head(page); /* rmap on tail pages */ 2020 struct mem_cgroup *memcg; 2021 unsigned long flags; 2022 2023 /* 2024 * The RCU lock is held throughout the transaction. The fast 2025 * path can get away without acquiring the memcg->move_lock 2026 * because page moving starts with an RCU grace period. 2027 */ 2028 rcu_read_lock(); 2029 2030 if (mem_cgroup_disabled()) 2031 return; 2032 again: 2033 memcg = page_memcg(head); 2034 if (unlikely(!memcg)) 2035 return; 2036 2037 #ifdef CONFIG_PROVE_LOCKING 2038 local_irq_save(flags); 2039 might_lock(&memcg->move_lock); 2040 local_irq_restore(flags); 2041 #endif 2042 2043 if (atomic_read(&memcg->moving_account) <= 0) 2044 return; 2045 2046 spin_lock_irqsave(&memcg->move_lock, flags); 2047 if (memcg != page_memcg(head)) { 2048 spin_unlock_irqrestore(&memcg->move_lock, flags); 2049 goto again; 2050 } 2051 2052 /* 2053 * When charge migration first begins, we can have multiple 2054 * critical sections holding the fast-path RCU lock and one 2055 * holding the slowpath move_lock. Track the task who has the 2056 * move_lock for unlock_page_memcg(). 
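 * Only the recorded owner releases move_lock again:
 * __unlock_page_memcg() drops it when move_lock_task matches current
 * and otherwise just ends the RCU read-side section.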
2057 */ 2058 memcg->move_lock_task = current; 2059 memcg->move_lock_flags = flags; 2060 } 2061 EXPORT_SYMBOL(lock_page_memcg); 2062 2063 static void __unlock_page_memcg(struct mem_cgroup *memcg) 2064 { 2065 if (memcg && memcg->move_lock_task == current) { 2066 unsigned long flags = memcg->move_lock_flags; 2067 2068 memcg->move_lock_task = NULL; 2069 memcg->move_lock_flags = 0; 2070 2071 spin_unlock_irqrestore(&memcg->move_lock, flags); 2072 } 2073 2074 rcu_read_unlock(); 2075 } 2076 2077 /** 2078 * unlock_page_memcg - unlock a page and memcg binding 2079 * @page: the page 2080 */ 2081 void unlock_page_memcg(struct page *page) 2082 { 2083 struct page *head = compound_head(page); 2084 2085 __unlock_page_memcg(page_memcg(head)); 2086 } 2087 EXPORT_SYMBOL(unlock_page_memcg); 2088 2089 struct obj_stock { 2090 #ifdef CONFIG_MEMCG_KMEM 2091 struct obj_cgroup *cached_objcg; 2092 struct pglist_data *cached_pgdat; 2093 unsigned int nr_bytes; 2094 int nr_slab_reclaimable_b; 2095 int nr_slab_unreclaimable_b; 2096 #else 2097 int dummy[0]; 2098 #endif 2099 }; 2100 2101 struct memcg_stock_pcp { 2102 struct mem_cgroup *cached; /* this never be root cgroup */ 2103 unsigned int nr_pages; 2104 struct obj_stock task_obj; 2105 struct obj_stock irq_obj; 2106 2107 struct work_struct work; 2108 unsigned long flags; 2109 #define FLUSHING_CACHED_CHARGE 0 2110 }; 2111 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2112 static DEFINE_MUTEX(percpu_charge_mutex); 2113 2114 #ifdef CONFIG_MEMCG_KMEM 2115 static void drain_obj_stock(struct obj_stock *stock); 2116 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2117 struct mem_cgroup *root_memcg); 2118 2119 #else 2120 static inline void drain_obj_stock(struct obj_stock *stock) 2121 { 2122 } 2123 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2124 struct mem_cgroup *root_memcg) 2125 { 2126 return false; 2127 } 2128 #endif 2129 2130 /* 2131 * Most kmem_cache_alloc() calls are from user context. The irq disable/enable 2132 * sequence used in this case to access content from object stock is slow. 2133 * To optimize for user context access, there are now two object stocks for 2134 * task context and interrupt context access respectively. 2135 * 2136 * The task context object stock can be accessed by disabling preemption only 2137 * which is cheap in non-preempt kernel. The interrupt context object stock 2138 * can only be accessed after disabling interrupt. User context code can 2139 * access interrupt object stock, but not vice versa. 2140 */ 2141 static inline struct obj_stock *get_obj_stock(unsigned long *pflags) 2142 { 2143 struct memcg_stock_pcp *stock; 2144 2145 if (likely(in_task())) { 2146 *pflags = 0UL; 2147 preempt_disable(); 2148 stock = this_cpu_ptr(&memcg_stock); 2149 return &stock->task_obj; 2150 } 2151 2152 local_irq_save(*pflags); 2153 stock = this_cpu_ptr(&memcg_stock); 2154 return &stock->irq_obj; 2155 } 2156 2157 static inline void put_obj_stock(unsigned long flags) 2158 { 2159 if (likely(in_task())) 2160 preempt_enable(); 2161 else 2162 local_irq_restore(flags); 2163 } 2164 2165 /** 2166 * consume_stock: Try to consume stocked charge on this cpu. 2167 * @memcg: memcg to consume from. 2168 * @nr_pages: how many pages to charge. 2169 * 2170 * The charges will only happen if @memcg matches the current cpu's memcg 2171 * stock, and at least @nr_pages are available in that stock. Failure to 2172 * service an allocation will refill the stock. 2173 * 2174 * returns true if successful, false otherwise. 
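 *
 * Requests larger than MEMCG_CHARGE_BATCH bypass the stock entirely, and
 * the check runs with IRQs disabled so only the local CPU's cache is ever
 * touched.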
2175 */ 2176 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2177 { 2178 struct memcg_stock_pcp *stock; 2179 unsigned long flags; 2180 bool ret = false; 2181 2182 if (nr_pages > MEMCG_CHARGE_BATCH) 2183 return ret; 2184 2185 local_irq_save(flags); 2186 2187 stock = this_cpu_ptr(&memcg_stock); 2188 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2189 stock->nr_pages -= nr_pages; 2190 ret = true; 2191 } 2192 2193 local_irq_restore(flags); 2194 2195 return ret; 2196 } 2197 2198 /* 2199 * Returns stocks cached in percpu and reset cached information. 2200 */ 2201 static void drain_stock(struct memcg_stock_pcp *stock) 2202 { 2203 struct mem_cgroup *old = stock->cached; 2204 2205 if (!old) 2206 return; 2207 2208 if (stock->nr_pages) { 2209 page_counter_uncharge(&old->memory, stock->nr_pages); 2210 if (do_memsw_account()) 2211 page_counter_uncharge(&old->memsw, stock->nr_pages); 2212 stock->nr_pages = 0; 2213 } 2214 2215 css_put(&old->css); 2216 stock->cached = NULL; 2217 } 2218 2219 static void drain_local_stock(struct work_struct *dummy) 2220 { 2221 struct memcg_stock_pcp *stock; 2222 unsigned long flags; 2223 2224 /* 2225 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. 2226 * drain_stock races is that we always operate on local CPU stock 2227 * here with IRQ disabled 2228 */ 2229 local_irq_save(flags); 2230 2231 stock = this_cpu_ptr(&memcg_stock); 2232 drain_obj_stock(&stock->irq_obj); 2233 if (in_task()) 2234 drain_obj_stock(&stock->task_obj); 2235 drain_stock(stock); 2236 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2237 2238 local_irq_restore(flags); 2239 } 2240 2241 /* 2242 * Cache charges(val) to local per_cpu area. 2243 * This will be consumed by consume_stock() function, later. 2244 */ 2245 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2246 { 2247 struct memcg_stock_pcp *stock; 2248 unsigned long flags; 2249 2250 local_irq_save(flags); 2251 2252 stock = this_cpu_ptr(&memcg_stock); 2253 if (stock->cached != memcg) { /* reset if necessary */ 2254 drain_stock(stock); 2255 css_get(&memcg->css); 2256 stock->cached = memcg; 2257 } 2258 stock->nr_pages += nr_pages; 2259 2260 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2261 drain_stock(stock); 2262 2263 local_irq_restore(flags); 2264 } 2265 2266 /* 2267 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2268 * of the hierarchy under it. 2269 */ 2270 static void drain_all_stock(struct mem_cgroup *root_memcg) 2271 { 2272 int cpu, curcpu; 2273 2274 /* If someone's already draining, avoid adding running more workers. */ 2275 if (!mutex_trylock(&percpu_charge_mutex)) 2276 return; 2277 /* 2278 * Notify other cpus that system-wide "drain" is running 2279 * We do not care about races with the cpu hotplug because cpu down 2280 * as well as workers from this path always operate on the local 2281 * per-cpu data. CPU up doesn't touch memcg_stock at all. 
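 *
 * The FLUSHING_CACHED_CHARGE bit tested below keeps us from queueing the
 * same per-cpu work twice while an earlier flush is still pending;
 * drain_local_stock() clears it once the stock has been drained.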
2282 */ 2283 curcpu = get_cpu(); 2284 for_each_online_cpu(cpu) { 2285 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2286 struct mem_cgroup *memcg; 2287 bool flush = false; 2288 2289 rcu_read_lock(); 2290 memcg = stock->cached; 2291 if (memcg && stock->nr_pages && 2292 mem_cgroup_is_descendant(memcg, root_memcg)) 2293 flush = true; 2294 else if (obj_stock_flush_required(stock, root_memcg)) 2295 flush = true; 2296 rcu_read_unlock(); 2297 2298 if (flush && 2299 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2300 if (cpu == curcpu) 2301 drain_local_stock(&stock->work); 2302 else 2303 schedule_work_on(cpu, &stock->work); 2304 } 2305 } 2306 put_cpu(); 2307 mutex_unlock(&percpu_charge_mutex); 2308 } 2309 2310 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2311 { 2312 struct memcg_stock_pcp *stock; 2313 2314 stock = &per_cpu(memcg_stock, cpu); 2315 drain_stock(stock); 2316 2317 return 0; 2318 } 2319 2320 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2321 unsigned int nr_pages, 2322 gfp_t gfp_mask) 2323 { 2324 unsigned long nr_reclaimed = 0; 2325 2326 do { 2327 unsigned long pflags; 2328 2329 if (page_counter_read(&memcg->memory) <= 2330 READ_ONCE(memcg->memory.high)) 2331 continue; 2332 2333 memcg_memory_event(memcg, MEMCG_HIGH); 2334 2335 psi_memstall_enter(&pflags); 2336 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2337 gfp_mask, true); 2338 psi_memstall_leave(&pflags); 2339 } while ((memcg = parent_mem_cgroup(memcg)) && 2340 !mem_cgroup_is_root(memcg)); 2341 2342 return nr_reclaimed; 2343 } 2344 2345 static void high_work_func(struct work_struct *work) 2346 { 2347 struct mem_cgroup *memcg; 2348 2349 memcg = container_of(work, struct mem_cgroup, high_work); 2350 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2351 } 2352 2353 /* 2354 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2355 * enough to still cause a significant slowdown in most cases, while still 2356 * allowing diagnostics and tracing to proceed without becoming stuck. 2357 */ 2358 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2359 2360 /* 2361 * When calculating the delay, we use these either side of the exponentiation to 2362 * maintain precision and scale to a reasonable number of jiffies (see the table 2363 * below. 2364 * 2365 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2366 * overage ratio to a delay. 2367 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2368 * proposed penalty in order to reduce to a reasonable number of jiffies, and 2369 * to produce a reasonable delay curve. 2370 * 2371 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2372 * reasonable delay curve compared to precision-adjusted overage, not 2373 * penalising heavily at first, but still making sure that growth beyond the 2374 * limit penalises misbehaviour cgroups by slowing them down exponentially. 
For 2375 * example, with a high of 100 megabytes: 2376 * 2377 * +-------+------------------------+ 2378 * | usage | time to allocate in ms | 2379 * +-------+------------------------+ 2380 * | 100M | 0 | 2381 * | 101M | 6 | 2382 * | 102M | 25 | 2383 * | 103M | 57 | 2384 * | 104M | 102 | 2385 * | 105M | 159 | 2386 * | 106M | 230 | 2387 * | 107M | 313 | 2388 * | 108M | 409 | 2389 * | 109M | 518 | 2390 * | 110M | 639 | 2391 * | 111M | 774 | 2392 * | 112M | 921 | 2393 * | 113M | 1081 | 2394 * | 114M | 1254 | 2395 * | 115M | 1439 | 2396 * | 116M | 1638 | 2397 * | 117M | 1849 | 2398 * | 118M | 2000 | 2399 * | 119M | 2000 | 2400 * | 120M | 2000 | 2401 * +-------+------------------------+ 2402 */ 2403 #define MEMCG_DELAY_PRECISION_SHIFT 20 2404 #define MEMCG_DELAY_SCALING_SHIFT 14 2405 2406 static u64 calculate_overage(unsigned long usage, unsigned long high) 2407 { 2408 u64 overage; 2409 2410 if (usage <= high) 2411 return 0; 2412 2413 /* 2414 * Prevent division by 0 in overage calculation by acting as if 2415 * it was a threshold of 1 page 2416 */ 2417 high = max(high, 1UL); 2418 2419 overage = usage - high; 2420 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2421 return div64_u64(overage, high); 2422 } 2423 2424 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2425 { 2426 u64 overage, max_overage = 0; 2427 2428 do { 2429 overage = calculate_overage(page_counter_read(&memcg->memory), 2430 READ_ONCE(memcg->memory.high)); 2431 max_overage = max(overage, max_overage); 2432 } while ((memcg = parent_mem_cgroup(memcg)) && 2433 !mem_cgroup_is_root(memcg)); 2434 2435 return max_overage; 2436 } 2437 2438 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2439 { 2440 u64 overage, max_overage = 0; 2441 2442 do { 2443 overage = calculate_overage(page_counter_read(&memcg->swap), 2444 READ_ONCE(memcg->swap.high)); 2445 if (overage) 2446 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2447 max_overage = max(overage, max_overage); 2448 } while ((memcg = parent_mem_cgroup(memcg)) && 2449 !mem_cgroup_is_root(memcg)); 2450 2451 return max_overage; 2452 } 2453 2454 /* 2455 * Get the number of jiffies that we should penalise a mischievous cgroup which 2456 * is exceeding its memory.high by checking both it and its ancestors. 2457 */ 2458 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2459 unsigned int nr_pages, 2460 u64 max_overage) 2461 { 2462 unsigned long penalty_jiffies; 2463 2464 if (!max_overage) 2465 return 0; 2466 2467 /* 2468 * We use overage compared to memory.high to calculate the number of 2469 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2470 * fairly lenient on small overages, and increasingly harsh when the 2471 * memcg in question makes it clear that it has no intention of stopping 2472 * its crazy behaviour, so we exponentially increase the delay based on 2473 * overage amount. 2474 */ 2475 penalty_jiffies = max_overage * max_overage * HZ; 2476 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2477 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2478 2479 /* 2480 * Factor in the task's own contribution to the overage, such that four 2481 * N-sized allocations are throttled approximately the same as one 2482 * 4N-sized allocation. 2483 * 2484 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2485 * larger the current charge patch is than that. 
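 *
 * Rough worked example following from the shifts above: at 10% overage,
 * max_overage is ~0.1 << MEMCG_DELAY_PRECISION_SHIFT, so squaring and
 * shifting yields ~0.1^2 * 2^(40 - 34) * HZ, i.e. about 0.64s for a full
 * batch (the ~639ms row in the table); the scaling below then shrinks or
 * grows that in proportion to nr_pages.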
2486 */ 2487 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2488 } 2489 2490 /* 2491 * Scheduled by try_charge() to be executed from the userland return path 2492 * and reclaims memory over the high limit. 2493 */ 2494 void mem_cgroup_handle_over_high(void) 2495 { 2496 unsigned long penalty_jiffies; 2497 unsigned long pflags; 2498 unsigned long nr_reclaimed; 2499 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2500 int nr_retries = MAX_RECLAIM_RETRIES; 2501 struct mem_cgroup *memcg; 2502 bool in_retry = false; 2503 2504 if (likely(!nr_pages)) 2505 return; 2506 2507 memcg = get_mem_cgroup_from_mm(current->mm); 2508 current->memcg_nr_pages_over_high = 0; 2509 2510 retry_reclaim: 2511 /* 2512 * The allocating task should reclaim at least the batch size, but for 2513 * subsequent retries we only want to do what's necessary to prevent oom 2514 * or breaching resource isolation. 2515 * 2516 * This is distinct from memory.max or page allocator behaviour because 2517 * memory.high is currently batched, whereas memory.max and the page 2518 * allocator run every time an allocation is made. 2519 */ 2520 nr_reclaimed = reclaim_high(memcg, 2521 in_retry ? SWAP_CLUSTER_MAX : nr_pages, 2522 GFP_KERNEL); 2523 2524 /* 2525 * memory.high is breached and reclaim is unable to keep up. Throttle 2526 * allocators proactively to slow down excessive growth. 2527 */ 2528 penalty_jiffies = calculate_high_delay(memcg, nr_pages, 2529 mem_find_max_overage(memcg)); 2530 2531 penalty_jiffies += calculate_high_delay(memcg, nr_pages, 2532 swap_find_max_overage(memcg)); 2533 2534 /* 2535 * Clamp the max delay per usermode return so as to still keep the 2536 * application moving forwards and also permit diagnostics, albeit 2537 * extremely slowly. 2538 */ 2539 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); 2540 2541 /* 2542 * Don't sleep if the amount of jiffies this memcg owes us is so low 2543 * that it's not even worth doing, in an attempt to be nice to those who 2544 * go only a small amount over their memory.high value and maybe haven't 2545 * been aggressively reclaimed enough yet. 2546 */ 2547 if (penalty_jiffies <= HZ / 100) 2548 goto out; 2549 2550 /* 2551 * If reclaim is making forward progress but we're still over 2552 * memory.high, we want to encourage that rather than doing allocator 2553 * throttling. 2554 */ 2555 if (nr_reclaimed || nr_retries--) { 2556 in_retry = true; 2557 goto retry_reclaim; 2558 } 2559 2560 /* 2561 * If we exit early, we're guaranteed to die (since 2562 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't 2563 * need to account for any ill-begotten jiffies to pay them off later. 
2564 */ 2565 psi_memstall_enter(&pflags); 2566 schedule_timeout_killable(penalty_jiffies); 2567 psi_memstall_leave(&pflags); 2568 2569 out: 2570 css_put(&memcg->css); 2571 } 2572 2573 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, 2574 unsigned int nr_pages) 2575 { 2576 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); 2577 int nr_retries = MAX_RECLAIM_RETRIES; 2578 struct mem_cgroup *mem_over_limit; 2579 struct page_counter *counter; 2580 enum oom_status oom_status; 2581 unsigned long nr_reclaimed; 2582 bool passed_oom = false; 2583 bool may_swap = true; 2584 bool drained = false; 2585 unsigned long pflags; 2586 2587 retry: 2588 if (consume_stock(memcg, nr_pages)) 2589 return 0; 2590 2591 if (!do_memsw_account() || 2592 page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2593 if (page_counter_try_charge(&memcg->memory, batch, &counter)) 2594 goto done_restock; 2595 if (do_memsw_account()) 2596 page_counter_uncharge(&memcg->memsw, batch); 2597 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2598 } else { 2599 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2600 may_swap = false; 2601 } 2602 2603 if (batch > nr_pages) { 2604 batch = nr_pages; 2605 goto retry; 2606 } 2607 2608 /* 2609 * Memcg doesn't have a dedicated reserve for atomic 2610 * allocations. But like the global atomic pool, we need to 2611 * put the burden of reclaim on regular allocation requests 2612 * and let these go through as privileged allocations. 2613 */ 2614 if (gfp_mask & __GFP_ATOMIC) 2615 goto force; 2616 2617 /* 2618 * Prevent unbounded recursion when reclaim operations need to 2619 * allocate memory. This might exceed the limits temporarily, 2620 * but we prefer facilitating memory reclaim and getting back 2621 * under the limit over triggering OOM kills in these cases. 2622 */ 2623 if (unlikely(current->flags & PF_MEMALLOC)) 2624 goto force; 2625 2626 if (unlikely(task_in_memcg_oom(current))) 2627 goto nomem; 2628 2629 if (!gfpflags_allow_blocking(gfp_mask)) 2630 goto nomem; 2631 2632 memcg_memory_event(mem_over_limit, MEMCG_MAX); 2633 2634 psi_memstall_enter(&pflags); 2635 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2636 gfp_mask, may_swap); 2637 psi_memstall_leave(&pflags); 2638 2639 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2640 goto retry; 2641 2642 if (!drained) { 2643 drain_all_stock(mem_over_limit); 2644 drained = true; 2645 goto retry; 2646 } 2647 2648 if (gfp_mask & __GFP_NORETRY) 2649 goto nomem; 2650 /* 2651 * Even though the limit is exceeded at this point, reclaim 2652 * may have been able to free some pages. Retry the charge 2653 * before killing the task. 2654 * 2655 * Only for regular pages, though: huge pages are rather 2656 * unlikely to succeed so close to the limit, and we fall back 2657 * to regular pages anyway in case of failure. 2658 */ 2659 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2660 goto retry; 2661 /* 2662 * At task move, charge accounts can be doubly counted. So, it's 2663 * better to wait until the end of task_move if something is going on. 
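 * mem_cgroup_wait_acct_move() waits on the move_charge waitq for the
 * pending move to finish; if it had to wait, the charge is simply
 * retried below.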
2664 */ 2665 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2666 goto retry; 2667 2668 if (nr_retries--) 2669 goto retry; 2670 2671 if (gfp_mask & __GFP_RETRY_MAYFAIL) 2672 goto nomem; 2673 2674 /* Avoid endless loop for tasks bypassed by the oom killer */ 2675 if (passed_oom && task_is_dying()) 2676 goto nomem; 2677 2678 /* 2679 * keep retrying as long as the memcg oom killer is able to make 2680 * a forward progress or bypass the charge if the oom killer 2681 * couldn't make any progress. 2682 */ 2683 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, 2684 get_order(nr_pages * PAGE_SIZE)); 2685 if (oom_status == OOM_SUCCESS) { 2686 passed_oom = true; 2687 nr_retries = MAX_RECLAIM_RETRIES; 2688 goto retry; 2689 } 2690 nomem: 2691 if (!(gfp_mask & __GFP_NOFAIL)) 2692 return -ENOMEM; 2693 force: 2694 /* 2695 * The allocation either can't fail or will lead to more memory 2696 * being freed very soon. Allow memory usage go over the limit 2697 * temporarily by force charging it. 2698 */ 2699 page_counter_charge(&memcg->memory, nr_pages); 2700 if (do_memsw_account()) 2701 page_counter_charge(&memcg->memsw, nr_pages); 2702 2703 return 0; 2704 2705 done_restock: 2706 if (batch > nr_pages) 2707 refill_stock(memcg, batch - nr_pages); 2708 2709 /* 2710 * If the hierarchy is above the normal consumption range, schedule 2711 * reclaim on returning to userland. We can perform reclaim here 2712 * if __GFP_RECLAIM but let's always punt for simplicity and so that 2713 * GFP_KERNEL can consistently be used during reclaim. @memcg is 2714 * not recorded as it most likely matches current's and won't 2715 * change in the meantime. As high limit is checked again before 2716 * reclaim, the cost of mismatch is negligible. 2717 */ 2718 do { 2719 bool mem_high, swap_high; 2720 2721 mem_high = page_counter_read(&memcg->memory) > 2722 READ_ONCE(memcg->memory.high); 2723 swap_high = page_counter_read(&memcg->swap) > 2724 READ_ONCE(memcg->swap.high); 2725 2726 /* Don't bother a random interrupted task */ 2727 if (in_interrupt()) { 2728 if (mem_high) { 2729 schedule_work(&memcg->high_work); 2730 break; 2731 } 2732 continue; 2733 } 2734 2735 if (mem_high || swap_high) { 2736 /* 2737 * The allocating tasks in this cgroup will need to do 2738 * reclaim or be throttled to prevent further growth 2739 * of the memory or swap footprints. 2740 * 2741 * Target some best-effort fairness between the tasks, 2742 * and distribute reclaim work and delay penalties 2743 * based on how much each task is actually allocating. 
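 *
 * The pages recorded here are picked up by
 * mem_cgroup_handle_over_high() on the way back to userland, which
 * performs the actual reclaim and applies the delay penalties computed
 * above.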
2744 */ 2745 current->memcg_nr_pages_over_high += batch; 2746 set_notify_resume(current); 2747 break; 2748 } 2749 } while ((memcg = parent_mem_cgroup(memcg))); 2750 2751 return 0; 2752 } 2753 2754 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2755 unsigned int nr_pages) 2756 { 2757 if (mem_cgroup_is_root(memcg)) 2758 return 0; 2759 2760 return try_charge_memcg(memcg, gfp_mask, nr_pages); 2761 } 2762 2763 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2764 { 2765 if (mem_cgroup_is_root(memcg)) 2766 return; 2767 2768 page_counter_uncharge(&memcg->memory, nr_pages); 2769 if (do_memsw_account()) 2770 page_counter_uncharge(&memcg->memsw, nr_pages); 2771 } 2772 2773 static void commit_charge(struct page *page, struct mem_cgroup *memcg) 2774 { 2775 VM_BUG_ON_PAGE(page_memcg(page), page); 2776 /* 2777 * Any of the following ensures page's memcg stability: 2778 * 2779 * - the page lock 2780 * - LRU isolation 2781 * - lock_page_memcg() 2782 * - exclusive reference 2783 */ 2784 page->memcg_data = (unsigned long)memcg; 2785 } 2786 2787 static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) 2788 { 2789 struct mem_cgroup *memcg; 2790 2791 rcu_read_lock(); 2792 retry: 2793 memcg = obj_cgroup_memcg(objcg); 2794 if (unlikely(!css_tryget(&memcg->css))) 2795 goto retry; 2796 rcu_read_unlock(); 2797 2798 return memcg; 2799 } 2800 2801 #ifdef CONFIG_MEMCG_KMEM 2802 /* 2803 * The allocated objcg pointers array is not accounted directly. 2804 * Moreover, it should not come from DMA buffer and is not readily 2805 * reclaimable. So those GFP bits should be masked off. 2806 */ 2807 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) 2808 2809 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, 2810 gfp_t gfp, bool new_page) 2811 { 2812 unsigned int objects = objs_per_slab_page(s, page); 2813 unsigned long memcg_data; 2814 void *vec; 2815 2816 gfp &= ~OBJCGS_CLEAR_MASK; 2817 vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, 2818 page_to_nid(page)); 2819 if (!vec) 2820 return -ENOMEM; 2821 2822 memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; 2823 if (new_page) { 2824 /* 2825 * If the slab page is brand new and nobody can yet access 2826 * it's memcg_data, no synchronization is required and 2827 * memcg_data can be simply assigned. 2828 */ 2829 page->memcg_data = memcg_data; 2830 } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { 2831 /* 2832 * If the slab page is already in use, somebody can allocate 2833 * and assign obj_cgroups in parallel. In this case the existing 2834 * objcg vector should be reused. 2835 */ 2836 kfree(vec); 2837 return 0; 2838 } 2839 2840 kmemleak_not_leak(vec); 2841 return 0; 2842 } 2843 2844 /* 2845 * Returns a pointer to the memory cgroup to which the kernel object is charged. 2846 * 2847 * A passed kernel object can be a slab object or a generic kernel page, so 2848 * different mechanisms for getting the memory cgroup pointer should be used. 2849 * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller 2850 * can not know for sure how the kernel object is implemented. 2851 * mem_cgroup_from_obj() can be safely used in such cases. 2852 * 2853 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), 2854 * cgroup_mutex, etc. 
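 *
 * A minimal caller sketch (illustrative only, with @p being any slab
 * object or accounted kernel page):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_from_obj(p);
 *	if (memcg)
 *		... inspect memcg while still under RCU ...
 *	rcu_read_unlock();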
2855 */ 2856 struct mem_cgroup *mem_cgroup_from_obj(void *p) 2857 { 2858 struct page *page; 2859 2860 if (mem_cgroup_disabled()) 2861 return NULL; 2862 2863 page = virt_to_head_page(p); 2864 2865 /* 2866 * Slab objects are accounted individually, not per-page. 2867 * Memcg membership data for each individual object is saved in 2868 * the page->obj_cgroups. 2869 */ 2870 if (page_objcgs_check(page)) { 2871 struct obj_cgroup *objcg; 2872 unsigned int off; 2873 2874 off = obj_to_index(page->slab_cache, page, p); 2875 objcg = page_objcgs(page)[off]; 2876 if (objcg) 2877 return obj_cgroup_memcg(objcg); 2878 2879 return NULL; 2880 } 2881 2882 /* 2883 * page_memcg_check() is used here, because page_has_obj_cgroups() 2884 * check above could fail because the object cgroups vector wasn't set 2885 * at that moment, but it can be set concurrently. 2886 * page_memcg_check(page) will guarantee that a proper memory 2887 * cgroup pointer or NULL will be returned. 2888 */ 2889 return page_memcg_check(page); 2890 } 2891 2892 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) 2893 { 2894 struct obj_cgroup *objcg = NULL; 2895 struct mem_cgroup *memcg; 2896 2897 if (memcg_kmem_bypass()) 2898 return NULL; 2899 2900 rcu_read_lock(); 2901 if (unlikely(active_memcg())) 2902 memcg = active_memcg(); 2903 else 2904 memcg = mem_cgroup_from_task(current); 2905 2906 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 2907 objcg = rcu_dereference(memcg->objcg); 2908 if (objcg && obj_cgroup_tryget(objcg)) 2909 break; 2910 objcg = NULL; 2911 } 2912 rcu_read_unlock(); 2913 2914 return objcg; 2915 } 2916 2917 static int memcg_alloc_cache_id(void) 2918 { 2919 int id, size; 2920 int err; 2921 2922 id = ida_simple_get(&memcg_cache_ida, 2923 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2924 if (id < 0) 2925 return id; 2926 2927 if (id < memcg_nr_cache_ids) 2928 return id; 2929 2930 /* 2931 * There's no space for the new id in memcg_caches arrays, 2932 * so we have to grow them. 2933 */ 2934 down_write(&memcg_cache_ids_sem); 2935 2936 size = 2 * (id + 1); 2937 if (size < MEMCG_CACHES_MIN_SIZE) 2938 size = MEMCG_CACHES_MIN_SIZE; 2939 else if (size > MEMCG_CACHES_MAX_SIZE) 2940 size = MEMCG_CACHES_MAX_SIZE; 2941 2942 err = memcg_update_all_list_lrus(size); 2943 if (!err) 2944 memcg_nr_cache_ids = size; 2945 2946 up_write(&memcg_cache_ids_sem); 2947 2948 if (err) { 2949 ida_simple_remove(&memcg_cache_ida, id); 2950 return err; 2951 } 2952 return id; 2953 } 2954 2955 static void memcg_free_cache_id(int id) 2956 { 2957 ida_simple_remove(&memcg_cache_ida, id); 2958 } 2959 2960 /* 2961 * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg 2962 * @objcg: object cgroup to uncharge 2963 * @nr_pages: number of pages to uncharge 2964 */ 2965 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, 2966 unsigned int nr_pages) 2967 { 2968 struct mem_cgroup *memcg; 2969 2970 memcg = get_mem_cgroup_from_objcg(objcg); 2971 2972 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2973 page_counter_uncharge(&memcg->kmem, nr_pages); 2974 refill_stock(memcg, nr_pages); 2975 2976 css_put(&memcg->css); 2977 } 2978 2979 /* 2980 * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg 2981 * @objcg: object cgroup to charge 2982 * @gfp: reclaim mode 2983 * @nr_pages: number of pages to charge 2984 * 2985 * Returns 0 on success, an error code on failure. 
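 *
 * On cgroup1 the dedicated kmem page counter is charged in addition to
 * the regular memory charge taken by try_charge_memcg(); on the default
 * hierarchy kernel memory is accounted in the normal memory counter only.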
2986 */ 2987 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, 2988 unsigned int nr_pages) 2989 { 2990 struct mem_cgroup *memcg; 2991 int ret; 2992 2993 memcg = get_mem_cgroup_from_objcg(objcg); 2994 2995 ret = try_charge_memcg(memcg, gfp, nr_pages); 2996 if (ret) 2997 goto out; 2998 2999 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 3000 page_counter_charge(&memcg->kmem, nr_pages); 3001 out: 3002 css_put(&memcg->css); 3003 3004 return ret; 3005 } 3006 3007 /** 3008 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup 3009 * @page: page to charge 3010 * @gfp: reclaim mode 3011 * @order: allocation order 3012 * 3013 * Returns 0 on success, an error code on failure. 3014 */ 3015 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) 3016 { 3017 struct obj_cgroup *objcg; 3018 int ret = 0; 3019 3020 objcg = get_obj_cgroup_from_current(); 3021 if (objcg) { 3022 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); 3023 if (!ret) { 3024 page->memcg_data = (unsigned long)objcg | 3025 MEMCG_DATA_KMEM; 3026 return 0; 3027 } 3028 obj_cgroup_put(objcg); 3029 } 3030 return ret; 3031 } 3032 3033 /** 3034 * __memcg_kmem_uncharge_page: uncharge a kmem page 3035 * @page: page to uncharge 3036 * @order: allocation order 3037 */ 3038 void __memcg_kmem_uncharge_page(struct page *page, int order) 3039 { 3040 struct obj_cgroup *objcg; 3041 unsigned int nr_pages = 1 << order; 3042 3043 if (!PageMemcgKmem(page)) 3044 return; 3045 3046 objcg = __page_objcg(page); 3047 obj_cgroup_uncharge_pages(objcg, nr_pages); 3048 page->memcg_data = 0; 3049 obj_cgroup_put(objcg); 3050 } 3051 3052 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 3053 enum node_stat_item idx, int nr) 3054 { 3055 unsigned long flags; 3056 struct obj_stock *stock = get_obj_stock(&flags); 3057 int *bytes; 3058 3059 /* 3060 * Save vmstat data in stock and skip vmstat array update unless 3061 * accumulating over a page of vmstat data or when pgdat or idx 3062 * changes. 3063 */ 3064 if (stock->cached_objcg != objcg) { 3065 drain_obj_stock(stock); 3066 obj_cgroup_get(objcg); 3067 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3068 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3069 stock->cached_objcg = objcg; 3070 stock->cached_pgdat = pgdat; 3071 } else if (stock->cached_pgdat != pgdat) { 3072 /* Flush the existing cached vmstat data */ 3073 struct pglist_data *oldpg = stock->cached_pgdat; 3074 3075 if (stock->nr_slab_reclaimable_b) { 3076 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, 3077 stock->nr_slab_reclaimable_b); 3078 stock->nr_slab_reclaimable_b = 0; 3079 } 3080 if (stock->nr_slab_unreclaimable_b) { 3081 mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, 3082 stock->nr_slab_unreclaimable_b); 3083 stock->nr_slab_unreclaimable_b = 0; 3084 } 3085 stock->cached_pgdat = pgdat; 3086 } 3087 3088 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b 3089 : &stock->nr_slab_unreclaimable_b; 3090 /* 3091 * Even for large object >= PAGE_SIZE, the vmstat data will still be 3092 * cached locally at least once before pushing it out. 
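 * Once the cached value grows past a page worth of bytes in either
 * direction, it is flushed to the memcg/lruvec counters through
 * mod_objcg_mlstate() below.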
3093 */ 3094 if (!*bytes) { 3095 *bytes = nr; 3096 nr = 0; 3097 } else { 3098 *bytes += nr; 3099 if (abs(*bytes) > PAGE_SIZE) { 3100 nr = *bytes; 3101 *bytes = 0; 3102 } else { 3103 nr = 0; 3104 } 3105 } 3106 if (nr) 3107 mod_objcg_mlstate(objcg, pgdat, idx, nr); 3108 3109 put_obj_stock(flags); 3110 } 3111 3112 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 3113 { 3114 unsigned long flags; 3115 struct obj_stock *stock = get_obj_stock(&flags); 3116 bool ret = false; 3117 3118 if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { 3119 stock->nr_bytes -= nr_bytes; 3120 ret = true; 3121 } 3122 3123 put_obj_stock(flags); 3124 3125 return ret; 3126 } 3127 3128 static void drain_obj_stock(struct obj_stock *stock) 3129 { 3130 struct obj_cgroup *old = stock->cached_objcg; 3131 3132 if (!old) 3133 return; 3134 3135 if (stock->nr_bytes) { 3136 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3137 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); 3138 3139 if (nr_pages) 3140 obj_cgroup_uncharge_pages(old, nr_pages); 3141 3142 /* 3143 * The leftover is flushed to the centralized per-memcg value. 3144 * On the next attempt to refill obj stock it will be moved 3145 * to a per-cpu stock (probably, on an other CPU), see 3146 * refill_obj_stock(). 3147 * 3148 * How often it's flushed is a trade-off between the memory 3149 * limit enforcement accuracy and potential CPU contention, 3150 * so it might be changed in the future. 3151 */ 3152 atomic_add(nr_bytes, &old->nr_charged_bytes); 3153 stock->nr_bytes = 0; 3154 } 3155 3156 /* 3157 * Flush the vmstat data in current stock 3158 */ 3159 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { 3160 if (stock->nr_slab_reclaimable_b) { 3161 mod_objcg_mlstate(old, stock->cached_pgdat, 3162 NR_SLAB_RECLAIMABLE_B, 3163 stock->nr_slab_reclaimable_b); 3164 stock->nr_slab_reclaimable_b = 0; 3165 } 3166 if (stock->nr_slab_unreclaimable_b) { 3167 mod_objcg_mlstate(old, stock->cached_pgdat, 3168 NR_SLAB_UNRECLAIMABLE_B, 3169 stock->nr_slab_unreclaimable_b); 3170 stock->nr_slab_unreclaimable_b = 0; 3171 } 3172 stock->cached_pgdat = NULL; 3173 } 3174 3175 obj_cgroup_put(old); 3176 stock->cached_objcg = NULL; 3177 } 3178 3179 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 3180 struct mem_cgroup *root_memcg) 3181 { 3182 struct mem_cgroup *memcg; 3183 3184 if (in_task() && stock->task_obj.cached_objcg) { 3185 memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); 3186 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3187 return true; 3188 } 3189 if (stock->irq_obj.cached_objcg) { 3190 memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); 3191 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) 3192 return true; 3193 } 3194 3195 return false; 3196 } 3197 3198 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 3199 bool allow_uncharge) 3200 { 3201 unsigned long flags; 3202 struct obj_stock *stock = get_obj_stock(&flags); 3203 unsigned int nr_pages = 0; 3204 3205 if (stock->cached_objcg != objcg) { /* reset if necessary */ 3206 drain_obj_stock(stock); 3207 obj_cgroup_get(objcg); 3208 stock->cached_objcg = objcg; 3209 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) 3210 ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; 3211 allow_uncharge = true; /* Allow uncharge when objcg changes */ 3212 } 3213 stock->nr_bytes += nr_bytes; 3214 3215 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 3216 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 3217 stock->nr_bytes &= (PAGE_SIZE - 1); 3218 } 3219 3220 put_obj_stock(flags); 3221 3222 if (nr_pages) 3223 obj_cgroup_uncharge_pages(objcg, nr_pages); 3224 } 3225 3226 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3227 { 3228 unsigned int nr_pages, nr_bytes; 3229 int ret; 3230 3231 if (consume_obj_stock(objcg, size)) 3232 return 0; 3233 3234 /* 3235 * In theory, objcg->nr_charged_bytes can have enough 3236 * pre-charged bytes to satisfy the allocation. However, 3237 * flushing objcg->nr_charged_bytes requires two atomic 3238 * operations, and objcg->nr_charged_bytes can't be big. 3239 * The shared objcg->nr_charged_bytes can also become a 3240 * performance bottleneck if all tasks of the same memcg are 3241 * trying to update it. So it's better to ignore it and try 3242 * grab some new pages. The stock's nr_bytes will be flushed to 3243 * objcg->nr_charged_bytes later on when objcg changes. 3244 * 3245 * The stock's nr_bytes may contain enough pre-charged bytes 3246 * to allow one less page from being charged, but we can't rely 3247 * on the pre-charged bytes not being changed outside of 3248 * consume_obj_stock() or refill_obj_stock(). So ignore those 3249 * pre-charged bytes as well when charging pages. To avoid a 3250 * page uncharge right after a page charge, we set the 3251 * allow_uncharge flag to false when calling refill_obj_stock() 3252 * to temporarily allow the pre-charged bytes to exceed the page 3253 * size limit. The maximum reachable value of the pre-charged 3254 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data 3255 * race. 3256 */ 3257 nr_pages = size >> PAGE_SHIFT; 3258 nr_bytes = size & (PAGE_SIZE - 1); 3259 3260 if (nr_bytes) 3261 nr_pages += 1; 3262 3263 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 3264 if (!ret && nr_bytes) 3265 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 3266 3267 return ret; 3268 } 3269 3270 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3271 { 3272 refill_obj_stock(objcg, size, true); 3273 } 3274 3275 #endif /* CONFIG_MEMCG_KMEM */ 3276 3277 /* 3278 * Because page_memcg(head) is not set on tails, set it now. 3279 */ 3280 void split_page_memcg(struct page *head, unsigned int nr) 3281 { 3282 struct mem_cgroup *memcg = page_memcg(head); 3283 int i; 3284 3285 if (mem_cgroup_disabled() || !memcg) 3286 return; 3287 3288 for (i = 1; i < nr; i++) 3289 head[i].memcg_data = head->memcg_data; 3290 3291 if (PageMemcgKmem(head)) 3292 obj_cgroup_get_many(__page_objcg(head), nr - 1); 3293 else 3294 css_get_many(&memcg->css, nr - 1); 3295 } 3296 3297 #ifdef CONFIG_MEMCG_SWAP 3298 /** 3299 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3300 * @entry: swap entry to be moved 3301 * @from: mem_cgroup which the entry is moved from 3302 * @to: mem_cgroup which the entry is moved to 3303 * 3304 * It succeeds only when the swap_cgroup's record for this entry is the same 3305 * as the mem_cgroup's id of @from. 3306 * 3307 * Returns 0 on success, -EINVAL on failure. 3308 * 3309 * The caller must have charged to @to, IOW, called page_counter_charge() about 3310 * both res and memsw, and called css_get(). 
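 *
 * On success the swap_cgroup record is switched to @to's id and the
 * MEMCG_SWAP statistics of both cgroups are adjusted accordingly.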
3311 */ 3312 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3313 struct mem_cgroup *from, struct mem_cgroup *to) 3314 { 3315 unsigned short old_id, new_id; 3316 3317 old_id = mem_cgroup_id(from); 3318 new_id = mem_cgroup_id(to); 3319 3320 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3321 mod_memcg_state(from, MEMCG_SWAP, -1); 3322 mod_memcg_state(to, MEMCG_SWAP, 1); 3323 return 0; 3324 } 3325 return -EINVAL; 3326 } 3327 #else 3328 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3329 struct mem_cgroup *from, struct mem_cgroup *to) 3330 { 3331 return -EINVAL; 3332 } 3333 #endif 3334 3335 static DEFINE_MUTEX(memcg_max_mutex); 3336 3337 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 3338 unsigned long max, bool memsw) 3339 { 3340 bool enlarge = false; 3341 bool drained = false; 3342 int ret; 3343 bool limits_invariant; 3344 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 3345 3346 do { 3347 if (signal_pending(current)) { 3348 ret = -EINTR; 3349 break; 3350 } 3351 3352 mutex_lock(&memcg_max_mutex); 3353 /* 3354 * Make sure that the new limit (memsw or memory limit) doesn't 3355 * break our basic invariant rule memory.max <= memsw.max. 3356 */ 3357 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 3358 max <= memcg->memsw.max; 3359 if (!limits_invariant) { 3360 mutex_unlock(&memcg_max_mutex); 3361 ret = -EINVAL; 3362 break; 3363 } 3364 if (max > counter->max) 3365 enlarge = true; 3366 ret = page_counter_set_max(counter, max); 3367 mutex_unlock(&memcg_max_mutex); 3368 3369 if (!ret) 3370 break; 3371 3372 if (!drained) { 3373 drain_all_stock(memcg); 3374 drained = true; 3375 continue; 3376 } 3377 3378 if (!try_to_free_mem_cgroup_pages(memcg, 1, 3379 GFP_KERNEL, !memsw)) { 3380 ret = -EBUSY; 3381 break; 3382 } 3383 } while (true); 3384 3385 if (!ret && enlarge) 3386 memcg_oom_recover(memcg); 3387 3388 return ret; 3389 } 3390 3391 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 3392 gfp_t gfp_mask, 3393 unsigned long *total_scanned) 3394 { 3395 unsigned long nr_reclaimed = 0; 3396 struct mem_cgroup_per_node *mz, *next_mz = NULL; 3397 unsigned long reclaimed; 3398 int loop = 0; 3399 struct mem_cgroup_tree_per_node *mctz; 3400 unsigned long excess; 3401 unsigned long nr_scanned; 3402 3403 if (order > 0) 3404 return 0; 3405 3406 mctz = soft_limit_tree_node(pgdat->node_id); 3407 3408 /* 3409 * Do not even bother to check the largest node if the root 3410 * is empty. Do it lockless to prevent lock bouncing. Races 3411 * are acceptable as soft limit is best effort anyway. 
3412 */ 3413 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 3414 return 0; 3415 3416 /* 3417 * This loop can run a while, specially if mem_cgroup's continuously 3418 * keep exceeding their soft limit and putting the system under 3419 * pressure 3420 */ 3421 do { 3422 if (next_mz) 3423 mz = next_mz; 3424 else 3425 mz = mem_cgroup_largest_soft_limit_node(mctz); 3426 if (!mz) 3427 break; 3428 3429 nr_scanned = 0; 3430 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 3431 gfp_mask, &nr_scanned); 3432 nr_reclaimed += reclaimed; 3433 *total_scanned += nr_scanned; 3434 spin_lock_irq(&mctz->lock); 3435 __mem_cgroup_remove_exceeded(mz, mctz); 3436 3437 /* 3438 * If we failed to reclaim anything from this memory cgroup 3439 * it is time to move on to the next cgroup 3440 */ 3441 next_mz = NULL; 3442 if (!reclaimed) 3443 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 3444 3445 excess = soft_limit_excess(mz->memcg); 3446 /* 3447 * One school of thought says that we should not add 3448 * back the node to the tree if reclaim returns 0. 3449 * But our reclaim could return 0, simply because due 3450 * to priority we are exposing a smaller subset of 3451 * memory to reclaim from. Consider this as a longer 3452 * term TODO. 3453 */ 3454 /* If excess == 0, no tree ops */ 3455 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3456 spin_unlock_irq(&mctz->lock); 3457 css_put(&mz->memcg->css); 3458 loop++; 3459 /* 3460 * Could not reclaim anything and there are no more 3461 * mem cgroups to try or we seem to be looping without 3462 * reclaiming anything. 3463 */ 3464 if (!nr_reclaimed && 3465 (next_mz == NULL || 3466 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3467 break; 3468 } while (!nr_reclaimed); 3469 if (next_mz) 3470 css_put(&next_mz->memcg->css); 3471 return nr_reclaimed; 3472 } 3473 3474 /* 3475 * Reclaims as many pages from the given memcg as possible. 3476 * 3477 * Caller is responsible for holding css reference for memcg. 3478 */ 3479 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3480 { 3481 int nr_retries = MAX_RECLAIM_RETRIES; 3482 3483 /* we call try-to-free pages for make this cgroup empty */ 3484 lru_add_drain_all(); 3485 3486 drain_all_stock(memcg); 3487 3488 /* try to free all pages in this cgroup */ 3489 while (nr_retries && page_counter_read(&memcg->memory)) { 3490 if (signal_pending(current)) 3491 return -EINTR; 3492 3493 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) 3494 nr_retries--; 3495 } 3496 3497 return 0; 3498 } 3499 3500 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3501 char *buf, size_t nbytes, 3502 loff_t off) 3503 { 3504 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3505 3506 if (mem_cgroup_is_root(memcg)) 3507 return -EINVAL; 3508 return mem_cgroup_force_empty(memcg) ?: nbytes; 3509 } 3510 3511 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3512 struct cftype *cft) 3513 { 3514 return 1; 3515 } 3516 3517 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3518 struct cftype *cft, u64 val) 3519 { 3520 if (val == 1) 3521 return 0; 3522 3523 pr_warn_once("Non-hierarchical mode is deprecated. 
" 3524 "Please report your usecase to linux-mm@kvack.org if you " 3525 "depend on this functionality.\n"); 3526 3527 return -EINVAL; 3528 } 3529 3530 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3531 { 3532 unsigned long val; 3533 3534 if (mem_cgroup_is_root(memcg)) { 3535 mem_cgroup_flush_stats(); 3536 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3537 memcg_page_state(memcg, NR_ANON_MAPPED); 3538 if (swap) 3539 val += memcg_page_state(memcg, MEMCG_SWAP); 3540 } else { 3541 if (!swap) 3542 val = page_counter_read(&memcg->memory); 3543 else 3544 val = page_counter_read(&memcg->memsw); 3545 } 3546 return val; 3547 } 3548 3549 enum { 3550 RES_USAGE, 3551 RES_LIMIT, 3552 RES_MAX_USAGE, 3553 RES_FAILCNT, 3554 RES_SOFT_LIMIT, 3555 }; 3556 3557 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3558 struct cftype *cft) 3559 { 3560 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3561 struct page_counter *counter; 3562 3563 switch (MEMFILE_TYPE(cft->private)) { 3564 case _MEM: 3565 counter = &memcg->memory; 3566 break; 3567 case _MEMSWAP: 3568 counter = &memcg->memsw; 3569 break; 3570 case _KMEM: 3571 counter = &memcg->kmem; 3572 break; 3573 case _TCP: 3574 counter = &memcg->tcpmem; 3575 break; 3576 default: 3577 BUG(); 3578 } 3579 3580 switch (MEMFILE_ATTR(cft->private)) { 3581 case RES_USAGE: 3582 if (counter == &memcg->memory) 3583 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3584 if (counter == &memcg->memsw) 3585 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3586 return (u64)page_counter_read(counter) * PAGE_SIZE; 3587 case RES_LIMIT: 3588 return (u64)counter->max * PAGE_SIZE; 3589 case RES_MAX_USAGE: 3590 return (u64)counter->watermark * PAGE_SIZE; 3591 case RES_FAILCNT: 3592 return counter->failcnt; 3593 case RES_SOFT_LIMIT: 3594 return (u64)memcg->soft_limit * PAGE_SIZE; 3595 default: 3596 BUG(); 3597 } 3598 } 3599 3600 #ifdef CONFIG_MEMCG_KMEM 3601 static int memcg_online_kmem(struct mem_cgroup *memcg) 3602 { 3603 struct obj_cgroup *objcg; 3604 int memcg_id; 3605 3606 if (cgroup_memory_nokmem) 3607 return 0; 3608 3609 BUG_ON(memcg->kmemcg_id >= 0); 3610 3611 memcg_id = memcg_alloc_cache_id(); 3612 if (memcg_id < 0) 3613 return memcg_id; 3614 3615 objcg = obj_cgroup_alloc(); 3616 if (!objcg) { 3617 memcg_free_cache_id(memcg_id); 3618 return -ENOMEM; 3619 } 3620 objcg->memcg = memcg; 3621 rcu_assign_pointer(memcg->objcg, objcg); 3622 3623 static_branch_enable(&memcg_kmem_enabled_key); 3624 3625 memcg->kmemcg_id = memcg_id; 3626 3627 return 0; 3628 } 3629 3630 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3631 { 3632 struct mem_cgroup *parent; 3633 int kmemcg_id; 3634 3635 if (memcg->kmemcg_id == -1) 3636 return; 3637 3638 parent = parent_mem_cgroup(memcg); 3639 if (!parent) 3640 parent = root_mem_cgroup; 3641 3642 memcg_reparent_objcgs(memcg, parent); 3643 3644 kmemcg_id = memcg->kmemcg_id; 3645 BUG_ON(kmemcg_id < 0); 3646 3647 /* 3648 * After we have finished memcg_reparent_objcgs(), all list_lrus 3649 * corresponding to this cgroup are guaranteed to remain empty. 3650 * The ordering is imposed by list_lru_node->lock taken by 3651 * memcg_drain_all_list_lrus(). 
3652 */ 3653 memcg_drain_all_list_lrus(kmemcg_id, parent); 3654 3655 memcg_free_cache_id(kmemcg_id); 3656 memcg->kmemcg_id = -1; 3657 } 3658 #else 3659 static int memcg_online_kmem(struct mem_cgroup *memcg) 3660 { 3661 return 0; 3662 } 3663 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3664 { 3665 } 3666 #endif /* CONFIG_MEMCG_KMEM */ 3667 3668 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3669 { 3670 int ret; 3671 3672 mutex_lock(&memcg_max_mutex); 3673 3674 ret = page_counter_set_max(&memcg->tcpmem, max); 3675 if (ret) 3676 goto out; 3677 3678 if (!memcg->tcpmem_active) { 3679 /* 3680 * The active flag needs to be written after the static_key 3681 * update. This is what guarantees that the socket activation 3682 * function is the last one to run. See mem_cgroup_sk_alloc() 3683 * for details, and note that we don't mark any socket as 3684 * belonging to this memcg until that flag is up. 3685 * 3686 * We need to do this, because static_keys will span multiple 3687 * sites, but we can't control their order. If we mark a socket 3688 * as accounted, but the accounting functions are not patched in 3689 * yet, we'll lose accounting. 3690 * 3691 * We never race with the readers in mem_cgroup_sk_alloc(), 3692 * because when this value change, the code to process it is not 3693 * patched in yet. 3694 */ 3695 static_branch_inc(&memcg_sockets_enabled_key); 3696 memcg->tcpmem_active = true; 3697 } 3698 out: 3699 mutex_unlock(&memcg_max_mutex); 3700 return ret; 3701 } 3702 3703 /* 3704 * The user of this function is... 3705 * RES_LIMIT. 3706 */ 3707 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3708 char *buf, size_t nbytes, loff_t off) 3709 { 3710 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3711 unsigned long nr_pages; 3712 int ret; 3713 3714 buf = strstrip(buf); 3715 ret = page_counter_memparse(buf, "-1", &nr_pages); 3716 if (ret) 3717 return ret; 3718 3719 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3720 case RES_LIMIT: 3721 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3722 ret = -EINVAL; 3723 break; 3724 } 3725 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3726 case _MEM: 3727 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3728 break; 3729 case _MEMSWAP: 3730 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3731 break; 3732 case _KMEM: 3733 /* kmem.limit_in_bytes is deprecated. 
*/ 3734 ret = -EOPNOTSUPP; 3735 break; 3736 case _TCP: 3737 ret = memcg_update_tcp_max(memcg, nr_pages); 3738 break; 3739 } 3740 break; 3741 case RES_SOFT_LIMIT: 3742 memcg->soft_limit = nr_pages; 3743 ret = 0; 3744 break; 3745 } 3746 return ret ?: nbytes; 3747 } 3748 3749 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3750 size_t nbytes, loff_t off) 3751 { 3752 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3753 struct page_counter *counter; 3754 3755 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3756 case _MEM: 3757 counter = &memcg->memory; 3758 break; 3759 case _MEMSWAP: 3760 counter = &memcg->memsw; 3761 break; 3762 case _KMEM: 3763 counter = &memcg->kmem; 3764 break; 3765 case _TCP: 3766 counter = &memcg->tcpmem; 3767 break; 3768 default: 3769 BUG(); 3770 } 3771 3772 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3773 case RES_MAX_USAGE: 3774 page_counter_reset_watermark(counter); 3775 break; 3776 case RES_FAILCNT: 3777 counter->failcnt = 0; 3778 break; 3779 default: 3780 BUG(); 3781 } 3782 3783 return nbytes; 3784 } 3785 3786 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3787 struct cftype *cft) 3788 { 3789 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3790 } 3791 3792 #ifdef CONFIG_MMU 3793 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3794 struct cftype *cft, u64 val) 3795 { 3796 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3797 3798 if (val & ~MOVE_MASK) 3799 return -EINVAL; 3800 3801 /* 3802 * No kind of locking is needed in here, because ->can_attach() will 3803 * check this value once in the beginning of the process, and then carry 3804 * on with stale data. This means that changes to this value will only 3805 * affect task migrations starting after the change. 
3806 */ 3807 memcg->move_charge_at_immigrate = val; 3808 return 0; 3809 } 3810 #else 3811 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3812 struct cftype *cft, u64 val) 3813 { 3814 return -ENOSYS; 3815 } 3816 #endif 3817 3818 #ifdef CONFIG_NUMA 3819 3820 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3821 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3822 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3823 3824 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3825 int nid, unsigned int lru_mask, bool tree) 3826 { 3827 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3828 unsigned long nr = 0; 3829 enum lru_list lru; 3830 3831 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3832 3833 for_each_lru(lru) { 3834 if (!(BIT(lru) & lru_mask)) 3835 continue; 3836 if (tree) 3837 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3838 else 3839 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3840 } 3841 return nr; 3842 } 3843 3844 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3845 unsigned int lru_mask, 3846 bool tree) 3847 { 3848 unsigned long nr = 0; 3849 enum lru_list lru; 3850 3851 for_each_lru(lru) { 3852 if (!(BIT(lru) & lru_mask)) 3853 continue; 3854 if (tree) 3855 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3856 else 3857 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3858 } 3859 return nr; 3860 } 3861 3862 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3863 { 3864 struct numa_stat { 3865 const char *name; 3866 unsigned int lru_mask; 3867 }; 3868 3869 static const struct numa_stat stats[] = { 3870 { "total", LRU_ALL }, 3871 { "file", LRU_ALL_FILE }, 3872 { "anon", LRU_ALL_ANON }, 3873 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3874 }; 3875 const struct numa_stat *stat; 3876 int nid; 3877 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3878 3879 mem_cgroup_flush_stats(); 3880 3881 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3882 seq_printf(m, "%s=%lu", stat->name, 3883 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3884 false)); 3885 for_each_node_state(nid, N_MEMORY) 3886 seq_printf(m, " N%d=%lu", nid, 3887 mem_cgroup_node_nr_lru_pages(memcg, nid, 3888 stat->lru_mask, false)); 3889 seq_putc(m, '\n'); 3890 } 3891 3892 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3893 3894 seq_printf(m, "hierarchical_%s=%lu", stat->name, 3895 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3896 true)); 3897 for_each_node_state(nid, N_MEMORY) 3898 seq_printf(m, " N%d=%lu", nid, 3899 mem_cgroup_node_nr_lru_pages(memcg, nid, 3900 stat->lru_mask, true)); 3901 seq_putc(m, '\n'); 3902 } 3903 3904 return 0; 3905 } 3906 #endif /* CONFIG_NUMA */ 3907 3908 static const unsigned int memcg1_stats[] = { 3909 NR_FILE_PAGES, 3910 NR_ANON_MAPPED, 3911 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3912 NR_ANON_THPS, 3913 #endif 3914 NR_SHMEM, 3915 NR_FILE_MAPPED, 3916 NR_FILE_DIRTY, 3917 NR_WRITEBACK, 3918 MEMCG_SWAP, 3919 }; 3920 3921 static const char *const memcg1_stat_names[] = { 3922 "cache", 3923 "rss", 3924 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3925 "rss_huge", 3926 #endif 3927 "shmem", 3928 "mapped_file", 3929 "dirty", 3930 "writeback", 3931 "swap", 3932 }; 3933 3934 /* Universal VM events cgroup1 shows, original sort order */ 3935 static const unsigned int memcg1_events[] = { 3936 PGPGIN, 3937 PGPGOUT, 3938 PGFAULT, 3939 PGMAJFAULT, 3940 }; 3941 3942 static int memcg_stat_show(struct seq_file *m, void *v) 3943 { 3944 struct 
mem_cgroup *memcg = mem_cgroup_from_seq(m); 3945 unsigned long memory, memsw; 3946 struct mem_cgroup *mi; 3947 unsigned int i; 3948 3949 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 3950 3951 mem_cgroup_flush_stats(); 3952 3953 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3954 unsigned long nr; 3955 3956 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3957 continue; 3958 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 3959 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 3960 } 3961 3962 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3963 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 3964 memcg_events_local(memcg, memcg1_events[i])); 3965 3966 for (i = 0; i < NR_LRU_LISTS; i++) 3967 seq_printf(m, "%s %lu\n", lru_list_name(i), 3968 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 3969 PAGE_SIZE); 3970 3971 /* Hierarchical information */ 3972 memory = memsw = PAGE_COUNTER_MAX; 3973 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 3974 memory = min(memory, READ_ONCE(mi->memory.max)); 3975 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 3976 } 3977 seq_printf(m, "hierarchical_memory_limit %llu\n", 3978 (u64)memory * PAGE_SIZE); 3979 if (do_memsw_account()) 3980 seq_printf(m, "hierarchical_memsw_limit %llu\n", 3981 (u64)memsw * PAGE_SIZE); 3982 3983 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 3984 unsigned long nr; 3985 3986 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 3987 continue; 3988 nr = memcg_page_state(memcg, memcg1_stats[i]); 3989 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 3990 (u64)nr * PAGE_SIZE); 3991 } 3992 3993 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 3994 seq_printf(m, "total_%s %llu\n", 3995 vm_event_name(memcg1_events[i]), 3996 (u64)memcg_events(memcg, memcg1_events[i])); 3997 3998 for (i = 0; i < NR_LRU_LISTS; i++) 3999 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4000 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4001 PAGE_SIZE); 4002 4003 #ifdef CONFIG_DEBUG_VM 4004 { 4005 pg_data_t *pgdat; 4006 struct mem_cgroup_per_node *mz; 4007 unsigned long anon_cost = 0; 4008 unsigned long file_cost = 0; 4009 4010 for_each_online_pgdat(pgdat) { 4011 mz = memcg->nodeinfo[pgdat->node_id]; 4012 4013 anon_cost += mz->lruvec.anon_cost; 4014 file_cost += mz->lruvec.file_cost; 4015 } 4016 seq_printf(m, "anon_cost %lu\n", anon_cost); 4017 seq_printf(m, "file_cost %lu\n", file_cost); 4018 } 4019 #endif 4020 4021 return 0; 4022 } 4023 4024 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4025 struct cftype *cft) 4026 { 4027 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4028 4029 return mem_cgroup_swappiness(memcg); 4030 } 4031 4032 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4033 struct cftype *cft, u64 val) 4034 { 4035 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4036 4037 if (val > 200) 4038 return -EINVAL; 4039 4040 if (!mem_cgroup_is_root(memcg)) 4041 memcg->swappiness = val; 4042 else 4043 vm_swappiness = val; 4044 4045 return 0; 4046 } 4047 4048 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4049 { 4050 struct mem_cgroup_threshold_ary *t; 4051 unsigned long usage; 4052 int i; 4053 4054 rcu_read_lock(); 4055 if (!swap) 4056 t = rcu_dereference(memcg->thresholds.primary); 4057 else 4058 t = rcu_dereference(memcg->memsw_thresholds.primary); 4059 4060 if (!t) 4061 goto unlock; 4062 4063 usage = mem_cgroup_usage(memcg, swap); 4064 4065 /* 4066 * current_threshold points to threshold 
just below or equal to usage. 4067 * If it's not true, a threshold was crossed after last 4068 * call of __mem_cgroup_threshold(). 4069 */ 4070 i = t->current_threshold; 4071 4072 /* 4073 * Iterate backward over array of thresholds starting from 4074 * current_threshold and check if a threshold is crossed. 4075 * If none of thresholds below usage is crossed, we read 4076 * only one element of the array here. 4077 */ 4078 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4079 eventfd_signal(t->entries[i].eventfd, 1); 4080 4081 /* i = current_threshold + 1 */ 4082 i++; 4083 4084 /* 4085 * Iterate forward over array of thresholds starting from 4086 * current_threshold+1 and check if a threshold is crossed. 4087 * If none of thresholds above usage is crossed, we read 4088 * only one element of the array here. 4089 */ 4090 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4091 eventfd_signal(t->entries[i].eventfd, 1); 4092 4093 /* Update current_threshold */ 4094 t->current_threshold = i - 1; 4095 unlock: 4096 rcu_read_unlock(); 4097 } 4098 4099 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4100 { 4101 while (memcg) { 4102 __mem_cgroup_threshold(memcg, false); 4103 if (do_memsw_account()) 4104 __mem_cgroup_threshold(memcg, true); 4105 4106 memcg = parent_mem_cgroup(memcg); 4107 } 4108 } 4109 4110 static int compare_thresholds(const void *a, const void *b) 4111 { 4112 const struct mem_cgroup_threshold *_a = a; 4113 const struct mem_cgroup_threshold *_b = b; 4114 4115 if (_a->threshold > _b->threshold) 4116 return 1; 4117 4118 if (_a->threshold < _b->threshold) 4119 return -1; 4120 4121 return 0; 4122 } 4123 4124 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4125 { 4126 struct mem_cgroup_eventfd_list *ev; 4127 4128 spin_lock(&memcg_oom_lock); 4129 4130 list_for_each_entry(ev, &memcg->oom_notify, list) 4131 eventfd_signal(ev->eventfd, 1); 4132 4133 spin_unlock(&memcg_oom_lock); 4134 return 0; 4135 } 4136 4137 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4138 { 4139 struct mem_cgroup *iter; 4140 4141 for_each_mem_cgroup_tree(iter, memcg) 4142 mem_cgroup_oom_notify_cb(iter); 4143 } 4144 4145 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4146 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4147 { 4148 struct mem_cgroup_thresholds *thresholds; 4149 struct mem_cgroup_threshold_ary *new; 4150 unsigned long threshold; 4151 unsigned long usage; 4152 int i, size, ret; 4153 4154 ret = page_counter_memparse(args, "-1", &threshold); 4155 if (ret) 4156 return ret; 4157 4158 mutex_lock(&memcg->thresholds_lock); 4159 4160 if (type == _MEM) { 4161 thresholds = &memcg->thresholds; 4162 usage = mem_cgroup_usage(memcg, false); 4163 } else if (type == _MEMSWAP) { 4164 thresholds = &memcg->memsw_thresholds; 4165 usage = mem_cgroup_usage(memcg, true); 4166 } else 4167 BUG(); 4168 4169 /* Check if a threshold crossed before adding a new one */ 4170 if (thresholds->primary) 4171 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4172 4173 size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; 4174 4175 /* Allocate memory for new array of thresholds */ 4176 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4177 if (!new) { 4178 ret = -ENOMEM; 4179 goto unlock; 4180 } 4181 new->size = size; 4182 4183 /* Copy thresholds (if any) to new array */ 4184 if (thresholds->primary) 4185 memcpy(new->entries, thresholds->primary->entries, 4186 flex_array_size(new, entries, size - 1)); 4187 4188 /* Add new threshold */ 4189 new->entries[size - 1].eventfd = eventfd; 4190 new->entries[size - 1].threshold = threshold; 4191 4192 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4193 sort(new->entries, size, sizeof(*new->entries), 4194 compare_thresholds, NULL); 4195 4196 /* Find current threshold */ 4197 new->current_threshold = -1; 4198 for (i = 0; i < size; i++) { 4199 if (new->entries[i].threshold <= usage) { 4200 /* 4201 * new->current_threshold will not be used until 4202 * rcu_assign_pointer(), so it's safe to increment 4203 * it here. 4204 */ 4205 ++new->current_threshold; 4206 } else 4207 break; 4208 } 4209 4210 /* Free old spare buffer and save old primary buffer as spare */ 4211 kfree(thresholds->spare); 4212 thresholds->spare = thresholds->primary; 4213 4214 rcu_assign_pointer(thresholds->primary, new); 4215 4216 /* To be sure that nobody uses thresholds */ 4217 synchronize_rcu(); 4218 4219 unlock: 4220 mutex_unlock(&memcg->thresholds_lock); 4221 4222 return ret; 4223 } 4224 4225 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4226 struct eventfd_ctx *eventfd, const char *args) 4227 { 4228 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4229 } 4230 4231 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4232 struct eventfd_ctx *eventfd, const char *args) 4233 { 4234 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4235 } 4236 4237 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4238 struct eventfd_ctx *eventfd, enum res_type type) 4239 { 4240 struct mem_cgroup_thresholds *thresholds; 4241 struct mem_cgroup_threshold_ary *new; 4242 unsigned long usage; 4243 int i, j, size, entries; 4244 4245 mutex_lock(&memcg->thresholds_lock); 4246 4247 if (type == _MEM) { 4248 thresholds = &memcg->thresholds; 4249 usage = mem_cgroup_usage(memcg, false); 4250 } else if (type == _MEMSWAP) { 4251 thresholds = &memcg->memsw_thresholds; 4252 usage = mem_cgroup_usage(memcg, true); 4253 } else 4254 BUG(); 4255 4256 if (!thresholds->primary) 4257 goto unlock; 4258 4259 /* Check if a threshold crossed before removing */ 4260 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4261 4262 /* Calculate new number of threshold */ 4263 size = entries = 0; 4264 for (i = 0; i < thresholds->primary->size; i++) { 4265 if (thresholds->primary->entries[i].eventfd != eventfd) 4266 size++; 4267 else 4268 entries++; 4269 } 4270 4271 new = thresholds->spare; 4272 4273 /* If no items related to eventfd have been cleared, nothing to do */ 4274 if (!entries) 4275 goto unlock; 4276 4277 /* Set thresholds array to NULL if we don't have thresholds */ 4278 if (!size) { 4279 kfree(new); 4280 new = NULL; 4281 goto swap_buffers; 4282 } 4283 4284 new->size = size; 4285 4286 /* Copy thresholds and find current threshold */ 4287 new->current_threshold = -1; 4288 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4289 if (thresholds->primary->entries[i].eventfd == eventfd) 4290 continue; 4291 4292 new->entries[j] = thresholds->primary->entries[i]; 4293 if 
(new->entries[j].threshold <= usage) { 4294 /* 4295 * new->current_threshold will not be used 4296 * until rcu_assign_pointer(), so it's safe to increment 4297 * it here. 4298 */ 4299 ++new->current_threshold; 4300 } 4301 j++; 4302 } 4303 4304 swap_buffers: 4305 /* Swap primary and spare array */ 4306 thresholds->spare = thresholds->primary; 4307 4308 rcu_assign_pointer(thresholds->primary, new); 4309 4310 /* To be sure that nobody uses thresholds */ 4311 synchronize_rcu(); 4312 4313 /* If all events are unregistered, free the spare array */ 4314 if (!new) { 4315 kfree(thresholds->spare); 4316 thresholds->spare = NULL; 4317 } 4318 unlock: 4319 mutex_unlock(&memcg->thresholds_lock); 4320 } 4321 4322 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4323 struct eventfd_ctx *eventfd) 4324 { 4325 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4326 } 4327 4328 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4329 struct eventfd_ctx *eventfd) 4330 { 4331 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4332 } 4333 4334 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4335 struct eventfd_ctx *eventfd, const char *args) 4336 { 4337 struct mem_cgroup_eventfd_list *event; 4338 4339 event = kmalloc(sizeof(*event), GFP_KERNEL); 4340 if (!event) 4341 return -ENOMEM; 4342 4343 spin_lock(&memcg_oom_lock); 4344 4345 event->eventfd = eventfd; 4346 list_add(&event->list, &memcg->oom_notify); 4347 4348 /* already in OOM ? */ 4349 if (memcg->under_oom) 4350 eventfd_signal(eventfd, 1); 4351 spin_unlock(&memcg_oom_lock); 4352 4353 return 0; 4354 } 4355 4356 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4357 struct eventfd_ctx *eventfd) 4358 { 4359 struct mem_cgroup_eventfd_list *ev, *tmp; 4360 4361 spin_lock(&memcg_oom_lock); 4362 4363 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4364 if (ev->eventfd == eventfd) { 4365 list_del(&ev->list); 4366 kfree(ev); 4367 } 4368 } 4369 4370 spin_unlock(&memcg_oom_lock); 4371 } 4372 4373 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4374 { 4375 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4376 4377 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4378 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4379 seq_printf(sf, "oom_kill %lu\n", 4380 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4381 return 0; 4382 } 4383 4384 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4385 struct cftype *cft, u64 val) 4386 { 4387 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4388 4389 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4390 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 4391 return -EINVAL; 4392 4393 memcg->oom_kill_disable = val; 4394 if (!val) 4395 memcg_oom_recover(memcg); 4396 4397 return 0; 4398 } 4399 4400 #ifdef CONFIG_CGROUP_WRITEBACK 4401 4402 #include <trace/events/writeback.h> 4403 4404 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4405 { 4406 return wb_domain_init(&memcg->cgwb_domain, gfp); 4407 } 4408 4409 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4410 { 4411 wb_domain_exit(&memcg->cgwb_domain); 4412 } 4413 4414 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4415 { 4416 wb_domain_size_changed(&memcg->cgwb_domain); 4417 } 4418 4419 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) 4420 { 4421 struct mem_cgroup *memcg = 
mem_cgroup_from_css(wb->memcg_css); 4422 4423 if (!memcg->css.parent) 4424 return NULL; 4425 4426 return &memcg->cgwb_domain; 4427 } 4428 4429 /** 4430 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4431 * @wb: bdi_writeback in question 4432 * @pfilepages: out parameter for number of file pages 4433 * @pheadroom: out parameter for number of allocatable pages according to memcg 4434 * @pdirty: out parameter for number of dirty pages 4435 * @pwriteback: out parameter for number of pages under writeback 4436 * 4437 * Determine the numbers of file, headroom, dirty, and writeback pages in 4438 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4439 * is a bit more involved. 4440 * 4441 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4442 * headroom is calculated as the lowest headroom of itself and the 4443 * ancestors. Note that this doesn't consider the actual amount of 4444 * available memory in the system. The caller should further cap 4445 * *@pheadroom accordingly. 4446 */ 4447 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4448 unsigned long *pheadroom, unsigned long *pdirty, 4449 unsigned long *pwriteback) 4450 { 4451 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4452 struct mem_cgroup *parent; 4453 4454 mem_cgroup_flush_stats(); 4455 4456 *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); 4457 *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); 4458 *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + 4459 memcg_page_state(memcg, NR_ACTIVE_FILE); 4460 4461 *pheadroom = PAGE_COUNTER_MAX; 4462 while ((parent = parent_mem_cgroup(memcg))) { 4463 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4464 READ_ONCE(memcg->memory.high)); 4465 unsigned long used = page_counter_read(&memcg->memory); 4466 4467 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4468 memcg = parent; 4469 } 4470 } 4471 4472 /* 4473 * Foreign dirty flushing 4474 * 4475 * There's an inherent mismatch between memcg and writeback. The former 4476 * tracks ownership per-page while the latter per-inode. This was a 4477 * deliberate design decision because honoring per-page ownership in the 4478 * writeback path is complicated, may lead to higher CPU and IO overheads 4479 * and deemed unnecessary given that write-sharing an inode across 4480 * different cgroups isn't a common use-case. 4481 * 4482 * Combined with inode majority-writer ownership switching, this works well 4483 * enough in most cases but there are some pathological cases. For 4484 * example, let's say there are two cgroups A and B which keep writing to 4485 * different but confined parts of the same inode. B owns the inode and 4486 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4487 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4488 * triggering background writeback. A will be slowed down without a way to 4489 * make writeback of the dirty pages happen. 4490 * 4491 * Conditions like the above can lead to a cgroup getting repeatedly and 4492 * severely throttled after making some progress after each 4493 * dirty_expire_interval while the underlying IO device is almost 4494 * completely idle. 4495 * 4496 * Solving this problem completely requires matching the ownership tracking 4497 * granularities between memcg and writeback in either direction. 
However, 4498 * the more egregious behaviors can be avoided by simply remembering the 4499 * most recent foreign dirtying events and initiating remote flushes on 4500 * them when local writeback isn't enough to keep the memory clean enough. 4501 * 4502 * The following two functions implement such mechanism. When a foreign 4503 * page - a page whose memcg and writeback ownerships don't match - is 4504 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4505 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4506 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4507 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4508 * foreign bdi_writebacks which haven't expired. Both the numbers of 4509 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4510 * limited to MEMCG_CGWB_FRN_CNT. 4511 * 4512 * The mechanism only remembers IDs and doesn't hold any object references. 4513 * As being wrong occasionally doesn't matter, updates and accesses to the 4514 * records are lockless and racy. 4515 */ 4516 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4517 struct bdi_writeback *wb) 4518 { 4519 struct mem_cgroup *memcg = page_memcg(page); 4520 struct memcg_cgwb_frn *frn; 4521 u64 now = get_jiffies_64(); 4522 u64 oldest_at = now; 4523 int oldest = -1; 4524 int i; 4525 4526 trace_track_foreign_dirty(page, wb); 4527 4528 /* 4529 * Pick the slot to use. If there is already a slot for @wb, keep 4530 * using it. If not replace the oldest one which isn't being 4531 * written out. 4532 */ 4533 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4534 frn = &memcg->cgwb_frn[i]; 4535 if (frn->bdi_id == wb->bdi->id && 4536 frn->memcg_id == wb->memcg_css->id) 4537 break; 4538 if (time_before64(frn->at, oldest_at) && 4539 atomic_read(&frn->done.cnt) == 1) { 4540 oldest = i; 4541 oldest_at = frn->at; 4542 } 4543 } 4544 4545 if (i < MEMCG_CGWB_FRN_CNT) { 4546 /* 4547 * Re-using an existing one. Update timestamp lazily to 4548 * avoid making the cacheline hot. We want them to be 4549 * reasonably up-to-date and significantly shorter than 4550 * dirty_expire_interval as that's what expires the record. 4551 * Use the shorter of 1s and dirty_expire_interval / 8. 4552 */ 4553 unsigned long update_intv = 4554 min_t(unsigned long, HZ, 4555 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4556 4557 if (time_before64(frn->at, now - update_intv)) 4558 frn->at = now; 4559 } else if (oldest >= 0) { 4560 /* replace the oldest free one */ 4561 frn = &memcg->cgwb_frn[oldest]; 4562 frn->bdi_id = wb->bdi->id; 4563 frn->memcg_id = wb->memcg_css->id; 4564 frn->at = now; 4565 } 4566 } 4567 4568 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4569 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4570 { 4571 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4572 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4573 u64 now = jiffies_64; 4574 int i; 4575 4576 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4577 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4578 4579 /* 4580 * If the record is older than dirty_expire_interval, 4581 * writeback on it has already started. No need to kick it 4582 * off again. Also, don't start a new one if there's 4583 * already one in flight. 
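*
* Concretely, the check below only issues a flush while
* time_after64(frn->at, now - intv) still holds (the record is younger
* than dirty_expire_interval) and frn->done.cnt == 1 (no flush started
* from this slot is still in flight); frn->at is then cleared so the
* same record is not flushed twice.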
4584 */ 4585 if (time_after64(frn->at, now - intv) && 4586 atomic_read(&frn->done.cnt) == 1) { 4587 frn->at = 0; 4588 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4589 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 4590 WB_REASON_FOREIGN_FLUSH, 4591 &frn->done); 4592 } 4593 } 4594 } 4595 4596 #else /* CONFIG_CGROUP_WRITEBACK */ 4597 4598 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4599 { 4600 return 0; 4601 } 4602 4603 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4604 { 4605 } 4606 4607 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4608 { 4609 } 4610 4611 #endif /* CONFIG_CGROUP_WRITEBACK */ 4612 4613 /* 4614 * DO NOT USE IN NEW FILES. 4615 * 4616 * "cgroup.event_control" implementation. 4617 * 4618 * This is way over-engineered. It tries to support fully configurable 4619 * events for each user. Such level of flexibility is completely 4620 * unnecessary especially in the light of the planned unified hierarchy. 4621 * 4622 * Please deprecate this and replace with something simpler if at all 4623 * possible. 4624 */ 4625 4626 /* 4627 * Unregister event and free resources. 4628 * 4629 * Gets called from workqueue. 4630 */ 4631 static void memcg_event_remove(struct work_struct *work) 4632 { 4633 struct mem_cgroup_event *event = 4634 container_of(work, struct mem_cgroup_event, remove); 4635 struct mem_cgroup *memcg = event->memcg; 4636 4637 remove_wait_queue(event->wqh, &event->wait); 4638 4639 event->unregister_event(memcg, event->eventfd); 4640 4641 /* Notify userspace the event is going away. */ 4642 eventfd_signal(event->eventfd, 1); 4643 4644 eventfd_ctx_put(event->eventfd); 4645 kfree(event); 4646 css_put(&memcg->css); 4647 } 4648 4649 /* 4650 * Gets called on EPOLLHUP on eventfd when user closes it. 4651 * 4652 * Called with wqh->lock held and interrupts disabled. 4653 */ 4654 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4655 int sync, void *key) 4656 { 4657 struct mem_cgroup_event *event = 4658 container_of(wait, struct mem_cgroup_event, wait); 4659 struct mem_cgroup *memcg = event->memcg; 4660 __poll_t flags = key_to_poll(key); 4661 4662 if (flags & EPOLLHUP) { 4663 /* 4664 * If the event has been detached at cgroup removal, we 4665 * can simply return knowing the other side will cleanup 4666 * for us. 4667 * 4668 * We can't race against event freeing since the other 4669 * side will require wqh->lock via remove_wait_queue(), 4670 * which we hold. 4671 */ 4672 spin_lock(&memcg->event_list_lock); 4673 if (!list_empty(&event->list)) { 4674 list_del_init(&event->list); 4675 /* 4676 * We are in atomic context, but cgroup_event_remove() 4677 * may sleep, so we have to call it in workqueue. 4678 */ 4679 schedule_work(&event->remove); 4680 } 4681 spin_unlock(&memcg->event_list_lock); 4682 } 4683 4684 return 0; 4685 } 4686 4687 static void memcg_event_ptable_queue_proc(struct file *file, 4688 wait_queue_head_t *wqh, poll_table *pt) 4689 { 4690 struct mem_cgroup_event *event = 4691 container_of(pt, struct mem_cgroup_event, pt); 4692 4693 event->wqh = wqh; 4694 add_wait_queue(wqh, &event->wait); 4695 } 4696 4697 /* 4698 * DO NOT USE IN NEW FILES. 4699 * 4700 * Parse input and register new cgroup event handler. 4701 * 4702 * Input must be in format '<event_fd> <control_fd> <args>'. 4703 * Interpretation of args is defined by control file implementation. 
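*
* For example (illustrative, from userspace): create an eventfd, open
* the control file of interest and register a usage threshold in bytes:
*
*   efd = eventfd(0, 0);
*   cfd = open(".../memory.usage_in_bytes", O_RDONLY);
*   dprintf(ecfd, "%d %d 1073741824", efd, cfd);
*
* where ecfd is an open fd of this cgroup's cgroup.event_control file;
* the eventfd then becomes readable whenever the threshold is crossed
* (see __mem_cgroup_usage_register_event()).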
4704 */ 4705 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4706 char *buf, size_t nbytes, loff_t off) 4707 { 4708 struct cgroup_subsys_state *css = of_css(of); 4709 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4710 struct mem_cgroup_event *event; 4711 struct cgroup_subsys_state *cfile_css; 4712 unsigned int efd, cfd; 4713 struct fd efile; 4714 struct fd cfile; 4715 const char *name; 4716 char *endp; 4717 int ret; 4718 4719 buf = strstrip(buf); 4720 4721 efd = simple_strtoul(buf, &endp, 10); 4722 if (*endp != ' ') 4723 return -EINVAL; 4724 buf = endp + 1; 4725 4726 cfd = simple_strtoul(buf, &endp, 10); 4727 if ((*endp != ' ') && (*endp != '\0')) 4728 return -EINVAL; 4729 buf = endp + 1; 4730 4731 event = kzalloc(sizeof(*event), GFP_KERNEL); 4732 if (!event) 4733 return -ENOMEM; 4734 4735 event->memcg = memcg; 4736 INIT_LIST_HEAD(&event->list); 4737 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4738 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4739 INIT_WORK(&event->remove, memcg_event_remove); 4740 4741 efile = fdget(efd); 4742 if (!efile.file) { 4743 ret = -EBADF; 4744 goto out_kfree; 4745 } 4746 4747 event->eventfd = eventfd_ctx_fileget(efile.file); 4748 if (IS_ERR(event->eventfd)) { 4749 ret = PTR_ERR(event->eventfd); 4750 goto out_put_efile; 4751 } 4752 4753 cfile = fdget(cfd); 4754 if (!cfile.file) { 4755 ret = -EBADF; 4756 goto out_put_eventfd; 4757 } 4758 4759 /* the process need read permission on control file */ 4760 /* AV: shouldn't we check that it's been opened for read instead? */ 4761 ret = file_permission(cfile.file, MAY_READ); 4762 if (ret < 0) 4763 goto out_put_cfile; 4764 4765 /* 4766 * Determine the event callbacks and set them in @event. This used 4767 * to be done via struct cftype but cgroup core no longer knows 4768 * about these events. The following is crude but the whole thing 4769 * is for compatibility anyway. 4770 * 4771 * DO NOT ADD NEW FILES. 4772 */ 4773 name = cfile.file->f_path.dentry->d_name.name; 4774 4775 if (!strcmp(name, "memory.usage_in_bytes")) { 4776 event->register_event = mem_cgroup_usage_register_event; 4777 event->unregister_event = mem_cgroup_usage_unregister_event; 4778 } else if (!strcmp(name, "memory.oom_control")) { 4779 event->register_event = mem_cgroup_oom_register_event; 4780 event->unregister_event = mem_cgroup_oom_unregister_event; 4781 } else if (!strcmp(name, "memory.pressure_level")) { 4782 event->register_event = vmpressure_register_event; 4783 event->unregister_event = vmpressure_unregister_event; 4784 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4785 event->register_event = memsw_cgroup_usage_register_event; 4786 event->unregister_event = memsw_cgroup_usage_unregister_event; 4787 } else { 4788 ret = -EINVAL; 4789 goto out_put_cfile; 4790 } 4791 4792 /* 4793 * Verify @cfile should belong to @css. Also, remaining events are 4794 * automatically removed on cgroup destruction but the removal is 4795 * asynchronous, so take an extra ref on @css. 
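* The extra reference is dropped again from memcg_event_remove(), via
* css_put(), once the event has been torn down.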
4796 */ 4797 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4798 &memory_cgrp_subsys); 4799 ret = -EINVAL; 4800 if (IS_ERR(cfile_css)) 4801 goto out_put_cfile; 4802 if (cfile_css != css) { 4803 css_put(cfile_css); 4804 goto out_put_cfile; 4805 } 4806 4807 ret = event->register_event(memcg, event->eventfd, buf); 4808 if (ret) 4809 goto out_put_css; 4810 4811 vfs_poll(efile.file, &event->pt); 4812 4813 spin_lock_irq(&memcg->event_list_lock); 4814 list_add(&event->list, &memcg->event_list); 4815 spin_unlock_irq(&memcg->event_list_lock); 4816 4817 fdput(cfile); 4818 fdput(efile); 4819 4820 return nbytes; 4821 4822 out_put_css: 4823 css_put(css); 4824 out_put_cfile: 4825 fdput(cfile); 4826 out_put_eventfd: 4827 eventfd_ctx_put(event->eventfd); 4828 out_put_efile: 4829 fdput(efile); 4830 out_kfree: 4831 kfree(event); 4832 4833 return ret; 4834 } 4835 4836 static struct cftype mem_cgroup_legacy_files[] = { 4837 { 4838 .name = "usage_in_bytes", 4839 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4840 .read_u64 = mem_cgroup_read_u64, 4841 }, 4842 { 4843 .name = "max_usage_in_bytes", 4844 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4845 .write = mem_cgroup_reset, 4846 .read_u64 = mem_cgroup_read_u64, 4847 }, 4848 { 4849 .name = "limit_in_bytes", 4850 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4851 .write = mem_cgroup_write, 4852 .read_u64 = mem_cgroup_read_u64, 4853 }, 4854 { 4855 .name = "soft_limit_in_bytes", 4856 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4857 .write = mem_cgroup_write, 4858 .read_u64 = mem_cgroup_read_u64, 4859 }, 4860 { 4861 .name = "failcnt", 4862 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4863 .write = mem_cgroup_reset, 4864 .read_u64 = mem_cgroup_read_u64, 4865 }, 4866 { 4867 .name = "stat", 4868 .seq_show = memcg_stat_show, 4869 }, 4870 { 4871 .name = "force_empty", 4872 .write = mem_cgroup_force_empty_write, 4873 }, 4874 { 4875 .name = "use_hierarchy", 4876 .write_u64 = mem_cgroup_hierarchy_write, 4877 .read_u64 = mem_cgroup_hierarchy_read, 4878 }, 4879 { 4880 .name = "cgroup.event_control", /* XXX: for compat */ 4881 .write = memcg_write_event_control, 4882 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4883 }, 4884 { 4885 .name = "swappiness", 4886 .read_u64 = mem_cgroup_swappiness_read, 4887 .write_u64 = mem_cgroup_swappiness_write, 4888 }, 4889 { 4890 .name = "move_charge_at_immigrate", 4891 .read_u64 = mem_cgroup_move_charge_read, 4892 .write_u64 = mem_cgroup_move_charge_write, 4893 }, 4894 { 4895 .name = "oom_control", 4896 .seq_show = mem_cgroup_oom_control_read, 4897 .write_u64 = mem_cgroup_oom_control_write, 4898 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4899 }, 4900 { 4901 .name = "pressure_level", 4902 }, 4903 #ifdef CONFIG_NUMA 4904 { 4905 .name = "numa_stat", 4906 .seq_show = memcg_numa_stat_show, 4907 }, 4908 #endif 4909 { 4910 .name = "kmem.limit_in_bytes", 4911 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 4912 .write = mem_cgroup_write, 4913 .read_u64 = mem_cgroup_read_u64, 4914 }, 4915 { 4916 .name = "kmem.usage_in_bytes", 4917 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 4918 .read_u64 = mem_cgroup_read_u64, 4919 }, 4920 { 4921 .name = "kmem.failcnt", 4922 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 4923 .write = mem_cgroup_reset, 4924 .read_u64 = mem_cgroup_read_u64, 4925 }, 4926 { 4927 .name = "kmem.max_usage_in_bytes", 4928 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 4929 .write = mem_cgroup_reset, 4930 .read_u64 = mem_cgroup_read_u64, 4931 }, 4932 #if defined(CONFIG_MEMCG_KMEM) && 
\ 4933 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 4934 { 4935 .name = "kmem.slabinfo", 4936 .seq_show = memcg_slab_show, 4937 }, 4938 #endif 4939 { 4940 .name = "kmem.tcp.limit_in_bytes", 4941 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 4942 .write = mem_cgroup_write, 4943 .read_u64 = mem_cgroup_read_u64, 4944 }, 4945 { 4946 .name = "kmem.tcp.usage_in_bytes", 4947 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 4948 .read_u64 = mem_cgroup_read_u64, 4949 }, 4950 { 4951 .name = "kmem.tcp.failcnt", 4952 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 4953 .write = mem_cgroup_reset, 4954 .read_u64 = mem_cgroup_read_u64, 4955 }, 4956 { 4957 .name = "kmem.tcp.max_usage_in_bytes", 4958 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 4959 .write = mem_cgroup_reset, 4960 .read_u64 = mem_cgroup_read_u64, 4961 }, 4962 { }, /* terminate */ 4963 }; 4964 4965 /* 4966 * Private memory cgroup IDR 4967 * 4968 * Swap-out records and page cache shadow entries need to store memcg 4969 * references in constrained space, so we maintain an ID space that is 4970 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 4971 * memory-controlled cgroups to 64k. 4972 * 4973 * However, there usually are many references to the offline CSS after 4974 * the cgroup has been destroyed, such as page cache or reclaimable 4975 * slab objects, that don't need to hang on to the ID. We want to keep 4976 * those dead CSS from occupying IDs, or we might quickly exhaust the 4977 * relatively small ID space and prevent the creation of new cgroups 4978 * even when there are much fewer than 64k cgroups - possibly none. 4979 * 4980 * Maintain a private 16-bit ID space for memcg, and allow the ID to 4981 * be freed and recycled when it's no longer needed, which is usually 4982 * when the CSS is offlined. 4983 * 4984 * The only exception to that are records of swapped out tmpfs/shmem 4985 * pages that need to be attributed to live ancestors on swapin. But 4986 * those references are manageable from userspace. 4987 */ 4988 4989 static DEFINE_IDR(mem_cgroup_idr); 4990 4991 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 4992 { 4993 if (memcg->id.id > 0) { 4994 idr_remove(&mem_cgroup_idr, memcg->id.id); 4995 memcg->id.id = 0; 4996 } 4997 } 4998 4999 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5000 unsigned int n) 5001 { 5002 refcount_add(n, &memcg->id.ref); 5003 } 5004 5005 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5006 { 5007 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5008 mem_cgroup_id_remove(memcg); 5009 5010 /* Memcg ID pins CSS */ 5011 css_put(&memcg->css); 5012 } 5013 } 5014 5015 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5016 { 5017 mem_cgroup_id_put_many(memcg, 1); 5018 } 5019 5020 /** 5021 * mem_cgroup_from_id - look up a memcg from a memcg id 5022 * @id: the memcg id to look up 5023 * 5024 * Caller must hold rcu_read_lock(). 5025 */ 5026 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5027 { 5028 WARN_ON_ONCE(!rcu_read_lock_held()); 5029 return idr_find(&mem_cgroup_idr, id); 5030 } 5031 5032 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5033 { 5034 struct mem_cgroup_per_node *pn; 5035 int tmp = node; 5036 /* 5037 * This routine is called against possible nodes. 5038 * But it's BUG to call kmalloc() against offline node. 5039 * 5040 * TODO: this routine can waste much memory for nodes which will 5041 * never be onlined. 
It's better to use memory hotplug callback 5042 * function. 5043 */ 5044 if (!node_state(node, N_NORMAL_MEMORY)) 5045 tmp = -1; 5046 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5047 if (!pn) 5048 return 1; 5049 5050 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, 5051 GFP_KERNEL_ACCOUNT); 5052 if (!pn->lruvec_stats_percpu) { 5053 kfree(pn); 5054 return 1; 5055 } 5056 5057 lruvec_init(&pn->lruvec); 5058 pn->usage_in_excess = 0; 5059 pn->on_tree = false; 5060 pn->memcg = memcg; 5061 5062 memcg->nodeinfo[node] = pn; 5063 return 0; 5064 } 5065 5066 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5067 { 5068 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5069 5070 if (!pn) 5071 return; 5072 5073 free_percpu(pn->lruvec_stats_percpu); 5074 kfree(pn); 5075 } 5076 5077 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5078 { 5079 int node; 5080 5081 for_each_node(node) 5082 free_mem_cgroup_per_node_info(memcg, node); 5083 free_percpu(memcg->vmstats_percpu); 5084 kfree(memcg); 5085 } 5086 5087 static void mem_cgroup_free(struct mem_cgroup *memcg) 5088 { 5089 memcg_wb_domain_exit(memcg); 5090 __mem_cgroup_free(memcg); 5091 } 5092 5093 static struct mem_cgroup *mem_cgroup_alloc(void) 5094 { 5095 struct mem_cgroup *memcg; 5096 unsigned int size; 5097 int node; 5098 int __maybe_unused i; 5099 long error = -ENOMEM; 5100 5101 size = sizeof(struct mem_cgroup); 5102 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5103 5104 memcg = kzalloc(size, GFP_KERNEL); 5105 if (!memcg) 5106 return ERR_PTR(error); 5107 5108 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5109 1, MEM_CGROUP_ID_MAX, 5110 GFP_KERNEL); 5111 if (memcg->id.id < 0) { 5112 error = memcg->id.id; 5113 goto fail; 5114 } 5115 5116 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5117 GFP_KERNEL_ACCOUNT); 5118 if (!memcg->vmstats_percpu) 5119 goto fail; 5120 5121 for_each_node(node) 5122 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5123 goto fail; 5124 5125 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5126 goto fail; 5127 5128 INIT_WORK(&memcg->high_work, high_work_func); 5129 INIT_LIST_HEAD(&memcg->oom_notify); 5130 mutex_init(&memcg->thresholds_lock); 5131 spin_lock_init(&memcg->move_lock); 5132 vmpressure_init(&memcg->vmpressure); 5133 INIT_LIST_HEAD(&memcg->event_list); 5134 spin_lock_init(&memcg->event_list_lock); 5135 memcg->socket_pressure = jiffies; 5136 #ifdef CONFIG_MEMCG_KMEM 5137 memcg->kmemcg_id = -1; 5138 INIT_LIST_HEAD(&memcg->objcg_list); 5139 #endif 5140 #ifdef CONFIG_CGROUP_WRITEBACK 5141 INIT_LIST_HEAD(&memcg->cgwb_list); 5142 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5143 memcg->cgwb_frn[i].done = 5144 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5145 #endif 5146 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5147 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5148 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5149 memcg->deferred_split_queue.split_queue_len = 0; 5150 #endif 5151 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5152 return memcg; 5153 fail: 5154 mem_cgroup_id_remove(memcg); 5155 __mem_cgroup_free(memcg); 5156 return ERR_PTR(error); 5157 } 5158 5159 static struct cgroup_subsys_state * __ref 5160 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5161 { 5162 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5163 struct mem_cgroup *memcg, *old_memcg; 5164 long error = -ENOMEM; 5165 5166 old_memcg = set_active_memcg(parent); 5167 memcg = mem_cgroup_alloc(); 5168 
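/*
 * mem_cgroup_alloc() above ran with @parent as the active memcg so that
 * its GFP_KERNEL_ACCOUNT allocations (vmstats_percpu and the per-node
 * lruvec stats) are accounted to the parent cgroup; restore the previous
 * active memcg before going on.
 */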
set_active_memcg(old_memcg); 5169 if (IS_ERR(memcg)) 5170 return ERR_CAST(memcg); 5171 5172 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5173 memcg->soft_limit = PAGE_COUNTER_MAX; 5174 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5175 if (parent) { 5176 memcg->swappiness = mem_cgroup_swappiness(parent); 5177 memcg->oom_kill_disable = parent->oom_kill_disable; 5178 5179 page_counter_init(&memcg->memory, &parent->memory); 5180 page_counter_init(&memcg->swap, &parent->swap); 5181 page_counter_init(&memcg->kmem, &parent->kmem); 5182 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5183 } else { 5184 page_counter_init(&memcg->memory, NULL); 5185 page_counter_init(&memcg->swap, NULL); 5186 page_counter_init(&memcg->kmem, NULL); 5187 page_counter_init(&memcg->tcpmem, NULL); 5188 5189 root_mem_cgroup = memcg; 5190 return &memcg->css; 5191 } 5192 5193 /* The following stuff does not apply to the root */ 5194 error = memcg_online_kmem(memcg); 5195 if (error) 5196 goto fail; 5197 5198 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5199 static_branch_inc(&memcg_sockets_enabled_key); 5200 5201 return &memcg->css; 5202 fail: 5203 mem_cgroup_id_remove(memcg); 5204 mem_cgroup_free(memcg); 5205 return ERR_PTR(error); 5206 } 5207 5208 static int mem_cgroup_css_online(struct cgroup_subsys_state *css) 5209 { 5210 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5211 5212 /* 5213 * A memcg must be visible for expand_shrinker_info() 5214 * by the time the maps are allocated. So, we allocate maps 5215 * here, when for_each_mem_cgroup() can't skip it. 5216 */ 5217 if (alloc_shrinker_info(memcg)) { 5218 mem_cgroup_id_remove(memcg); 5219 return -ENOMEM; 5220 } 5221 5222 /* Online state pins memcg ID, memcg ID pins CSS */ 5223 refcount_set(&memcg->id.ref, 1); 5224 css_get(css); 5225 5226 if (unlikely(mem_cgroup_is_root(memcg))) 5227 queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 5228 2UL*HZ); 5229 return 0; 5230 } 5231 5232 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5233 { 5234 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5235 struct mem_cgroup_event *event, *tmp; 5236 5237 /* 5238 * Unregister events and notify userspace. 5239 * Notify userspace about cgroup removing only after rmdir of cgroup 5240 * directory to avoid race between userspace and kernelspace. 
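* Besides unregistering the legacy events below, offlining clears the
* min/low protections, offlines kmem and writeback state, reparents the
* deferred shrinker work, drains the per-cpu charge stock, and finally
* drops the ID reference taken when the css went online.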
5241 */ 5242 spin_lock_irq(&memcg->event_list_lock); 5243 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5244 list_del_init(&event->list); 5245 schedule_work(&event->remove); 5246 } 5247 spin_unlock_irq(&memcg->event_list_lock); 5248 5249 page_counter_set_min(&memcg->memory, 0); 5250 page_counter_set_low(&memcg->memory, 0); 5251 5252 memcg_offline_kmem(memcg); 5253 reparent_shrinker_deferred(memcg); 5254 wb_memcg_offline(memcg); 5255 5256 drain_all_stock(memcg); 5257 5258 mem_cgroup_id_put(memcg); 5259 } 5260 5261 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5262 { 5263 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5264 5265 invalidate_reclaim_iterators(memcg); 5266 } 5267 5268 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5269 { 5270 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5271 int __maybe_unused i; 5272 5273 #ifdef CONFIG_CGROUP_WRITEBACK 5274 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5275 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5276 #endif 5277 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5278 static_branch_dec(&memcg_sockets_enabled_key); 5279 5280 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5281 static_branch_dec(&memcg_sockets_enabled_key); 5282 5283 vmpressure_cleanup(&memcg->vmpressure); 5284 cancel_work_sync(&memcg->high_work); 5285 mem_cgroup_remove_from_trees(memcg); 5286 free_shrinker_info(memcg); 5287 5288 /* Need to offline kmem if online_css() fails */ 5289 memcg_offline_kmem(memcg); 5290 mem_cgroup_free(memcg); 5291 } 5292 5293 /** 5294 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5295 * @css: the target css 5296 * 5297 * Reset the states of the mem_cgroup associated with @css. This is 5298 * invoked when the userland requests disabling on the default hierarchy 5299 * but the memcg is pinned through dependency. The memcg should stop 5300 * applying policies and should revert to the vanilla state as it may be 5301 * made visible again. 5302 * 5303 * The current implementation only resets the essential configurations. 5304 * This needs to be expanded to cover all the visible parts. 5305 */ 5306 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5307 { 5308 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5309 5310 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5311 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5312 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5313 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5314 page_counter_set_min(&memcg->memory, 0); 5315 page_counter_set_low(&memcg->memory, 0); 5316 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5317 memcg->soft_limit = PAGE_COUNTER_MAX; 5318 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5319 memcg_wb_domain_size_changed(memcg); 5320 } 5321 5322 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) 5323 { 5324 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5325 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5326 struct memcg_vmstats_percpu *statc; 5327 long delta, v; 5328 int i, nid; 5329 5330 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); 5331 5332 for (i = 0; i < MEMCG_NR_STAT; i++) { 5333 /* 5334 * Collect the aggregated propagation counts of groups 5335 * below us. We're in a per-cpu loop here and this is 5336 * a global counter, so the first cycle will get them. 
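*
* Roughly, for each counter i this loop does:
*
*   delta  = pending[i];                        (queued by our children)
*   delta += statc->state[i] - state_prev[i];   (this CPU's change)
*   vmstats.state[i] += delta;
*   parent->pending[i] += delta;                (queued for the parent)
*
* so each flush folds per-cpu changes into this memcg's totals and
* queues the same delta for the next level up.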
5337 */ 5338 delta = memcg->vmstats.state_pending[i]; 5339 if (delta) 5340 memcg->vmstats.state_pending[i] = 0; 5341 5342 /* Add CPU changes on this level since the last flush */ 5343 v = READ_ONCE(statc->state[i]); 5344 if (v != statc->state_prev[i]) { 5345 delta += v - statc->state_prev[i]; 5346 statc->state_prev[i] = v; 5347 } 5348 5349 if (!delta) 5350 continue; 5351 5352 /* Aggregate counts on this level and propagate upwards */ 5353 memcg->vmstats.state[i] += delta; 5354 if (parent) 5355 parent->vmstats.state_pending[i] += delta; 5356 } 5357 5358 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 5359 delta = memcg->vmstats.events_pending[i]; 5360 if (delta) 5361 memcg->vmstats.events_pending[i] = 0; 5362 5363 v = READ_ONCE(statc->events[i]); 5364 if (v != statc->events_prev[i]) { 5365 delta += v - statc->events_prev[i]; 5366 statc->events_prev[i] = v; 5367 } 5368 5369 if (!delta) 5370 continue; 5371 5372 memcg->vmstats.events[i] += delta; 5373 if (parent) 5374 parent->vmstats.events_pending[i] += delta; 5375 } 5376 5377 for_each_node_state(nid, N_MEMORY) { 5378 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; 5379 struct mem_cgroup_per_node *ppn = NULL; 5380 struct lruvec_stats_percpu *lstatc; 5381 5382 if (parent) 5383 ppn = parent->nodeinfo[nid]; 5384 5385 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); 5386 5387 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 5388 delta = pn->lruvec_stats.state_pending[i]; 5389 if (delta) 5390 pn->lruvec_stats.state_pending[i] = 0; 5391 5392 v = READ_ONCE(lstatc->state[i]); 5393 if (v != lstatc->state_prev[i]) { 5394 delta += v - lstatc->state_prev[i]; 5395 lstatc->state_prev[i] = v; 5396 } 5397 5398 if (!delta) 5399 continue; 5400 5401 pn->lruvec_stats.state[i] += delta; 5402 if (ppn) 5403 ppn->lruvec_stats.state_pending[i] += delta; 5404 } 5405 } 5406 } 5407 5408 #ifdef CONFIG_MMU 5409 /* Handlers for move charge at task migration. 
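* (Charge moving is a legacy, cgroup-v1 only feature: on the default
* hierarchy mem_cgroup_can_attach() returns early and none of this
* machinery runs.)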
*/ 5410 static int mem_cgroup_do_precharge(unsigned long count) 5411 { 5412 int ret; 5413 5414 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5415 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5416 if (!ret) { 5417 mc.precharge += count; 5418 return ret; 5419 } 5420 5421 /* Try charges one by one with reclaim, but do not retry */ 5422 while (count--) { 5423 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5424 if (ret) 5425 return ret; 5426 mc.precharge++; 5427 cond_resched(); 5428 } 5429 return 0; 5430 } 5431 5432 union mc_target { 5433 struct page *page; 5434 swp_entry_t ent; 5435 }; 5436 5437 enum mc_target_type { 5438 MC_TARGET_NONE = 0, 5439 MC_TARGET_PAGE, 5440 MC_TARGET_SWAP, 5441 MC_TARGET_DEVICE, 5442 }; 5443 5444 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5445 unsigned long addr, pte_t ptent) 5446 { 5447 struct page *page = vm_normal_page(vma, addr, ptent); 5448 5449 if (!page || !page_mapped(page)) 5450 return NULL; 5451 if (PageAnon(page)) { 5452 if (!(mc.flags & MOVE_ANON)) 5453 return NULL; 5454 } else { 5455 if (!(mc.flags & MOVE_FILE)) 5456 return NULL; 5457 } 5458 if (!get_page_unless_zero(page)) 5459 return NULL; 5460 5461 return page; 5462 } 5463 5464 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5465 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5466 pte_t ptent, swp_entry_t *entry) 5467 { 5468 struct page *page = NULL; 5469 swp_entry_t ent = pte_to_swp_entry(ptent); 5470 5471 if (!(mc.flags & MOVE_ANON)) 5472 return NULL; 5473 5474 /* 5475 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 5476 * a device and because they are not accessible by CPU they are store 5477 * as special swap entry in the CPU page table. 5478 */ 5479 if (is_device_private_entry(ent)) { 5480 page = pfn_swap_entry_to_page(ent); 5481 /* 5482 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 5483 * a refcount of 1 when free (unlike normal page) 5484 */ 5485 if (!page_ref_add_unless(page, 1, 1)) 5486 return NULL; 5487 return page; 5488 } 5489 5490 if (non_swap_entry(ent)) 5491 return NULL; 5492 5493 /* 5494 * Because lookup_swap_cache() updates some statistics counter, 5495 * we call find_get_page() with swapper_space directly. 5496 */ 5497 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5498 entry->val = ent.val; 5499 5500 return page; 5501 } 5502 #else 5503 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5504 pte_t ptent, swp_entry_t *entry) 5505 { 5506 return NULL; 5507 } 5508 #endif 5509 5510 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5511 unsigned long addr, pte_t ptent) 5512 { 5513 if (!vma->vm_file) /* anonymous vma */ 5514 return NULL; 5515 if (!(mc.flags & MOVE_FILE)) 5516 return NULL; 5517 5518 /* page is moved even if it's not RSS of this task(page-faulted). */ 5519 /* shmem/tmpfs may report page out on swap: account for that too. */ 5520 return find_get_incore_page(vma->vm_file->f_mapping, 5521 linear_page_index(vma, addr)); 5522 } 5523 5524 /** 5525 * mem_cgroup_move_account - move account of the page 5526 * @page: the page 5527 * @compound: charge the page as compound or small page 5528 * @from: mem_cgroup which the page is moved from. 5529 * @to: mem_cgroup which the page is moved to. @from != @to. 5530 * 5531 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 
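* It returns 0 on success, -EBUSY if the page lock cannot be acquired,
* or -EINVAL if the page is not charged to @from.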
5532 * 5533 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5534 * from old cgroup. 5535 */ 5536 static int mem_cgroup_move_account(struct page *page, 5537 bool compound, 5538 struct mem_cgroup *from, 5539 struct mem_cgroup *to) 5540 { 5541 struct lruvec *from_vec, *to_vec; 5542 struct pglist_data *pgdat; 5543 unsigned int nr_pages = compound ? thp_nr_pages(page) : 1; 5544 int ret; 5545 5546 VM_BUG_ON(from == to); 5547 VM_BUG_ON_PAGE(PageLRU(page), page); 5548 VM_BUG_ON(compound && !PageTransHuge(page)); 5549 5550 /* 5551 * Prevent mem_cgroup_migrate() from looking at 5552 * page's memory cgroup of its source page while we change it. 5553 */ 5554 ret = -EBUSY; 5555 if (!trylock_page(page)) 5556 goto out; 5557 5558 ret = -EINVAL; 5559 if (page_memcg(page) != from) 5560 goto out_unlock; 5561 5562 pgdat = page_pgdat(page); 5563 from_vec = mem_cgroup_lruvec(from, pgdat); 5564 to_vec = mem_cgroup_lruvec(to, pgdat); 5565 5566 lock_page_memcg(page); 5567 5568 if (PageAnon(page)) { 5569 if (page_mapped(page)) { 5570 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 5571 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 5572 if (PageTransHuge(page)) { 5573 __mod_lruvec_state(from_vec, NR_ANON_THPS, 5574 -nr_pages); 5575 __mod_lruvec_state(to_vec, NR_ANON_THPS, 5576 nr_pages); 5577 } 5578 } 5579 } else { 5580 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 5581 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 5582 5583 if (PageSwapBacked(page)) { 5584 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 5585 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 5586 } 5587 5588 if (page_mapped(page)) { 5589 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 5590 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 5591 } 5592 5593 if (PageDirty(page)) { 5594 struct address_space *mapping = page_mapping(page); 5595 5596 if (mapping_can_writeback(mapping)) { 5597 __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 5598 -nr_pages); 5599 __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 5600 nr_pages); 5601 } 5602 } 5603 } 5604 5605 if (PageWriteback(page)) { 5606 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 5607 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 5608 } 5609 5610 /* 5611 * All state has been migrated, let's switch to the new memcg. 5612 * 5613 * It is safe to change page's memcg here because the page 5614 * is referenced, charged, isolated, and locked: we can't race 5615 * with (un)charging, migration, LRU putback, or anything else 5616 * that would rely on a stable page's memory cgroup. 5617 * 5618 * Note that lock_page_memcg is a memcg lock, not a page lock, 5619 * to save space. As soon as we switch page's memory cgroup to a 5620 * new memcg that isn't locked, the above state can change 5621 * concurrently again. Make sure we're truly done with it. 
5622 */ 5623 smp_mb(); 5624 5625 css_get(&to->css); 5626 css_put(&from->css); 5627 5628 page->memcg_data = (unsigned long)to; 5629 5630 __unlock_page_memcg(from); 5631 5632 ret = 0; 5633 5634 local_irq_disable(); 5635 mem_cgroup_charge_statistics(to, page, nr_pages); 5636 memcg_check_events(to, page); 5637 mem_cgroup_charge_statistics(from, page, -nr_pages); 5638 memcg_check_events(from, page); 5639 local_irq_enable(); 5640 out_unlock: 5641 unlock_page(page); 5642 out: 5643 return ret; 5644 } 5645 5646 /** 5647 * get_mctgt_type - get target type of moving charge 5648 * @vma: the vma the pte to be checked belongs 5649 * @addr: the address corresponding to the pte to be checked 5650 * @ptent: the pte to be checked 5651 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5652 * 5653 * Returns 5654 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5655 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5656 * move charge. if @target is not NULL, the page is stored in target->page 5657 * with extra refcnt got(Callers should handle it). 5658 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5659 * target for charge migration. if @target is not NULL, the entry is stored 5660 * in target->ent. 5661 * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE 5662 * (so ZONE_DEVICE page and thus not on the lru). 5663 * For now we such page is charge like a regular page would be as for all 5664 * intent and purposes it is just special memory taking the place of a 5665 * regular page. 5666 * 5667 * See Documentations/vm/hmm.txt and include/linux/hmm.h 5668 * 5669 * Called with pte lock held. 5670 */ 5671 5672 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5673 unsigned long addr, pte_t ptent, union mc_target *target) 5674 { 5675 struct page *page = NULL; 5676 enum mc_target_type ret = MC_TARGET_NONE; 5677 swp_entry_t ent = { .val = 0 }; 5678 5679 if (pte_present(ptent)) 5680 page = mc_handle_present_pte(vma, addr, ptent); 5681 else if (is_swap_pte(ptent)) 5682 page = mc_handle_swap_pte(vma, ptent, &ent); 5683 else if (pte_none(ptent)) 5684 page = mc_handle_file_pte(vma, addr, ptent); 5685 5686 if (!page && !ent.val) 5687 return ret; 5688 if (page) { 5689 /* 5690 * Do only loose check w/o serialization. 5691 * mem_cgroup_move_account() checks the page is valid or 5692 * not under LRU exclusion. 5693 */ 5694 if (page_memcg(page) == mc.from) { 5695 ret = MC_TARGET_PAGE; 5696 if (is_device_private_page(page)) 5697 ret = MC_TARGET_DEVICE; 5698 if (target) 5699 target->page = page; 5700 } 5701 if (!ret || !target) 5702 put_page(page); 5703 } 5704 /* 5705 * There is a swap entry and a page doesn't exist or isn't charged. 5706 * But we cannot move a tail-page in a THP. 5707 */ 5708 if (ent.val && !ret && (!page || !PageTransCompound(page)) && 5709 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5710 ret = MC_TARGET_SWAP; 5711 if (target) 5712 target->ent = ent; 5713 } 5714 return ret; 5715 } 5716 5717 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5718 /* 5719 * We don't consider PMD mapped swapping or file mapped pages because THP does 5720 * not support them for now. 5721 * Caller should make sure that pmd_trans_huge(pmd) is true. 
5722 */ 5723 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5724 unsigned long addr, pmd_t pmd, union mc_target *target) 5725 { 5726 struct page *page = NULL; 5727 enum mc_target_type ret = MC_TARGET_NONE; 5728 5729 if (unlikely(is_swap_pmd(pmd))) { 5730 VM_BUG_ON(thp_migration_supported() && 5731 !is_pmd_migration_entry(pmd)); 5732 return ret; 5733 } 5734 page = pmd_page(pmd); 5735 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5736 if (!(mc.flags & MOVE_ANON)) 5737 return ret; 5738 if (page_memcg(page) == mc.from) { 5739 ret = MC_TARGET_PAGE; 5740 if (target) { 5741 get_page(page); 5742 target->page = page; 5743 } 5744 } 5745 return ret; 5746 } 5747 #else 5748 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5749 unsigned long addr, pmd_t pmd, union mc_target *target) 5750 { 5751 return MC_TARGET_NONE; 5752 } 5753 #endif 5754 5755 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5756 unsigned long addr, unsigned long end, 5757 struct mm_walk *walk) 5758 { 5759 struct vm_area_struct *vma = walk->vma; 5760 pte_t *pte; 5761 spinlock_t *ptl; 5762 5763 ptl = pmd_trans_huge_lock(pmd, vma); 5764 if (ptl) { 5765 /* 5766 * Note their can not be MC_TARGET_DEVICE for now as we do not 5767 * support transparent huge page with MEMORY_DEVICE_PRIVATE but 5768 * this might change. 5769 */ 5770 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5771 mc.precharge += HPAGE_PMD_NR; 5772 spin_unlock(ptl); 5773 return 0; 5774 } 5775 5776 if (pmd_trans_unstable(pmd)) 5777 return 0; 5778 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5779 for (; addr != end; pte++, addr += PAGE_SIZE) 5780 if (get_mctgt_type(vma, addr, *pte, NULL)) 5781 mc.precharge++; /* increment precharge temporarily */ 5782 pte_unmap_unlock(pte - 1, ptl); 5783 cond_resched(); 5784 5785 return 0; 5786 } 5787 5788 static const struct mm_walk_ops precharge_walk_ops = { 5789 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5790 }; 5791 5792 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5793 { 5794 unsigned long precharge; 5795 5796 mmap_read_lock(mm); 5797 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); 5798 mmap_read_unlock(mm); 5799 5800 precharge = mc.precharge; 5801 mc.precharge = 0; 5802 5803 return precharge; 5804 } 5805 5806 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5807 { 5808 unsigned long precharge = mem_cgroup_count_precharge(mm); 5809 5810 VM_BUG_ON(mc.moving_task); 5811 mc.moving_task = current; 5812 return mem_cgroup_do_precharge(precharge); 5813 } 5814 5815 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5816 static void __mem_cgroup_clear_mc(void) 5817 { 5818 struct mem_cgroup *from = mc.from; 5819 struct mem_cgroup *to = mc.to; 5820 5821 /* we must uncharge all the leftover precharges from mc.to */ 5822 if (mc.precharge) { 5823 cancel_charge(mc.to, mc.precharge); 5824 mc.precharge = 0; 5825 } 5826 /* 5827 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5828 * we must uncharge here. 
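* mc.moved_charge counts pages whose memcg accounting was switched by
* mem_cgroup_move_account() while the page_counter charge was left on
* mc.from (see the "we uncharge from mc.from later" note in
* mem_cgroup_move_charge_pte_range()); cancel_charge() below settles it.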
5829 */ 5830 if (mc.moved_charge) { 5831 cancel_charge(mc.from, mc.moved_charge); 5832 mc.moved_charge = 0; 5833 } 5834 /* we must fixup refcnts and charges */ 5835 if (mc.moved_swap) { 5836 /* uncharge swap account from the old cgroup */ 5837 if (!mem_cgroup_is_root(mc.from)) 5838 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5839 5840 mem_cgroup_id_put_many(mc.from, mc.moved_swap); 5841 5842 /* 5843 * we charged both to->memory and to->memsw, so we 5844 * should uncharge to->memory. 5845 */ 5846 if (!mem_cgroup_is_root(mc.to)) 5847 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5848 5849 mc.moved_swap = 0; 5850 } 5851 memcg_oom_recover(from); 5852 memcg_oom_recover(to); 5853 wake_up_all(&mc.waitq); 5854 } 5855 5856 static void mem_cgroup_clear_mc(void) 5857 { 5858 struct mm_struct *mm = mc.mm; 5859 5860 /* 5861 * we must clear moving_task before waking up waiters at the end of 5862 * task migration. 5863 */ 5864 mc.moving_task = NULL; 5865 __mem_cgroup_clear_mc(); 5866 spin_lock(&mc.lock); 5867 mc.from = NULL; 5868 mc.to = NULL; 5869 mc.mm = NULL; 5870 spin_unlock(&mc.lock); 5871 5872 mmput(mm); 5873 } 5874 5875 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 5876 { 5877 struct cgroup_subsys_state *css; 5878 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 5879 struct mem_cgroup *from; 5880 struct task_struct *leader, *p; 5881 struct mm_struct *mm; 5882 unsigned long move_flags; 5883 int ret = 0; 5884 5885 /* charge immigration isn't supported on the default hierarchy */ 5886 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 5887 return 0; 5888 5889 /* 5890 * Multi-process migrations only happen on the default hierarchy 5891 * where charge immigration is not used. Perform charge 5892 * immigration if @tset contains a leader and whine if there are 5893 * multiple. 5894 */ 5895 p = NULL; 5896 cgroup_taskset_for_each_leader(leader, css, tset) { 5897 WARN_ON_ONCE(p); 5898 p = leader; 5899 memcg = mem_cgroup_from_css(css); 5900 } 5901 if (!p) 5902 return 0; 5903 5904 /* 5905 * We are now committed to this value whatever it is. Changes in this 5906 * tunable will only affect upcoming migrations, not the current one. 5907 * So we need to save it, and keep it going. 
5908 */ 5909 move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 5910 if (!move_flags) 5911 return 0; 5912 5913 from = mem_cgroup_from_task(p); 5914 5915 VM_BUG_ON(from == memcg); 5916 5917 mm = get_task_mm(p); 5918 if (!mm) 5919 return 0; 5920 /* We move charges only when we move a owner of the mm */ 5921 if (mm->owner == p) { 5922 VM_BUG_ON(mc.from); 5923 VM_BUG_ON(mc.to); 5924 VM_BUG_ON(mc.precharge); 5925 VM_BUG_ON(mc.moved_charge); 5926 VM_BUG_ON(mc.moved_swap); 5927 5928 spin_lock(&mc.lock); 5929 mc.mm = mm; 5930 mc.from = from; 5931 mc.to = memcg; 5932 mc.flags = move_flags; 5933 spin_unlock(&mc.lock); 5934 /* We set mc.moving_task later */ 5935 5936 ret = mem_cgroup_precharge_mc(mm); 5937 if (ret) 5938 mem_cgroup_clear_mc(); 5939 } else { 5940 mmput(mm); 5941 } 5942 return ret; 5943 } 5944 5945 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 5946 { 5947 if (mc.to) 5948 mem_cgroup_clear_mc(); 5949 } 5950 5951 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5952 unsigned long addr, unsigned long end, 5953 struct mm_walk *walk) 5954 { 5955 int ret = 0; 5956 struct vm_area_struct *vma = walk->vma; 5957 pte_t *pte; 5958 spinlock_t *ptl; 5959 enum mc_target_type target_type; 5960 union mc_target target; 5961 struct page *page; 5962 5963 ptl = pmd_trans_huge_lock(pmd, vma); 5964 if (ptl) { 5965 if (mc.precharge < HPAGE_PMD_NR) { 5966 spin_unlock(ptl); 5967 return 0; 5968 } 5969 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5970 if (target_type == MC_TARGET_PAGE) { 5971 page = target.page; 5972 if (!isolate_lru_page(page)) { 5973 if (!mem_cgroup_move_account(page, true, 5974 mc.from, mc.to)) { 5975 mc.precharge -= HPAGE_PMD_NR; 5976 mc.moved_charge += HPAGE_PMD_NR; 5977 } 5978 putback_lru_page(page); 5979 } 5980 put_page(page); 5981 } else if (target_type == MC_TARGET_DEVICE) { 5982 page = target.page; 5983 if (!mem_cgroup_move_account(page, true, 5984 mc.from, mc.to)) { 5985 mc.precharge -= HPAGE_PMD_NR; 5986 mc.moved_charge += HPAGE_PMD_NR; 5987 } 5988 put_page(page); 5989 } 5990 spin_unlock(ptl); 5991 return 0; 5992 } 5993 5994 if (pmd_trans_unstable(pmd)) 5995 return 0; 5996 retry: 5997 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5998 for (; addr != end; addr += PAGE_SIZE) { 5999 pte_t ptent = *(pte++); 6000 bool device = false; 6001 swp_entry_t ent; 6002 6003 if (!mc.precharge) 6004 break; 6005 6006 switch (get_mctgt_type(vma, addr, ptent, &target)) { 6007 case MC_TARGET_DEVICE: 6008 device = true; 6009 fallthrough; 6010 case MC_TARGET_PAGE: 6011 page = target.page; 6012 /* 6013 * We can have a part of the split pmd here. Moving it 6014 * can be done but it would be too convoluted so simply 6015 * ignore such a partial THP and keep it in original 6016 * memcg. There should be somebody mapping the head. 6017 */ 6018 if (PageTransCompound(page)) 6019 goto put; 6020 if (!device && isolate_lru_page(page)) 6021 goto put; 6022 if (!mem_cgroup_move_account(page, false, 6023 mc.from, mc.to)) { 6024 mc.precharge--; 6025 /* we uncharge from mc.from later. */ 6026 mc.moved_charge++; 6027 } 6028 if (!device) 6029 putback_lru_page(page); 6030 put: /* get_mctgt_type() gets the page */ 6031 put_page(page); 6032 break; 6033 case MC_TARGET_SWAP: 6034 ent = target.ent; 6035 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 6036 mc.precharge--; 6037 mem_cgroup_id_get_many(mc.to, 1); 6038 /* we fixup other refcnts and charges later. 
*/ 6039 mc.moved_swap++; 6040 } 6041 break; 6042 default: 6043 break; 6044 } 6045 } 6046 pte_unmap_unlock(pte - 1, ptl); 6047 cond_resched(); 6048 6049 if (addr != end) { 6050 /* 6051 * We have consumed all precharges we got in can_attach(). 6052 * We try charge one by one, but don't do any additional 6053 * charges to mc.to if we have failed in charge once in attach() 6054 * phase. 6055 */ 6056 ret = mem_cgroup_do_precharge(1); 6057 if (!ret) 6058 goto retry; 6059 } 6060 6061 return ret; 6062 } 6063 6064 static const struct mm_walk_ops charge_walk_ops = { 6065 .pmd_entry = mem_cgroup_move_charge_pte_range, 6066 }; 6067 6068 static void mem_cgroup_move_charge(void) 6069 { 6070 lru_add_drain_all(); 6071 /* 6072 * Signal lock_page_memcg() to take the memcg's move_lock 6073 * while we're moving its pages to another memcg. Then wait 6074 * for already started RCU-only updates to finish. 6075 */ 6076 atomic_inc(&mc.from->moving_account); 6077 synchronize_rcu(); 6078 retry: 6079 if (unlikely(!mmap_read_trylock(mc.mm))) { 6080 /* 6081 * Someone who are holding the mmap_lock might be waiting in 6082 * waitq. So we cancel all extra charges, wake up all waiters, 6083 * and retry. Because we cancel precharges, we might not be able 6084 * to move enough charges, but moving charge is a best-effort 6085 * feature anyway, so it wouldn't be a big problem. 6086 */ 6087 __mem_cgroup_clear_mc(); 6088 cond_resched(); 6089 goto retry; 6090 } 6091 /* 6092 * When we have consumed all precharges and failed in doing 6093 * additional charge, the page walk just aborts. 6094 */ 6095 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, 6096 NULL); 6097 6098 mmap_read_unlock(mc.mm); 6099 atomic_dec(&mc.from->moving_account); 6100 } 6101 6102 static void mem_cgroup_move_task(void) 6103 { 6104 if (mc.to) { 6105 mem_cgroup_move_charge(); 6106 mem_cgroup_clear_mc(); 6107 } 6108 } 6109 #else /* !CONFIG_MMU */ 6110 static int mem_cgroup_can_attach(struct cgroup_taskset *tset) 6111 { 6112 return 0; 6113 } 6114 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) 6115 { 6116 } 6117 static void mem_cgroup_move_task(void) 6118 { 6119 } 6120 #endif 6121 6122 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) 6123 { 6124 if (value == PAGE_COUNTER_MAX) 6125 seq_puts(m, "max\n"); 6126 else 6127 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); 6128 6129 return 0; 6130 } 6131 6132 static u64 memory_current_read(struct cgroup_subsys_state *css, 6133 struct cftype *cft) 6134 { 6135 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6136 6137 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 6138 } 6139 6140 static int memory_min_show(struct seq_file *m, void *v) 6141 { 6142 return seq_puts_memcg_tunable(m, 6143 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); 6144 } 6145 6146 static ssize_t memory_min_write(struct kernfs_open_file *of, 6147 char *buf, size_t nbytes, loff_t off) 6148 { 6149 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6150 unsigned long min; 6151 int err; 6152 6153 buf = strstrip(buf); 6154 err = page_counter_memparse(buf, "max", &min); 6155 if (err) 6156 return err; 6157 6158 page_counter_set_min(&memcg->memory, min); 6159 6160 return nbytes; 6161 } 6162 6163 static int memory_low_show(struct seq_file *m, void *v) 6164 { 6165 return seq_puts_memcg_tunable(m, 6166 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); 6167 } 6168 6169 static ssize_t memory_low_write(struct kernfs_open_file *of, 6170 char *buf, size_t nbytes, loff_t off) 6171 { 6172 
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6173 unsigned long low; 6174 int err; 6175 6176 buf = strstrip(buf); 6177 err = page_counter_memparse(buf, "max", &low); 6178 if (err) 6179 return err; 6180 6181 page_counter_set_low(&memcg->memory, low); 6182 6183 return nbytes; 6184 } 6185 6186 static int memory_high_show(struct seq_file *m, void *v) 6187 { 6188 return seq_puts_memcg_tunable(m, 6189 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); 6190 } 6191 6192 static ssize_t memory_high_write(struct kernfs_open_file *of, 6193 char *buf, size_t nbytes, loff_t off) 6194 { 6195 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6196 unsigned int nr_retries = MAX_RECLAIM_RETRIES; 6197 bool drained = false; 6198 unsigned long high; 6199 int err; 6200 6201 buf = strstrip(buf); 6202 err = page_counter_memparse(buf, "max", &high); 6203 if (err) 6204 return err; 6205 6206 page_counter_set_high(&memcg->memory, high); 6207 6208 for (;;) { 6209 unsigned long nr_pages = page_counter_read(&memcg->memory); 6210 unsigned long reclaimed; 6211 6212 if (nr_pages <= high) 6213 break; 6214 6215 if (signal_pending(current)) 6216 break; 6217 6218 if (!drained) { 6219 drain_all_stock(memcg); 6220 drained = true; 6221 continue; 6222 } 6223 6224 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, 6225 GFP_KERNEL, true); 6226 6227 if (!reclaimed && !nr_retries--) 6228 break; 6229 } 6230 6231 memcg_wb_domain_size_changed(memcg); 6232 return nbytes; 6233 } 6234 6235 static int memory_max_show(struct seq_file *m, void *v) 6236 { 6237 return seq_puts_memcg_tunable(m, 6238 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); 6239 } 6240 6241 static ssize_t memory_max_write(struct kernfs_open_file *of, 6242 char *buf, size_t nbytes, loff_t off) 6243 { 6244 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6245 unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; 6246 bool drained = false; 6247 unsigned long max; 6248 int err; 6249 6250 buf = strstrip(buf); 6251 err = page_counter_memparse(buf, "max", &max); 6252 if (err) 6253 return err; 6254 6255 xchg(&memcg->memory.max, max); 6256 6257 for (;;) { 6258 unsigned long nr_pages = page_counter_read(&memcg->memory); 6259 6260 if (nr_pages <= max) 6261 break; 6262 6263 if (signal_pending(current)) 6264 break; 6265 6266 if (!drained) { 6267 drain_all_stock(memcg); 6268 drained = true; 6269 continue; 6270 } 6271 6272 if (nr_reclaims) { 6273 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, 6274 GFP_KERNEL, true)) 6275 nr_reclaims--; 6276 continue; 6277 } 6278 6279 memcg_memory_event(memcg, MEMCG_OOM); 6280 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) 6281 break; 6282 } 6283 6284 memcg_wb_domain_size_changed(memcg); 6285 return nbytes; 6286 } 6287 6288 static void __memory_events_show(struct seq_file *m, atomic_long_t *events) 6289 { 6290 seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); 6291 seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); 6292 seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); 6293 seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); 6294 seq_printf(m, "oom_kill %lu\n", 6295 atomic_long_read(&events[MEMCG_OOM_KILL])); 6296 } 6297 6298 static int memory_events_show(struct seq_file *m, void *v) 6299 { 6300 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6301 6302 __memory_events_show(m, memcg->memory_events); 6303 return 0; 6304 } 6305 6306 static int memory_events_local_show(struct seq_file *m, void *v) 6307 { 6308 struct mem_cgroup *memcg = 
mem_cgroup_from_seq(m); 6309 6310 __memory_events_show(m, memcg->memory_events_local); 6311 return 0; 6312 } 6313 6314 static int memory_stat_show(struct seq_file *m, void *v) 6315 { 6316 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6317 char *buf; 6318 6319 buf = memory_stat_format(memcg); 6320 if (!buf) 6321 return -ENOMEM; 6322 seq_puts(m, buf); 6323 kfree(buf); 6324 return 0; 6325 } 6326 6327 #ifdef CONFIG_NUMA 6328 static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, 6329 int item) 6330 { 6331 return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); 6332 } 6333 6334 static int memory_numa_stat_show(struct seq_file *m, void *v) 6335 { 6336 int i; 6337 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6338 6339 mem_cgroup_flush_stats(); 6340 6341 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 6342 int nid; 6343 6344 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) 6345 continue; 6346 6347 seq_printf(m, "%s", memory_stats[i].name); 6348 for_each_node_state(nid, N_MEMORY) { 6349 u64 size; 6350 struct lruvec *lruvec; 6351 6352 lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 6353 size = lruvec_page_state_output(lruvec, 6354 memory_stats[i].idx); 6355 seq_printf(m, " N%d=%llu", nid, size); 6356 } 6357 seq_putc(m, '\n'); 6358 } 6359 6360 return 0; 6361 } 6362 #endif 6363 6364 static int memory_oom_group_show(struct seq_file *m, void *v) 6365 { 6366 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 6367 6368 seq_printf(m, "%d\n", memcg->oom_group); 6369 6370 return 0; 6371 } 6372 6373 static ssize_t memory_oom_group_write(struct kernfs_open_file *of, 6374 char *buf, size_t nbytes, loff_t off) 6375 { 6376 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 6377 int ret, oom_group; 6378 6379 buf = strstrip(buf); 6380 if (!buf) 6381 return -EINVAL; 6382 6383 ret = kstrtoint(buf, 0, &oom_group); 6384 if (ret) 6385 return ret; 6386 6387 if (oom_group != 0 && oom_group != 1) 6388 return -EINVAL; 6389 6390 memcg->oom_group = oom_group; 6391 6392 return nbytes; 6393 } 6394 6395 static struct cftype memory_files[] = { 6396 { 6397 .name = "current", 6398 .flags = CFTYPE_NOT_ON_ROOT, 6399 .read_u64 = memory_current_read, 6400 }, 6401 { 6402 .name = "min", 6403 .flags = CFTYPE_NOT_ON_ROOT, 6404 .seq_show = memory_min_show, 6405 .write = memory_min_write, 6406 }, 6407 { 6408 .name = "low", 6409 .flags = CFTYPE_NOT_ON_ROOT, 6410 .seq_show = memory_low_show, 6411 .write = memory_low_write, 6412 }, 6413 { 6414 .name = "high", 6415 .flags = CFTYPE_NOT_ON_ROOT, 6416 .seq_show = memory_high_show, 6417 .write = memory_high_write, 6418 }, 6419 { 6420 .name = "max", 6421 .flags = CFTYPE_NOT_ON_ROOT, 6422 .seq_show = memory_max_show, 6423 .write = memory_max_write, 6424 }, 6425 { 6426 .name = "events", 6427 .flags = CFTYPE_NOT_ON_ROOT, 6428 .file_offset = offsetof(struct mem_cgroup, events_file), 6429 .seq_show = memory_events_show, 6430 }, 6431 { 6432 .name = "events.local", 6433 .flags = CFTYPE_NOT_ON_ROOT, 6434 .file_offset = offsetof(struct mem_cgroup, events_local_file), 6435 .seq_show = memory_events_local_show, 6436 }, 6437 { 6438 .name = "stat", 6439 .seq_show = memory_stat_show, 6440 }, 6441 #ifdef CONFIG_NUMA 6442 { 6443 .name = "numa_stat", 6444 .seq_show = memory_numa_stat_show, 6445 }, 6446 #endif 6447 { 6448 .name = "oom.group", 6449 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, 6450 .seq_show = memory_oom_group_show, 6451 .write = memory_oom_group_write, 6452 }, 6453 { } /* terminate */ 6454 }; 6455 6456 struct cgroup_subsys 
memory_cgrp_subsys = { 6457 .css_alloc = mem_cgroup_css_alloc, 6458 .css_online = mem_cgroup_css_online, 6459 .css_offline = mem_cgroup_css_offline, 6460 .css_released = mem_cgroup_css_released, 6461 .css_free = mem_cgroup_css_free, 6462 .css_reset = mem_cgroup_css_reset, 6463 .css_rstat_flush = mem_cgroup_css_rstat_flush, 6464 .can_attach = mem_cgroup_can_attach, 6465 .cancel_attach = mem_cgroup_cancel_attach, 6466 .post_attach = mem_cgroup_move_task, 6467 .dfl_cftypes = memory_files, 6468 .legacy_cftypes = mem_cgroup_legacy_files, 6469 .early_init = 0, 6470 }; 6471 6472 /* 6473 * This function calculates an individual cgroup's effective 6474 * protection which is derived from its own memory.min/low, its 6475 * parent's and siblings' settings, as well as the actual memory 6476 * distribution in the tree. 6477 * 6478 * The following rules apply to the effective protection values: 6479 * 6480 * 1. At the first level of reclaim, effective protection is equal to 6481 * the declared protection in memory.min and memory.low. 6482 * 6483 * 2. To enable safe delegation of the protection configuration, at 6484 * subsequent levels the effective protection is capped to the 6485 * parent's effective protection. 6486 * 6487 * 3. To make complex and dynamic subtrees easier to configure, the 6488 * user is allowed to overcommit the declared protection at a given 6489 * level. If that is the case, the parent's effective protection is 6490 * distributed to the children in proportion to how much protection 6491 * they have declared and how much of it they are utilizing. 6492 * 6493 * This makes distribution proportional, but also work-conserving: 6494 * if one cgroup claims much more protection than it uses memory, 6495 * the unused remainder is available to its siblings. 6496 * 6497 * 4. Conversely, when the declared protection is undercommitted at a 6498 * given level, the distribution of the larger parental protection 6499 * budget is NOT proportional. A cgroup's protection from a sibling 6500 * is capped to its own memory.min/low setting. 6501 * 6502 * 5. However, to allow protecting recursive subtrees from each other 6503 * without having to declare each individual cgroup's fixed share 6504 * of the ancestor's claim to protection, any unutilized - 6505 * "floating" - protection from up the tree is distributed in 6506 * proportion to each cgroup's *usage*. This makes the protection 6507 * neutral wrt sibling cgroups and lets them compete freely over 6508 * the shared parental protection budget, but it protects the 6509 * subtree as a whole from neighboring subtrees. 6510 * 6511 * Note that 4. and 5. are not in conflict: 4. is about protecting 6512 * against immediate siblings whereas 5. is about protecting against 6513 * neighboring subtrees. 6514 */ 6515 static unsigned long effective_protection(unsigned long usage, 6516 unsigned long parent_usage, 6517 unsigned long setting, 6518 unsigned long parent_effective, 6519 unsigned long siblings_protected) 6520 { 6521 unsigned long protected; 6522 unsigned long ep; 6523 6524 protected = min(usage, setting); 6525 /* 6526 * If all cgroups at this level combined claim and use more 6527 * protection then what the parent affords them, distribute 6528 * shares in proportion to utilization. 6529 * 6530 * We are using actual utilization rather than the statically 6531 * claimed protection in order to be work-conserving: claimed 6532 * but unused protection is available to siblings that would 6533 * otherwise get a smaller chunk than what they claimed. 
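 * As an illustration (made-up numbers): if parent_effective is 100 pages,
 * the children at this level together claim and use 200 pages of protection
 * (siblings_protected == 200), and this child's own protected value is 50
 * pages, its effective protection comes out to 50 * 100 / 200 = 25 pages.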
6534 */ 6535 if (siblings_protected > parent_effective) 6536 return protected * parent_effective / siblings_protected; 6537 6538 /* 6539 * Ok, utilized protection of all children is within what the 6540 * parent affords them, so we know whatever this child claims 6541 * and utilizes is effectively protected. 6542 * 6543 * If there is unprotected usage beyond this value, reclaim 6544 * will apply pressure in proportion to that amount. 6545 * 6546 * If there is unutilized protection, the cgroup will be fully 6547 * shielded from reclaim, but we do return a smaller value for 6548 * protection than what the group could enjoy in theory. This 6549 * is okay. With the overcommit distribution above, effective 6550 * protection is always dependent on how memory is actually 6551 * consumed among the siblings anyway. 6552 */ 6553 ep = protected; 6554 6555 /* 6556 * If the children aren't claiming (all of) the protection 6557 * afforded to them by the parent, distribute the remainder in 6558 * proportion to the (unprotected) memory of each cgroup. That 6559 * way, cgroups that aren't explicitly prioritized wrt each 6560 * other compete freely over the allowance, but they are 6561 * collectively protected from neighboring trees. 6562 * 6563 * We're using unprotected memory for the weight so that if 6564 * some cgroups DO claim explicit protection, we don't protect 6565 * the same bytes twice. 6566 * 6567 * Check both usage and parent_usage against the respective 6568 * protected values. One should imply the other, but they 6569 * aren't read atomically - make sure the division is sane. 6570 */ 6571 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6572 return ep; 6573 if (parent_effective > siblings_protected && 6574 parent_usage > siblings_protected && 6575 usage > protected) { 6576 unsigned long unclaimed; 6577 6578 unclaimed = parent_effective - siblings_protected; 6579 unclaimed *= usage - protected; 6580 unclaimed /= parent_usage - siblings_protected; 6581 6582 ep += unclaimed; 6583 } 6584 6585 return ep; 6586 } 6587 6588 /** 6589 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range 6590 * @root: the top ancestor of the sub-tree being checked 6591 * @memcg: the memory cgroup to check 6592 * 6593 * WARNING: This function is not stateless! It can only be used as part 6594 * of a top-down tree iteration, not for isolated queries. 6595 */ 6596 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6597 struct mem_cgroup *memcg) 6598 { 6599 unsigned long usage, parent_usage; 6600 struct mem_cgroup *parent; 6601 6602 if (mem_cgroup_disabled()) 6603 return; 6604 6605 if (!root) 6606 root = root_mem_cgroup; 6607 6608 /* 6609 * Effective values of the reclaim targets are ignored so they 6610 * can be stale. Have a look at mem_cgroup_protection for more 6611 * details. 6612 * TODO: calculation should be more robust so that we do not need 6613 * that special casing. 
6614 */ 6615 if (memcg == root) 6616 return; 6617 6618 usage = page_counter_read(&memcg->memory); 6619 if (!usage) 6620 return; 6621 6622 parent = parent_mem_cgroup(memcg); 6623 /* No parent means a non-hierarchical mode on v1 memcg */ 6624 if (!parent) 6625 return; 6626 6627 if (parent == root) { 6628 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6629 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6630 return; 6631 } 6632 6633 parent_usage = page_counter_read(&parent->memory); 6634 6635 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6636 READ_ONCE(memcg->memory.min), 6637 READ_ONCE(parent->memory.emin), 6638 atomic_long_read(&parent->memory.children_min_usage))); 6639 6640 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6641 READ_ONCE(memcg->memory.low), 6642 READ_ONCE(parent->memory.elow), 6643 atomic_long_read(&parent->memory.children_low_usage))); 6644 } 6645 6646 static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) 6647 { 6648 unsigned int nr_pages = thp_nr_pages(page); 6649 int ret; 6650 6651 ret = try_charge(memcg, gfp, nr_pages); 6652 if (ret) 6653 goto out; 6654 6655 css_get(&memcg->css); 6656 commit_charge(page, memcg); 6657 6658 local_irq_disable(); 6659 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6660 memcg_check_events(memcg, page); 6661 local_irq_enable(); 6662 out: 6663 return ret; 6664 } 6665 6666 /** 6667 * __mem_cgroup_charge - charge a newly allocated page to a cgroup 6668 * @page: page to charge 6669 * @mm: mm context of the victim 6670 * @gfp_mask: reclaim mode 6671 * 6672 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6673 * pages according to @gfp_mask if necessary. if @mm is NULL, try to 6674 * charge to the active memcg. 6675 * 6676 * Do not use this for pages allocated for swapin. 6677 * 6678 * Returns 0 on success. Otherwise, an error code is returned. 6679 */ 6680 int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, 6681 gfp_t gfp_mask) 6682 { 6683 struct mem_cgroup *memcg; 6684 int ret; 6685 6686 memcg = get_mem_cgroup_from_mm(mm); 6687 ret = charge_memcg(page, memcg, gfp_mask); 6688 css_put(&memcg->css); 6689 6690 return ret; 6691 } 6692 6693 /** 6694 * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin 6695 * @page: page to charge 6696 * @mm: mm context of the victim 6697 * @gfp: reclaim mode 6698 * @entry: swap entry for which the page is allocated 6699 * 6700 * This function charges a page allocated for swapin. Please call this before 6701 * adding the page to the swapcache. 6702 * 6703 * Returns 0 on success. Otherwise, an error code is returned. 6704 */ 6705 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, 6706 gfp_t gfp, swp_entry_t entry) 6707 { 6708 struct mem_cgroup *memcg; 6709 unsigned short id; 6710 int ret; 6711 6712 if (mem_cgroup_disabled()) 6713 return 0; 6714 6715 id = lookup_swap_cgroup_id(entry); 6716 rcu_read_lock(); 6717 memcg = mem_cgroup_from_id(id); 6718 if (!memcg || !css_tryget_online(&memcg->css)) 6719 memcg = get_mem_cgroup_from_mm(mm); 6720 rcu_read_unlock(); 6721 6722 ret = charge_memcg(page, memcg, gfp); 6723 6724 css_put(&memcg->css); 6725 return ret; 6726 } 6727 6728 /* 6729 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot 6730 * @entry: swap entry for which the page is charged 6731 * 6732 * Call this function after successfully adding the charged page to swapcache. 
6733 * 6734 * Note: This function assumes the page for which swap slot is being uncharged 6735 * is order 0 page. 6736 */ 6737 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) 6738 { 6739 /* 6740 * Cgroup1's unified memory+swap counter has been charged with the 6741 * new swapcache page, finish the transfer by uncharging the swap 6742 * slot. The swap slot would also get uncharged when it dies, but 6743 * it can stick around indefinitely and we'd count the page twice 6744 * the entire time. 6745 * 6746 * Cgroup2 has separate resource counters for memory and swap, 6747 * so this is a non-issue here. Memory and swap charge lifetimes 6748 * correspond 1:1 to page and swap slot lifetimes: we charge the 6749 * page to memory here, and uncharge swap when the slot is freed. 6750 */ 6751 if (!mem_cgroup_disabled() && do_memsw_account()) { 6752 /* 6753 * The swap entry might not get freed for a long time, 6754 * let's not wait for it. The page already received a 6755 * memory+swap charge, drop the swap entry duplicate. 6756 */ 6757 mem_cgroup_uncharge_swap(entry, 1); 6758 } 6759 } 6760 6761 struct uncharge_gather { 6762 struct mem_cgroup *memcg; 6763 unsigned long nr_memory; 6764 unsigned long pgpgout; 6765 unsigned long nr_kmem; 6766 struct page *dummy_page; 6767 }; 6768 6769 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6770 { 6771 memset(ug, 0, sizeof(*ug)); 6772 } 6773 6774 static void uncharge_batch(const struct uncharge_gather *ug) 6775 { 6776 unsigned long flags; 6777 6778 if (ug->nr_memory) { 6779 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); 6780 if (do_memsw_account()) 6781 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); 6782 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6783 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6784 memcg_oom_recover(ug->memcg); 6785 } 6786 6787 local_irq_save(flags); 6788 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6789 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); 6790 memcg_check_events(ug->memcg, ug->dummy_page); 6791 local_irq_restore(flags); 6792 6793 /* drop reference from uncharge_page */ 6794 css_put(&ug->memcg->css); 6795 } 6796 6797 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6798 { 6799 unsigned long nr_pages; 6800 struct mem_cgroup *memcg; 6801 struct obj_cgroup *objcg; 6802 bool use_objcg = PageMemcgKmem(page); 6803 6804 VM_BUG_ON_PAGE(PageLRU(page), page); 6805 6806 /* 6807 * Nobody should be changing or seriously looking at 6808 * page memcg or objcg at this point, we have fully 6809 * exclusive access to the page. 6810 */ 6811 if (use_objcg) { 6812 objcg = __page_objcg(page); 6813 /* 6814 * This get matches the put at the end of the function and 6815 * kmem pages do not hold memcg references anymore. 
6816 */ 6817 memcg = get_mem_cgroup_from_objcg(objcg); 6818 } else { 6819 memcg = __page_memcg(page); 6820 } 6821 6822 if (!memcg) 6823 return; 6824 6825 if (ug->memcg != memcg) { 6826 if (ug->memcg) { 6827 uncharge_batch(ug); 6828 uncharge_gather_clear(ug); 6829 } 6830 ug->memcg = memcg; 6831 ug->dummy_page = page; 6832 6833 /* pairs with css_put in uncharge_batch */ 6834 css_get(&memcg->css); 6835 } 6836 6837 nr_pages = compound_nr(page); 6838 6839 if (use_objcg) { 6840 ug->nr_memory += nr_pages; 6841 ug->nr_kmem += nr_pages; 6842 6843 page->memcg_data = 0; 6844 obj_cgroup_put(objcg); 6845 } else { 6846 /* LRU pages aren't accounted at the root level */ 6847 if (!mem_cgroup_is_root(memcg)) 6848 ug->nr_memory += nr_pages; 6849 ug->pgpgout++; 6850 6851 page->memcg_data = 0; 6852 } 6853 6854 css_put(&memcg->css); 6855 } 6856 6857 /** 6858 * __mem_cgroup_uncharge - uncharge a page 6859 * @page: page to uncharge 6860 * 6861 * Uncharge a page previously charged with __mem_cgroup_charge(). 6862 */ 6863 void __mem_cgroup_uncharge(struct page *page) 6864 { 6865 struct uncharge_gather ug; 6866 6867 /* Don't touch page->lru of any random page, pre-check: */ 6868 if (!page_memcg(page)) 6869 return; 6870 6871 uncharge_gather_clear(&ug); 6872 uncharge_page(page, &ug); 6873 uncharge_batch(&ug); 6874 } 6875 6876 /** 6877 * __mem_cgroup_uncharge_list - uncharge a list of page 6878 * @page_list: list of pages to uncharge 6879 * 6880 * Uncharge a list of pages previously charged with 6881 * __mem_cgroup_charge(). 6882 */ 6883 void __mem_cgroup_uncharge_list(struct list_head *page_list) 6884 { 6885 struct uncharge_gather ug; 6886 struct page *page; 6887 6888 uncharge_gather_clear(&ug); 6889 list_for_each_entry(page, page_list, lru) 6890 uncharge_page(page, &ug); 6891 if (ug.memcg) 6892 uncharge_batch(&ug); 6893 } 6894 6895 /** 6896 * mem_cgroup_migrate - charge a page's replacement 6897 * @oldpage: currently circulating page 6898 * @newpage: replacement page 6899 * 6900 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6901 * be uncharged upon free. 6902 * 6903 * Both pages must be locked, @newpage->mapping must be set up. 6904 */ 6905 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6906 { 6907 struct mem_cgroup *memcg; 6908 unsigned int nr_pages; 6909 unsigned long flags; 6910 6911 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6912 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6913 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6914 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6915 newpage); 6916 6917 if (mem_cgroup_disabled()) 6918 return; 6919 6920 /* Page cache replacement: new page already charged? */ 6921 if (page_memcg(newpage)) 6922 return; 6923 6924 memcg = page_memcg(oldpage); 6925 VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); 6926 if (!memcg) 6927 return; 6928 6929 /* Force-charge the new page. 
The old one will be freed soon */ 6930 nr_pages = thp_nr_pages(newpage); 6931 6932 if (!mem_cgroup_is_root(memcg)) { 6933 page_counter_charge(&memcg->memory, nr_pages); 6934 if (do_memsw_account()) 6935 page_counter_charge(&memcg->memsw, nr_pages); 6936 } 6937 6938 css_get(&memcg->css); 6939 commit_charge(newpage, memcg); 6940 6941 local_irq_save(flags); 6942 mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 6943 memcg_check_events(memcg, newpage); 6944 local_irq_restore(flags); 6945 } 6946 6947 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6948 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6949 6950 void mem_cgroup_sk_alloc(struct sock *sk) 6951 { 6952 struct mem_cgroup *memcg; 6953 6954 if (!mem_cgroup_sockets_enabled) 6955 return; 6956 6957 /* Do not associate the sock with unrelated interrupted task's memcg. */ 6958 if (in_interrupt()) 6959 return; 6960 6961 rcu_read_lock(); 6962 memcg = mem_cgroup_from_task(current); 6963 if (memcg == root_mem_cgroup) 6964 goto out; 6965 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6966 goto out; 6967 if (css_tryget(&memcg->css)) 6968 sk->sk_memcg = memcg; 6969 out: 6970 rcu_read_unlock(); 6971 } 6972 6973 void mem_cgroup_sk_free(struct sock *sk) 6974 { 6975 if (sk->sk_memcg) 6976 css_put(&sk->sk_memcg->css); 6977 } 6978 6979 /** 6980 * mem_cgroup_charge_skmem - charge socket memory 6981 * @memcg: memcg to charge 6982 * @nr_pages: number of pages to charge 6983 * @gfp_mask: reclaim mode 6984 * 6985 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 6986 * @memcg's configured limit, %false if it doesn't. 6987 */ 6988 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 6989 gfp_t gfp_mask) 6990 { 6991 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6992 struct page_counter *fail; 6993 6994 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 6995 memcg->tcpmem_pressure = 0; 6996 return true; 6997 } 6998 memcg->tcpmem_pressure = 1; 6999 if (gfp_mask & __GFP_NOFAIL) { 7000 page_counter_charge(&memcg->tcpmem, nr_pages); 7001 return true; 7002 } 7003 return false; 7004 } 7005 7006 if (try_charge(memcg, gfp_mask, nr_pages) == 0) { 7007 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7008 return true; 7009 } 7010 7011 return false; 7012 } 7013 7014 /** 7015 * mem_cgroup_uncharge_skmem - uncharge socket memory 7016 * @memcg: memcg to uncharge 7017 * @nr_pages: number of pages to uncharge 7018 */ 7019 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7020 { 7021 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7022 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7023 return; 7024 } 7025 7026 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7027 7028 refill_stock(memcg, nr_pages); 7029 } 7030 7031 static int __init cgroup_memory(char *s) 7032 { 7033 char *token; 7034 7035 while ((token = strsep(&s, ",")) != NULL) { 7036 if (!*token) 7037 continue; 7038 if (!strcmp(token, "nosocket")) 7039 cgroup_memory_nosocket = true; 7040 if (!strcmp(token, "nokmem")) 7041 cgroup_memory_nokmem = true; 7042 } 7043 return 0; 7044 } 7045 __setup("cgroup.memory=", cgroup_memory); 7046 7047 /* 7048 * subsys_initcall() for memory controller. 7049 * 7050 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7051 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7052 * basically everything that doesn't depend on a specific mem_cgroup structure 7053 * should be initialized from here. 
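 * (Currently that covers the CPU hotplug dead callback, the per-CPU stock
 * draining work and the per-node soft limit trees.)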
7054 */ 7055 static int __init mem_cgroup_init(void) 7056 { 7057 int cpu, node; 7058 7059 /* 7060 * Currently s32 type (can refer to struct batched_lruvec_stat) is 7061 * used for per-memcg-per-cpu caching of per-node statistics. In order 7062 * to work fine, we should make sure that the overfill threshold can't 7063 * exceed S32_MAX / PAGE_SIZE. 7064 */ 7065 BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); 7066 7067 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7068 memcg_hotplug_cpu_dead); 7069 7070 for_each_possible_cpu(cpu) 7071 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7072 drain_local_stock); 7073 7074 for_each_node(node) { 7075 struct mem_cgroup_tree_per_node *rtpn; 7076 7077 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7078 node_online(node) ? node : NUMA_NO_NODE); 7079 7080 rtpn->rb_root = RB_ROOT; 7081 rtpn->rb_rightmost = NULL; 7082 spin_lock_init(&rtpn->lock); 7083 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7084 } 7085 7086 return 0; 7087 } 7088 subsys_initcall(mem_cgroup_init); 7089 7090 #ifdef CONFIG_MEMCG_SWAP 7091 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7092 { 7093 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7094 /* 7095 * The root cgroup cannot be destroyed, so it's refcount must 7096 * always be >= 1. 7097 */ 7098 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 7099 VM_BUG_ON(1); 7100 break; 7101 } 7102 memcg = parent_mem_cgroup(memcg); 7103 if (!memcg) 7104 memcg = root_mem_cgroup; 7105 } 7106 return memcg; 7107 } 7108 7109 /** 7110 * mem_cgroup_swapout - transfer a memsw charge to swap 7111 * @page: page whose memsw charge to transfer 7112 * @entry: swap entry to move the charge to 7113 * 7114 * Transfer the memsw charge of @page to @entry. 7115 */ 7116 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 7117 { 7118 struct mem_cgroup *memcg, *swap_memcg; 7119 unsigned int nr_entries; 7120 unsigned short oldid; 7121 7122 VM_BUG_ON_PAGE(PageLRU(page), page); 7123 VM_BUG_ON_PAGE(page_count(page), page); 7124 7125 if (mem_cgroup_disabled()) 7126 return; 7127 7128 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7129 return; 7130 7131 memcg = page_memcg(page); 7132 7133 VM_WARN_ON_ONCE_PAGE(!memcg, page); 7134 if (!memcg) 7135 return; 7136 7137 /* 7138 * In case the memcg owning these pages has been offlined and doesn't 7139 * have an ID allocated to it anymore, charge the closest online 7140 * ancestor for the swap instead and transfer the memory+swap charge. 7141 */ 7142 swap_memcg = mem_cgroup_id_get_online(memcg); 7143 nr_entries = thp_nr_pages(page); 7144 /* Get references for the tail pages, too */ 7145 if (nr_entries > 1) 7146 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7147 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7148 nr_entries); 7149 VM_BUG_ON_PAGE(oldid, page); 7150 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7151 7152 page->memcg_data = 0; 7153 7154 if (!mem_cgroup_is_root(memcg)) 7155 page_counter_uncharge(&memcg->memory, nr_entries); 7156 7157 if (!cgroup_memory_noswap && memcg != swap_memcg) { 7158 if (!mem_cgroup_is_root(swap_memcg)) 7159 page_counter_charge(&swap_memcg->memsw, nr_entries); 7160 page_counter_uncharge(&memcg->memsw, nr_entries); 7161 } 7162 7163 /* 7164 * Interrupts should be disabled here because the caller holds the 7165 * i_pages lock which is taken with interrupts-off. 
It is 7166 * important here to have the interrupts disabled because it is the 7167 * only synchronisation we have for updating the per-CPU variables. 7168 */ 7169 VM_BUG_ON(!irqs_disabled()); 7170 mem_cgroup_charge_statistics(memcg, page, -nr_entries); 7171 memcg_check_events(memcg, page); 7172 7173 css_put(&memcg->css); 7174 } 7175 7176 /** 7177 * __mem_cgroup_try_charge_swap - try charging swap space for a page 7178 * @page: page being added to swap 7179 * @entry: swap entry to charge 7180 * 7181 * Try to charge @page's memcg for the swap space at @entry. 7182 * 7183 * Returns 0 on success, -ENOMEM on failure. 7184 */ 7185 int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 7186 { 7187 unsigned int nr_pages = thp_nr_pages(page); 7188 struct page_counter *counter; 7189 struct mem_cgroup *memcg; 7190 unsigned short oldid; 7191 7192 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7193 return 0; 7194 7195 memcg = page_memcg(page); 7196 7197 VM_WARN_ON_ONCE_PAGE(!memcg, page); 7198 if (!memcg) 7199 return 0; 7200 7201 if (!entry.val) { 7202 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7203 return 0; 7204 } 7205 7206 memcg = mem_cgroup_id_get_online(memcg); 7207 7208 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 7209 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7210 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7211 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7212 mem_cgroup_id_put(memcg); 7213 return -ENOMEM; 7214 } 7215 7216 /* Get references for the tail pages, too */ 7217 if (nr_pages > 1) 7218 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7219 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7220 VM_BUG_ON_PAGE(oldid, page); 7221 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7222 7223 return 0; 7224 } 7225 7226 /** 7227 * __mem_cgroup_uncharge_swap - uncharge swap space 7228 * @entry: swap entry to uncharge 7229 * @nr_pages: the amount of swap space to uncharge 7230 */ 7231 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7232 { 7233 struct mem_cgroup *memcg; 7234 unsigned short id; 7235 7236 id = swap_cgroup_record(entry, 0, nr_pages); 7237 rcu_read_lock(); 7238 memcg = mem_cgroup_from_id(id); 7239 if (memcg) { 7240 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 7241 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7242 page_counter_uncharge(&memcg->swap, nr_pages); 7243 else 7244 page_counter_uncharge(&memcg->memsw, nr_pages); 7245 } 7246 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7247 mem_cgroup_id_put_many(memcg, nr_pages); 7248 } 7249 rcu_read_unlock(); 7250 } 7251 7252 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7253 { 7254 long nr_swap_pages = get_nr_swap_pages(); 7255 7256 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7257 return nr_swap_pages; 7258 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7259 nr_swap_pages = min_t(long, nr_swap_pages, 7260 READ_ONCE(memcg->swap.max) - 7261 page_counter_read(&memcg->swap)); 7262 return nr_swap_pages; 7263 } 7264 7265 bool mem_cgroup_swap_full(struct page *page) 7266 { 7267 struct mem_cgroup *memcg; 7268 7269 VM_BUG_ON_PAGE(!PageLocked(page), page); 7270 7271 if (vm_swap_full()) 7272 return true; 7273 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7274 return false; 7275 7276 memcg = page_memcg(page); 7277 if (!memcg) 7278 return false; 7279 7280 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 7281 unsigned long usage = 
page_counter_read(&memcg->swap); 7282 7283 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7284 usage * 2 >= READ_ONCE(memcg->swap.max)) 7285 return true; 7286 } 7287 7288 return false; 7289 } 7290 7291 static int __init setup_swap_account(char *s) 7292 { 7293 if (!strcmp(s, "1")) 7294 cgroup_memory_noswap = false; 7295 else if (!strcmp(s, "0")) 7296 cgroup_memory_noswap = true; 7297 return 1; 7298 } 7299 __setup("swapaccount=", setup_swap_account); 7300 7301 static u64 swap_current_read(struct cgroup_subsys_state *css, 7302 struct cftype *cft) 7303 { 7304 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7305 7306 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7307 } 7308 7309 static int swap_high_show(struct seq_file *m, void *v) 7310 { 7311 return seq_puts_memcg_tunable(m, 7312 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7313 } 7314 7315 static ssize_t swap_high_write(struct kernfs_open_file *of, 7316 char *buf, size_t nbytes, loff_t off) 7317 { 7318 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7319 unsigned long high; 7320 int err; 7321 7322 buf = strstrip(buf); 7323 err = page_counter_memparse(buf, "max", &high); 7324 if (err) 7325 return err; 7326 7327 page_counter_set_high(&memcg->swap, high); 7328 7329 return nbytes; 7330 } 7331 7332 static int swap_max_show(struct seq_file *m, void *v) 7333 { 7334 return seq_puts_memcg_tunable(m, 7335 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7336 } 7337 7338 static ssize_t swap_max_write(struct kernfs_open_file *of, 7339 char *buf, size_t nbytes, loff_t off) 7340 { 7341 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7342 unsigned long max; 7343 int err; 7344 7345 buf = strstrip(buf); 7346 err = page_counter_memparse(buf, "max", &max); 7347 if (err) 7348 return err; 7349 7350 xchg(&memcg->swap.max, max); 7351 7352 return nbytes; 7353 } 7354 7355 static int swap_events_show(struct seq_file *m, void *v) 7356 { 7357 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7358 7359 seq_printf(m, "high %lu\n", 7360 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7361 seq_printf(m, "max %lu\n", 7362 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7363 seq_printf(m, "fail %lu\n", 7364 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7365 7366 return 0; 7367 } 7368 7369 static struct cftype swap_files[] = { 7370 { 7371 .name = "swap.current", 7372 .flags = CFTYPE_NOT_ON_ROOT, 7373 .read_u64 = swap_current_read, 7374 }, 7375 { 7376 .name = "swap.high", 7377 .flags = CFTYPE_NOT_ON_ROOT, 7378 .seq_show = swap_high_show, 7379 .write = swap_high_write, 7380 }, 7381 { 7382 .name = "swap.max", 7383 .flags = CFTYPE_NOT_ON_ROOT, 7384 .seq_show = swap_max_show, 7385 .write = swap_max_write, 7386 }, 7387 { 7388 .name = "swap.events", 7389 .flags = CFTYPE_NOT_ON_ROOT, 7390 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7391 .seq_show = swap_events_show, 7392 }, 7393 { } /* terminate */ 7394 }; 7395 7396 static struct cftype memsw_files[] = { 7397 { 7398 .name = "memsw.usage_in_bytes", 7399 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7400 .read_u64 = mem_cgroup_read_u64, 7401 }, 7402 { 7403 .name = "memsw.max_usage_in_bytes", 7404 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7405 .write = mem_cgroup_reset, 7406 .read_u64 = mem_cgroup_read_u64, 7407 }, 7408 { 7409 .name = "memsw.limit_in_bytes", 7410 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7411 .write = mem_cgroup_write, 7412 .read_u64 = mem_cgroup_read_u64, 7413 }, 7414 { 7415 .name = "memsw.failcnt", 7416 
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7417 .write = mem_cgroup_reset, 7418 .read_u64 = mem_cgroup_read_u64, 7419 }, 7420 { }, /* terminate */ 7421 }; 7422 7423 /* 7424 * If mem_cgroup_swap_init() were implemented as a subsys_initcall() 7425 * instead of a core_initcall(), cgroup_memory_noswap could still be 7426 * false even when memcg is disabled via the "cgroup_disable=memory" 7427 * boot parameter. This may result in a premature OOPS inside the 7428 * mem_cgroup_get_nr_swap_pages() function in corner cases. 7429 */ 7430 static int __init mem_cgroup_swap_init(void) 7431 { 7432 /* No memory control -> no swap control */ 7433 if (mem_cgroup_disabled()) 7434 cgroup_memory_noswap = true; 7435 7436 if (cgroup_memory_noswap) 7437 return 0; 7438 7439 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7440 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7441 7442 return 0; 7443 } 7444 core_initcall(mem_cgroup_swap_init); 7445 7446 #endif /* CONFIG_MEMCG_SWAP */ 7447