1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* memcontrol.c - Memory Controller 3 * 4 * Copyright IBM Corporation, 2007 5 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 6 * 7 * Copyright 2007 OpenVZ SWsoft Inc 8 * Author: Pavel Emelianov <xemul@openvz.org> 9 * 10 * Memory thresholds 11 * Copyright (C) 2009 Nokia Corporation 12 * Author: Kirill A. Shutemov 13 * 14 * Kernel Memory Controller 15 * Copyright (C) 2012 Parallels Inc. and Google Inc. 16 * Authors: Glauber Costa and Suleiman Souhlal 17 * 18 * Native page reclaim 19 * Charge lifetime sanitation 20 * Lockless page tracking & accounting 21 * Unified hierarchy configuration model 22 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner 23 */ 24 25 #include <linux/page_counter.h> 26 #include <linux/memcontrol.h> 27 #include <linux/cgroup.h> 28 #include <linux/pagewalk.h> 29 #include <linux/sched/mm.h> 30 #include <linux/shmem_fs.h> 31 #include <linux/hugetlb.h> 32 #include <linux/pagemap.h> 33 #include <linux/vm_event_item.h> 34 #include <linux/smp.h> 35 #include <linux/page-flags.h> 36 #include <linux/backing-dev.h> 37 #include <linux/bit_spinlock.h> 38 #include <linux/rcupdate.h> 39 #include <linux/limits.h> 40 #include <linux/export.h> 41 #include <linux/mutex.h> 42 #include <linux/rbtree.h> 43 #include <linux/slab.h> 44 #include <linux/swap.h> 45 #include <linux/swapops.h> 46 #include <linux/spinlock.h> 47 #include <linux/eventfd.h> 48 #include <linux/poll.h> 49 #include <linux/sort.h> 50 #include <linux/fs.h> 51 #include <linux/seq_file.h> 52 #include <linux/vmpressure.h> 53 #include <linux/mm_inline.h> 54 #include <linux/swap_cgroup.h> 55 #include <linux/cpu.h> 56 #include <linux/oom.h> 57 #include <linux/lockdep.h> 58 #include <linux/file.h> 59 #include <linux/tracehook.h> 60 #include <linux/psi.h> 61 #include <linux/seq_buf.h> 62 #include "internal.h" 63 #include <net/sock.h> 64 #include <net/ip.h> 65 #include "slab.h" 66 67 #include <linux/uaccess.h> 68 69 #include <trace/events/vmscan.h> 70 71 
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/*
 * Swap accounting disabled?  Note the inverted sense: the swap controller
 * is active only when this is false.  Without CONFIG_MEMCG_SWAP it is a
 * compile-time constant 1.
 */
#ifdef CONFIG_MEMCG_SWAP
bool cgroup_memory_noswap __read_mostly;
#else
#define cgroup_memory_noswap		1
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif

/*
 * Whether legacy memory+swap accounting is active: true only when the
 * memory controller is not on the default hierarchy and swap accounting
 * has not been disabled.
 */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* State for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

/*
 * cft->private packs a res_type in the upper 16 bits and a per-file
 * attribute in the lower 16 bits.
 */
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

/*
 * True when the current task is an OOM victim, has a fatal signal
 * pending, or is exiting (PF_EXITING).
 */
static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure.
 */
/* Return @memcg's vmpressure; a NULL @memcg selects root_mem_cgroup. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

/* Map an embedded vmpressure back to the css of its owning mem_cgroup. */
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;

/*
 * percpu_ref release callback for an obj_cgroup (installed by
 * obj_cgroup_alloc()).  Uncharges any whole pages still recorded in
 * nr_charged_bytes, unlinks the objcg from its list, drops the memcg
 * reference and frees the objcg after an RCU grace period.
 */
static void obj_cgroup_release(struct percpu_ref *ref)
{
	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
	struct mem_cgroup *memcg;
	unsigned int nr_bytes;
	unsigned int nr_pages;
	unsigned long flags;

	/*
	 * At this point all allocated objects are freed, and
	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
	 *
	 * The following sequence can lead to it:
	 * 1) CPU0: objcg == stock->cached_objcg
	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
	 *          PAGE_SIZE bytes are charged
	 * 3) CPU1: a process from another memcg is allocating something,
	 *          the stock is flushed,
	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
	 * 4) CPU0: we do release this object,
	 *          92 bytes are added to stock->nr_bytes
	 * 5) CPU0: stock is flushed,
	 *          92 bytes are added to objcg->nr_charged_bytes
	 *
	 * In the result, nr_charged_bytes == PAGE_SIZE.
	 * This page will be uncharged in obj_cgroup_release().
	 */
	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
	nr_pages = nr_bytes >> PAGE_SHIFT;

	/* List manipulation and the memcg lookup happen under css_set_lock. */
	spin_lock_irqsave(&css_set_lock, flags);
	memcg = obj_cgroup_memcg(objcg);
	if (nr_pages)
		__memcg_kmem_uncharge(memcg, nr_pages);
	list_del(&objcg->list);
	mem_cgroup_put(memcg);
	spin_unlock_irqrestore(&css_set_lock, flags);

	percpu_ref_exit(ref);
	kfree_rcu(objcg, rcu);
}

/*
 * Allocate a zeroed obj_cgroup with an initialised percpu refcount whose
 * release callback is obj_cgroup_release().  Returns NULL if either the
 * allocation or percpu_ref_init() fails.
 */
static struct obj_cgroup *obj_cgroup_alloc(void)
{
	struct obj_cgroup *objcg;
	int ret;

	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
	if (!objcg)
		return NULL;

	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
			      GFP_KERNEL);
	if (ret) {
		kfree(objcg);
		return NULL;
	}
	INIT_LIST_HEAD(&objcg->list);
	return objcg;
}

/*
 * Hand all of @memcg's obj_cgroups over to @parent: the active objcg is
 * detached from @memcg and re-pointed at @parent, every objcg already on
 * @memcg's objcg_list is re-pointed as well, and finally the (formerly)
 * active objcg's refcount is killed.
 */
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
				  struct mem_cgroup *parent)
{
	struct obj_cgroup *objcg, *iter;

	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);

	spin_lock_irq(&css_set_lock);

	/* Move active objcg to the parent's list */
	xchg(&objcg->memcg, parent);
	css_get(&parent->css);
	list_add(&objcg->list, &parent->objcg_list);

	/* Move already reparented objcgs to the parent's list */
	list_for_each_entry(iter, &memcg->objcg_list, list) {
		/* Swap the css reference from @memcg to @parent. */
		css_get(&parent->css);
		xchg(&iter->memcg, parent);
		css_put(&memcg->css);
	}
	list_splice(&memcg->objcg_list, &parent->objcg_list);

	spin_unlock_irq(&css_set_lock);

	percpu_ref_kill(&objcg->refcnt);
}

/*
 * This will be used as a shrinker list's index.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited.
 *  Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

/* Take memcg_cache_ids_sem for reading; pair with memcg_put_cache_ids(). */
void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

/* Release the read lock taken by memcg_get_cache_ids(). */
void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.
Since the calls to memcg_slab_pre_alloc_hook() are 390 * conditional to this static branch, we'll have to allow modules that does 391 * kmem_cache_alloc and the such to see this symbol as well 392 */ 393 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); 394 EXPORT_SYMBOL(memcg_kmem_enabled_key); 395 #endif 396 397 static int memcg_shrinker_map_size; 398 static DEFINE_MUTEX(memcg_shrinker_map_mutex); 399 400 static void memcg_free_shrinker_map_rcu(struct rcu_head *head) 401 { 402 kvfree(container_of(head, struct memcg_shrinker_map, rcu)); 403 } 404 405 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, 406 int size, int old_size) 407 { 408 struct memcg_shrinker_map *new, *old; 409 int nid; 410 411 lockdep_assert_held(&memcg_shrinker_map_mutex); 412 413 for_each_node(nid) { 414 old = rcu_dereference_protected( 415 mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); 416 /* Not yet online memcg */ 417 if (!old) 418 return 0; 419 420 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 421 if (!new) 422 return -ENOMEM; 423 424 /* Set all old bits, clear all new bits */ 425 memset(new->map, (int)0xff, old_size); 426 memset((void *)new->map + old_size, 0, size - old_size); 427 428 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); 429 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); 430 } 431 432 return 0; 433 } 434 435 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) 436 { 437 struct mem_cgroup_per_node *pn; 438 struct memcg_shrinker_map *map; 439 int nid; 440 441 if (mem_cgroup_is_root(memcg)) 442 return; 443 444 for_each_node(nid) { 445 pn = mem_cgroup_nodeinfo(memcg, nid); 446 map = rcu_dereference_protected(pn->shrinker_map, true); 447 if (map) 448 kvfree(map); 449 rcu_assign_pointer(pn->shrinker_map, NULL); 450 } 451 } 452 453 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) 454 { 455 struct memcg_shrinker_map *map; 456 int nid, size, ret = 0; 457 458 if (mem_cgroup_is_root(memcg)) 459 return 0; 460 461 
mutex_lock(&memcg_shrinker_map_mutex); 462 size = memcg_shrinker_map_size; 463 for_each_node(nid) { 464 map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); 465 if (!map) { 466 memcg_free_shrinker_maps(memcg); 467 ret = -ENOMEM; 468 break; 469 } 470 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); 471 } 472 mutex_unlock(&memcg_shrinker_map_mutex); 473 474 return ret; 475 } 476 477 int memcg_expand_shrinker_maps(int new_id) 478 { 479 int size, old_size, ret = 0; 480 struct mem_cgroup *memcg; 481 482 size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); 483 old_size = memcg_shrinker_map_size; 484 if (size <= old_size) 485 return 0; 486 487 mutex_lock(&memcg_shrinker_map_mutex); 488 if (!root_mem_cgroup) 489 goto unlock; 490 491 for_each_mem_cgroup(memcg) { 492 if (mem_cgroup_is_root(memcg)) 493 continue; 494 ret = memcg_expand_one_shrinker_map(memcg, size, old_size); 495 if (ret) { 496 mem_cgroup_iter_break(NULL, memcg); 497 goto unlock; 498 } 499 } 500 unlock: 501 if (!ret) 502 memcg_shrinker_map_size = size; 503 mutex_unlock(&memcg_shrinker_map_mutex); 504 return ret; 505 } 506 507 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 508 { 509 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 510 struct memcg_shrinker_map *map; 511 512 rcu_read_lock(); 513 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); 514 /* Pairs with smp mb in shrink_slab() */ 515 smp_mb__before_atomic(); 516 set_bit(shrinker_id, map->map); 517 rcu_read_unlock(); 518 } 519 } 520 521 /** 522 * mem_cgroup_css_from_page - css of the memcg associated with a page 523 * @page: page of interest 524 * 525 * If memcg is bound to the default hierarchy, css of the memcg associated 526 * with @page is returned. The returned css remains associated with @page 527 * until it is released. 528 * 529 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup 530 * is returned. 
531 */ 532 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) 533 { 534 struct mem_cgroup *memcg; 535 536 memcg = page->mem_cgroup; 537 538 if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 539 memcg = root_mem_cgroup; 540 541 return &memcg->css; 542 } 543 544 /** 545 * page_cgroup_ino - return inode number of the memcg a page is charged to 546 * @page: the page 547 * 548 * Look up the closest online ancestor of the memory cgroup @page is charged to 549 * and return its inode number or 0 if @page is not charged to any cgroup. It 550 * is safe to call this function without holding a reference to @page. 551 * 552 * Note, this function is inherently racy, because there is nothing to prevent 553 * the cgroup inode from getting torn down and potentially reallocated a moment 554 * after page_cgroup_ino() returns, so it only should be used by callers that 555 * do not care (such as procfs interfaces). 556 */ 557 ino_t page_cgroup_ino(struct page *page) 558 { 559 struct mem_cgroup *memcg; 560 unsigned long ino = 0; 561 562 rcu_read_lock(); 563 memcg = page->mem_cgroup; 564 565 /* 566 * The lowest bit set means that memcg isn't a valid 567 * memcg pointer, but a obj_cgroups pointer. 568 * In this case the page is shared and doesn't belong 569 * to any specific memory cgroup. 
570 */ 571 if ((unsigned long) memcg & 0x1UL) 572 memcg = NULL; 573 574 while (memcg && !(memcg->css.flags & CSS_ONLINE)) 575 memcg = parent_mem_cgroup(memcg); 576 if (memcg) 577 ino = cgroup_ino(memcg->css.cgroup); 578 rcu_read_unlock(); 579 return ino; 580 } 581 582 static struct mem_cgroup_per_node * 583 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) 584 { 585 int nid = page_to_nid(page); 586 587 return memcg->nodeinfo[nid]; 588 } 589 590 static struct mem_cgroup_tree_per_node * 591 soft_limit_tree_node(int nid) 592 { 593 return soft_limit_tree.rb_tree_per_node[nid]; 594 } 595 596 static struct mem_cgroup_tree_per_node * 597 soft_limit_tree_from_page(struct page *page) 598 { 599 int nid = page_to_nid(page); 600 601 return soft_limit_tree.rb_tree_per_node[nid]; 602 } 603 604 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 605 struct mem_cgroup_tree_per_node *mctz, 606 unsigned long new_usage_in_excess) 607 { 608 struct rb_node **p = &mctz->rb_root.rb_node; 609 struct rb_node *parent = NULL; 610 struct mem_cgroup_per_node *mz_node; 611 bool rightmost = true; 612 613 if (mz->on_tree) 614 return; 615 616 mz->usage_in_excess = new_usage_in_excess; 617 if (!mz->usage_in_excess) 618 return; 619 while (*p) { 620 parent = *p; 621 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 622 tree_node); 623 if (mz->usage_in_excess < mz_node->usage_in_excess) { 624 p = &(*p)->rb_left; 625 rightmost = false; 626 } else { 627 p = &(*p)->rb_right; 628 } 629 } 630 631 if (rightmost) 632 mctz->rb_rightmost = &mz->tree_node; 633 634 rb_link_node(&mz->tree_node, parent, p); 635 rb_insert_color(&mz->tree_node, &mctz->rb_root); 636 mz->on_tree = true; 637 } 638 639 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 640 struct mem_cgroup_tree_per_node *mctz) 641 { 642 if (!mz->on_tree) 643 return; 644 645 if (&mz->tree_node == mctz->rb_rightmost) 646 mctz->rb_rightmost = rb_prev(&mz->tree_node); 647 648 
rb_erase(&mz->tree_node, &mctz->rb_root); 649 mz->on_tree = false; 650 } 651 652 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 653 struct mem_cgroup_tree_per_node *mctz) 654 { 655 unsigned long flags; 656 657 spin_lock_irqsave(&mctz->lock, flags); 658 __mem_cgroup_remove_exceeded(mz, mctz); 659 spin_unlock_irqrestore(&mctz->lock, flags); 660 } 661 662 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 663 { 664 unsigned long nr_pages = page_counter_read(&memcg->memory); 665 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 666 unsigned long excess = 0; 667 668 if (nr_pages > soft_limit) 669 excess = nr_pages - soft_limit; 670 671 return excess; 672 } 673 674 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 675 { 676 unsigned long excess; 677 struct mem_cgroup_per_node *mz; 678 struct mem_cgroup_tree_per_node *mctz; 679 680 mctz = soft_limit_tree_from_page(page); 681 if (!mctz) 682 return; 683 /* 684 * Necessary to update all ancestors when hierarchy is used. 685 * because their event counter is not touched. 686 */ 687 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 688 mz = mem_cgroup_page_nodeinfo(memcg, page); 689 excess = soft_limit_excess(memcg); 690 /* 691 * We have to update the tree if mz is on RB-tree or 692 * mem is over its softlimit. 693 */ 694 if (excess || mz->on_tree) { 695 unsigned long flags; 696 697 spin_lock_irqsave(&mctz->lock, flags); 698 /* if on-tree, remove it */ 699 if (mz->on_tree) 700 __mem_cgroup_remove_exceeded(mz, mctz); 701 /* 702 * Insert again. mz->usage_in_excess will be updated. 703 * If excess is 0, no tree ops. 
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

/* Remove @memcg's per-node entries from every existing soft-limit tree. */
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

/*
 * Pick the rightmost (largest excess) entry of @mctz, remove it from the
 * tree and return it with a css reference held.  Entries that no longer
 * exceed their soft limit, or whose css reference cannot be obtained, are
 * dropped and the search is retried.  Returns NULL when the tree is empty.
 * The unlocked variant: mem_cgroup_largest_soft_limit_node() takes
 * mctz->lock around it.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

/* Locked wrapper around __mem_cgroup_largest_soft_limit_node(). */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/**
 * __mod_memcg_state - update cgroup memory statistics
 * @memcg: the memory cgroup
 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 * @val: delta to add to the counter, can be negative
 *
 * The delta is accumulated in a per-cpu counter.  Once its magnitude
 * exceeds MEMCG_CHARGE_BATCH (scaled by PAGE_SHIFT for items counted in
 * bytes), the accumulated value is folded into the local per-cpu counter
 * and into the atomic counters of @memcg and all of its ancestors, and
 * the per-cpu counter restarts from zero.
 */
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
	long x, threshold = MEMCG_CHARGE_BATCH;

	if (mem_cgroup_disabled())
		return;

	if (memcg_stat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
	if (unlikely(abs(x) > threshold)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmstats[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}

/*
 * Per-node info of the parent of @pn's memcg on node @nid, or NULL if
 * that memcg has no parent.
 */
static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
	struct mem_cgroup *parent;

	parent = parent_mem_cgroup(pn->memcg);
	if (!parent)
		return NULL;
	return mem_cgroup_nodeinfo(parent, nid);
}

/*
 * Update the memcg-level and lruvec-level counters for @idx by @val.
 * The lruvec-local counter is updated immediately; the hierarchical
 * lruvec counters are batched per cpu with the same threshold scheme as
 * __mod_memcg_state() and flushed to this lruvec's per-node info and
 * those of all ancestors.
 */
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			      int val)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup *memcg;
	long x, threshold = MEMCG_CHARGE_BATCH;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	memcg = pn->memcg;

	/* Update memcg */
	__mod_memcg_state(memcg, idx, val);

	/* Update lruvec */
	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);

	if (vmstat_item_in_bytes(idx))
		threshold <<= PAGE_SHIFT;

	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
	if (unlikely(abs(x) > threshold)) {
		pg_data_t *pgdat = lruvec_pgdat(lruvec);
		struct mem_cgroup_per_node *pi;

		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
			atomic_long_add(x, &pi->lruvec_stat[idx]);
		x = 0;
	}
	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}

/**
 * __mod_lruvec_state - update lruvec memory statistics
 * @lruvec: the lruvec
 * @idx: the stat item
 * @val: delta to add to the counter, can be negative
 *
 * The lruvec is the intersection of the NUMA node and a cgroup.
 * This function updates all three counters that are affected by a
 * change of state at this level: per-node, per-cgroup, per-lruvec.
 */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
			int val)
{
	/* Update node */
	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);

	/* Update memcg and lruvec */
	if (!mem_cgroup_disabled())
		__mod_memcg_lruvec_state(lruvec, idx, val);
}

/*
 * Update the statistics for @idx on behalf of @page.  The memcg is taken
 * from the compound head, the node from @page itself.
 */
void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
			     int val)
{
	struct page *head = compound_head(page); /* rmap on tail pages */
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!head->mem_cgroup) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
	__mod_lruvec_state(lruvec, idx, val);
}
EXPORT_SYMBOL(__mod_lruvec_page_state);

/*
 * Update the statistics for @idx on behalf of the kernel object at @p.
 * The memcg lookup and the update are done under rcu_read_lock().
 */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	memcg = mem_cgroup_from_obj(p);

	/*
	 * Untracked pages have no memcg, no lruvec. Update only the
	 * node. If we reparent the slab objects to the root memcg,
	 * when we free the slab object, we need to update the per-memcg
	 * vmstats to keep it correct for the root memcg.
	 */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
	} else {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
	}
	rcu_read_unlock();
}

/**
 * __count_memcg_events - account VM events in a cgroup
 * @memcg: the memory cgroup
 * @idx: the event item
 * @count: the number of events that occurred
 *
 * Events are accumulated per cpu and, once more than MEMCG_CHARGE_BATCH
 * are pending, flushed to the local counter and to the atomic counters of
 * @memcg and all of its ancestors.
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
			  unsigned long count)
{
	unsigned long x;

	if (mem_cgroup_disabled())
		return;

	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		struct mem_cgroup *mi;

		/*
		 * Batch local counters to keep them in sync with
		 * the hierarchical ones.
		 */
		__this_cpu_add(memcg->vmstats_local->events[idx], x);
		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
			atomic_long_add(x, &mi->vmevents[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}

/* Flushed (hierarchical) count of @event for @memcg. */
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
	return atomic_long_read(&memcg->vmevents[event]);
}

/* Sum of the local per-cpu counts of @event for @memcg over all possible CPUs. */
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
	long x = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		x += per_cpu(memcg->vmstats_local->events[event], cpu);
	return x;
}

/*
 * Account one PGPGIN (@nr_pages > 0) or PGPGOUT event for @memcg and add
 * the absolute number of pages to the per-cpu nr_page_events counter.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/* pagein of a big page is an event.
So, ignore page size */ 948 if (nr_pages > 0) 949 __count_memcg_events(memcg, PGPGIN, 1); 950 else { 951 __count_memcg_events(memcg, PGPGOUT, 1); 952 nr_pages = -nr_pages; /* for event */ 953 } 954 955 __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); 956 } 957 958 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 959 enum mem_cgroup_events_target target) 960 { 961 unsigned long val, next; 962 963 val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); 964 next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); 965 /* from time_after() in jiffies.h */ 966 if ((long)(next - val) < 0) { 967 switch (target) { 968 case MEM_CGROUP_TARGET_THRESH: 969 next = val + THRESHOLDS_EVENTS_TARGET; 970 break; 971 case MEM_CGROUP_TARGET_SOFTLIMIT: 972 next = val + SOFTLIMIT_EVENTS_TARGET; 973 break; 974 default: 975 break; 976 } 977 __this_cpu_write(memcg->vmstats_percpu->targets[target], next); 978 return true; 979 } 980 return false; 981 } 982 983 /* 984 * Check events in order. 985 * 986 */ 987 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 988 { 989 /* threshold event is triggered in finer grain than soft limit */ 990 if (unlikely(mem_cgroup_event_ratelimit(memcg, 991 MEM_CGROUP_TARGET_THRESH))) { 992 bool do_softlimit; 993 994 do_softlimit = mem_cgroup_event_ratelimit(memcg, 995 MEM_CGROUP_TARGET_SOFTLIMIT); 996 mem_cgroup_threshold(memcg); 997 if (unlikely(do_softlimit)) 998 mem_cgroup_update_tree(memcg, page); 999 } 1000 } 1001 1002 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1003 { 1004 /* 1005 * mm_update_next_owner() may clear mm->owner to NULL 1006 * if it races with swapoff, page migration, etc. 1007 * So this can be called with p == NULL. 
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm - Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	/* Retry the lookup until a css reference is obtained. */
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page - Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	/* Page should not get uncharged and freed memcg under us.
*/ 1067 if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) 1068 memcg = root_mem_cgroup; 1069 rcu_read_unlock(); 1070 return memcg; 1071 } 1072 EXPORT_SYMBOL(get_mem_cgroup_from_page); 1073 1074 static __always_inline struct mem_cgroup *active_memcg(void) 1075 { 1076 if (in_interrupt()) 1077 return this_cpu_read(int_active_memcg); 1078 else 1079 return current->active_memcg; 1080 } 1081 1082 static __always_inline struct mem_cgroup *get_active_memcg(void) 1083 { 1084 struct mem_cgroup *memcg; 1085 1086 rcu_read_lock(); 1087 memcg = active_memcg(); 1088 if (memcg) { 1089 /* current->active_memcg must hold a ref. */ 1090 if (WARN_ON_ONCE(!css_tryget(&memcg->css))) 1091 memcg = root_mem_cgroup; 1092 else 1093 memcg = current->active_memcg; 1094 } 1095 rcu_read_unlock(); 1096 1097 return memcg; 1098 } 1099 1100 static __always_inline bool memcg_kmem_bypass(void) 1101 { 1102 /* Allow remote memcg charging from any context. */ 1103 if (unlikely(active_memcg())) 1104 return false; 1105 1106 /* Memcg to charge can't be determined. */ 1107 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 1108 return true; 1109 1110 return false; 1111 } 1112 1113 /** 1114 * If active memcg is set, do not fallback to current->mm->memcg. 1115 */ 1116 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) 1117 { 1118 if (memcg_kmem_bypass()) 1119 return NULL; 1120 1121 if (unlikely(active_memcg())) 1122 return get_active_memcg(); 1123 1124 return get_mem_cgroup_from_mm(current->mm); 1125 } 1126 1127 /** 1128 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1129 * @root: hierarchy root 1130 * @prev: previously returned memcg, NULL on first invocation 1131 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1132 * 1133 * Returns references to children of the hierarchy below @root, or 1134 * @root itself, or %NULL after a full round-trip. 
1135 * 1136 * Caller must pass the return value in @prev on subsequent 1137 * invocations for reference counting, or use mem_cgroup_iter_break() 1138 * to cancel a hierarchy walk before the round-trip is complete. 1139 * 1140 * Reclaimers can specify a node in @reclaim to divide up the memcgs 1141 * in the hierarchy among all concurrent reclaimers operating on the 1142 * same node. 1143 */ 1144 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1145 struct mem_cgroup *prev, 1146 struct mem_cgroup_reclaim_cookie *reclaim) 1147 { 1148 struct mem_cgroup_reclaim_iter *iter; 1149 struct cgroup_subsys_state *css = NULL; 1150 struct mem_cgroup *memcg = NULL; 1151 struct mem_cgroup *pos = NULL; 1152 1153 if (mem_cgroup_disabled()) 1154 return NULL; 1155 1156 if (!root) 1157 root = root_mem_cgroup; 1158 1159 if (prev && !reclaim) 1160 pos = prev; 1161 1162 rcu_read_lock(); 1163 1164 if (reclaim) { 1165 struct mem_cgroup_per_node *mz; 1166 1167 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); 1168 iter = &mz->iter; 1169 1170 if (prev && reclaim->generation != iter->generation) 1171 goto out_unlock; 1172 1173 while (1) { 1174 pos = READ_ONCE(iter->position); 1175 if (!pos || css_tryget(&pos->css)) 1176 break; 1177 /* 1178 * css reference reached zero, so iter->position will 1179 * be cleared by ->css_released. However, we should not 1180 * rely on this happening soon, because ->css_released 1181 * is called from a work queue, and by busy-waiting we 1182 * might block it. So we clear iter->position right 1183 * away. 1184 */ 1185 (void)cmpxchg(&iter->position, pos, NULL); 1186 } 1187 } 1188 1189 if (pos) 1190 css = &pos->css; 1191 1192 for (;;) { 1193 css = css_next_descendant_pre(css, &root->css); 1194 if (!css) { 1195 /* 1196 * Reclaimers share the hierarchy walk, and a 1197 * new one might jump in right at the end of 1198 * the hierarchy - make sure they see at least 1199 * one group and restart from the beginning. 
1200 */ 1201 if (!prev) 1202 continue; 1203 break; 1204 } 1205 1206 /* 1207 * Verify the css and acquire a reference. The root 1208 * is provided by the caller, so we know it's alive 1209 * and kicking, and don't take an extra reference. 1210 */ 1211 memcg = mem_cgroup_from_css(css); 1212 1213 if (css == &root->css) 1214 break; 1215 1216 if (css_tryget(css)) 1217 break; 1218 1219 memcg = NULL; 1220 } 1221 1222 if (reclaim) { 1223 /* 1224 * The position could have already been updated by a competing 1225 * thread, so check that the value hasn't changed since we read 1226 * it to avoid reclaiming from the same cgroup twice. 1227 */ 1228 (void)cmpxchg(&iter->position, pos, memcg); 1229 1230 if (pos) 1231 css_put(&pos->css); 1232 1233 if (!memcg) 1234 iter->generation++; 1235 else if (!prev) 1236 reclaim->generation = iter->generation; 1237 } 1238 1239 out_unlock: 1240 rcu_read_unlock(); 1241 if (prev && prev != root) 1242 css_put(&prev->css); 1243 1244 return memcg; 1245 } 1246 1247 /** 1248 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1249 * @root: hierarchy root 1250 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1251 */ 1252 void mem_cgroup_iter_break(struct mem_cgroup *root, 1253 struct mem_cgroup *prev) 1254 { 1255 if (!root) 1256 root = root_mem_cgroup; 1257 if (prev && prev != root) 1258 css_put(&prev->css); 1259 } 1260 1261 static void __invalidate_reclaim_iterators(struct mem_cgroup *from, 1262 struct mem_cgroup *dead_memcg) 1263 { 1264 struct mem_cgroup_reclaim_iter *iter; 1265 struct mem_cgroup_per_node *mz; 1266 int nid; 1267 1268 for_each_node(nid) { 1269 mz = mem_cgroup_nodeinfo(from, nid); 1270 iter = &mz->iter; 1271 cmpxchg(&iter->position, dead_memcg, NULL); 1272 } 1273 } 1274 1275 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) 1276 { 1277 struct mem_cgroup *memcg = dead_memcg; 1278 struct mem_cgroup *last; 1279 1280 do { 1281 __invalidate_reclaim_iterators(memcg, dead_memcg); 1282 last 
= memcg; 1283 } while ((memcg = parent_mem_cgroup(memcg))); 1284 1285 /* 1286 * When cgruop1 non-hierarchy mode is used, 1287 * parent_mem_cgroup() does not walk all the way up to the 1288 * cgroup root (root_mem_cgroup). So we have to handle 1289 * dead_memcg from cgroup root separately. 1290 */ 1291 if (last != root_mem_cgroup) 1292 __invalidate_reclaim_iterators(root_mem_cgroup, 1293 dead_memcg); 1294 } 1295 1296 /** 1297 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy 1298 * @memcg: hierarchy root 1299 * @fn: function to call for each task 1300 * @arg: argument passed to @fn 1301 * 1302 * This function iterates over tasks attached to @memcg or to any of its 1303 * descendants and calls @fn for each task. If @fn returns a non-zero 1304 * value, the function breaks the iteration loop and returns the value. 1305 * Otherwise, it will iterate over all tasks and return 0. 1306 * 1307 * This function must not be called for the root memory cgroup. 1308 */ 1309 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, 1310 int (*fn)(struct task_struct *, void *), void *arg) 1311 { 1312 struct mem_cgroup *iter; 1313 int ret = 0; 1314 1315 BUG_ON(memcg == root_mem_cgroup); 1316 1317 for_each_mem_cgroup_tree(iter, memcg) { 1318 struct css_task_iter it; 1319 struct task_struct *task; 1320 1321 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); 1322 while (!ret && (task = css_task_iter_next(&it))) 1323 ret = fn(task, arg); 1324 css_task_iter_end(&it); 1325 if (ret) { 1326 mem_cgroup_iter_break(memcg, iter); 1327 break; 1328 } 1329 } 1330 return ret; 1331 } 1332 1333 /** 1334 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page 1335 * @page: the page 1336 * @pgdat: pgdat of the page 1337 * 1338 * This function relies on page's memcg being stable - see the 1339 * access rules in commit_charge(). 
1340 */ 1341 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) 1342 { 1343 struct mem_cgroup_per_node *mz; 1344 struct mem_cgroup *memcg; 1345 struct lruvec *lruvec; 1346 1347 if (mem_cgroup_disabled()) { 1348 lruvec = &pgdat->__lruvec; 1349 goto out; 1350 } 1351 1352 memcg = page->mem_cgroup; 1353 /* 1354 * Swapcache readahead pages are added to the LRU - and 1355 * possibly migrated - before they are charged. 1356 */ 1357 if (!memcg) 1358 memcg = root_mem_cgroup; 1359 1360 mz = mem_cgroup_page_nodeinfo(memcg, page); 1361 lruvec = &mz->lruvec; 1362 out: 1363 /* 1364 * Since a node can be onlined after the mem_cgroup was created, 1365 * we have to be prepared to initialize lruvec->zone here; 1366 * and if offlined then reonlined, we need to reinitialize it. 1367 */ 1368 if (unlikely(lruvec->pgdat != pgdat)) 1369 lruvec->pgdat = pgdat; 1370 return lruvec; 1371 } 1372 1373 /** 1374 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1375 * @lruvec: mem_cgroup per zone lru vector 1376 * @lru: index of lru list the page is sitting on 1377 * @zid: zone id of the accounted pages 1378 * @nr_pages: positive when adding or negative when removing 1379 * 1380 * This function must be called under lru_lock, just before a page is added 1381 * to or just after a page is removed from an lru list (that ordering being 1382 * so as to allow it to check that lru_size 0 is consistent with list_empty). 
1383 */ 1384 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1385 int zid, int nr_pages) 1386 { 1387 struct mem_cgroup_per_node *mz; 1388 unsigned long *lru_size; 1389 long size; 1390 1391 if (mem_cgroup_disabled()) 1392 return; 1393 1394 mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); 1395 lru_size = &mz->lru_zone_size[zid][lru]; 1396 1397 if (nr_pages < 0) 1398 *lru_size += nr_pages; 1399 1400 size = *lru_size; 1401 if (WARN_ONCE(size < 0, 1402 "%s(%p, %d, %d): lru_size %ld\n", 1403 __func__, lruvec, lru, nr_pages, size)) { 1404 VM_BUG_ON(1); 1405 *lru_size = 0; 1406 } 1407 1408 if (nr_pages > 0) 1409 *lru_size += nr_pages; 1410 } 1411 1412 /** 1413 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1414 * @memcg: the memory cgroup 1415 * 1416 * Returns the maximum amount of memory @mem can be charged with, in 1417 * pages. 1418 */ 1419 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1420 { 1421 unsigned long margin = 0; 1422 unsigned long count; 1423 unsigned long limit; 1424 1425 count = page_counter_read(&memcg->memory); 1426 limit = READ_ONCE(memcg->memory.max); 1427 if (count < limit) 1428 margin = limit - count; 1429 1430 if (do_memsw_account()) { 1431 count = page_counter_read(&memcg->memsw); 1432 limit = READ_ONCE(memcg->memsw.max); 1433 if (count < limit) 1434 margin = min(margin, limit - count); 1435 else 1436 margin = 0; 1437 } 1438 1439 return margin; 1440 } 1441 1442 /* 1443 * A routine for checking "mem" is under move_account() or not. 1444 * 1445 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1446 * moving cgroups. This is for waiting at high-memory pressure 1447 * caused by "move". 1448 */ 1449 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1450 { 1451 struct mem_cgroup *from; 1452 struct mem_cgroup *to; 1453 bool ret = false; 1454 /* 1455 * Unlike task_move routines, we access mc.to, mc.from not under 1456 * mutual exclusion by cgroup_mutex. 
Here, we take spinlock instead. 1457 */ 1458 spin_lock(&mc.lock); 1459 from = mc.from; 1460 to = mc.to; 1461 if (!from) 1462 goto unlock; 1463 1464 ret = mem_cgroup_is_descendant(from, memcg) || 1465 mem_cgroup_is_descendant(to, memcg); 1466 unlock: 1467 spin_unlock(&mc.lock); 1468 return ret; 1469 } 1470 1471 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1472 { 1473 if (mc.moving_task && current != mc.moving_task) { 1474 if (mem_cgroup_under_move(memcg)) { 1475 DEFINE_WAIT(wait); 1476 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1477 /* moving charge context might have finished. */ 1478 if (mc.moving_task) 1479 schedule(); 1480 finish_wait(&mc.waitq, &wait); 1481 return true; 1482 } 1483 } 1484 return false; 1485 } 1486 1487 struct memory_stat { 1488 const char *name; 1489 unsigned int ratio; 1490 unsigned int idx; 1491 }; 1492 1493 static struct memory_stat memory_stats[] = { 1494 { "anon", PAGE_SIZE, NR_ANON_MAPPED }, 1495 { "file", PAGE_SIZE, NR_FILE_PAGES }, 1496 { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, 1497 { "pagetables", PAGE_SIZE, NR_PAGETABLE }, 1498 { "percpu", 1, MEMCG_PERCPU_B }, 1499 { "sock", PAGE_SIZE, MEMCG_SOCK }, 1500 { "shmem", PAGE_SIZE, NR_SHMEM }, 1501 { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, 1502 { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, 1503 { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, 1504 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1505 /* 1506 * The ratio will be initialized in memory_stats_init(). Because 1507 * on some architectures, the macro of HPAGE_PMD_SIZE is not 1508 * constant(e.g. powerpc). 
1509 */ 1510 { "anon_thp", 0, NR_ANON_THPS }, 1511 { "file_thp", 0, NR_FILE_THPS }, 1512 { "shmem_thp", 0, NR_SHMEM_THPS }, 1513 #endif 1514 { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, 1515 { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, 1516 { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, 1517 { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, 1518 { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, 1519 1520 /* 1521 * Note: The slab_reclaimable and slab_unreclaimable must be 1522 * together and slab_reclaimable must be in front. 1523 */ 1524 { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, 1525 { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, 1526 1527 /* The memory events */ 1528 { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, 1529 { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, 1530 { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, 1531 { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, 1532 { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, 1533 { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, 1534 { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, 1535 }; 1536 1537 static int __init memory_stats_init(void) 1538 { 1539 int i; 1540 1541 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1542 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1543 if (memory_stats[i].idx == NR_ANON_THPS || 1544 memory_stats[i].idx == NR_FILE_THPS || 1545 memory_stats[i].idx == NR_SHMEM_THPS) 1546 memory_stats[i].ratio = HPAGE_PMD_SIZE; 1547 #endif 1548 VM_BUG_ON(!memory_stats[i].ratio); 1549 VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); 1550 } 1551 1552 return 0; 1553 } 1554 pure_initcall(memory_stats_init); 1555 1556 static char *memory_stat_format(struct mem_cgroup *memcg) 1557 { 1558 struct seq_buf s; 1559 int i; 1560 1561 seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); 1562 if (!s.buffer) 1563 return NULL; 1564 1565 /* 1566 * Provide statistics on the state of the memory subsystem as 1567 * well as cumulative 
event counters that show past behavior. 1568 * 1569 * This list is ordered following a combination of these gradients: 1570 * 1) generic big picture -> specifics and details 1571 * 2) reflecting userspace activity -> reflecting kernel heuristics 1572 * 1573 * Current memory state: 1574 */ 1575 1576 for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { 1577 u64 size; 1578 1579 size = memcg_page_state(memcg, memory_stats[i].idx); 1580 size *= memory_stats[i].ratio; 1581 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); 1582 1583 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { 1584 size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + 1585 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); 1586 seq_buf_printf(&s, "slab %llu\n", size); 1587 } 1588 } 1589 1590 /* Accumulated memory events */ 1591 1592 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), 1593 memcg_events(memcg, PGFAULT)); 1594 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), 1595 memcg_events(memcg, PGMAJFAULT)); 1596 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), 1597 memcg_events(memcg, PGREFILL)); 1598 seq_buf_printf(&s, "pgscan %lu\n", 1599 memcg_events(memcg, PGSCAN_KSWAPD) + 1600 memcg_events(memcg, PGSCAN_DIRECT)); 1601 seq_buf_printf(&s, "pgsteal %lu\n", 1602 memcg_events(memcg, PGSTEAL_KSWAPD) + 1603 memcg_events(memcg, PGSTEAL_DIRECT)); 1604 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), 1605 memcg_events(memcg, PGACTIVATE)); 1606 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), 1607 memcg_events(memcg, PGDEACTIVATE)); 1608 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), 1609 memcg_events(memcg, PGLAZYFREE)); 1610 seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), 1611 memcg_events(memcg, PGLAZYFREED)); 1612 1613 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1614 seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), 1615 memcg_events(memcg, THP_FAULT_ALLOC)); 1616 seq_buf_printf(&s, "%s %lu\n", 
vm_event_name(THP_COLLAPSE_ALLOC), 1617 memcg_events(memcg, THP_COLLAPSE_ALLOC)); 1618 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1619 1620 /* The above should easily fit into one page */ 1621 WARN_ON_ONCE(seq_buf_has_overflowed(&s)); 1622 1623 return s.buffer; 1624 } 1625 1626 #define K(x) ((x) << (PAGE_SHIFT-10)) 1627 /** 1628 * mem_cgroup_print_oom_context: Print OOM information relevant to 1629 * memory controller. 1630 * @memcg: The memory cgroup that went over limit 1631 * @p: Task that is going to be killed 1632 * 1633 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1634 * enabled 1635 */ 1636 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) 1637 { 1638 rcu_read_lock(); 1639 1640 if (memcg) { 1641 pr_cont(",oom_memcg="); 1642 pr_cont_cgroup_path(memcg->css.cgroup); 1643 } else 1644 pr_cont(",global_oom"); 1645 if (p) { 1646 pr_cont(",task_memcg="); 1647 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1648 } 1649 rcu_read_unlock(); 1650 } 1651 1652 /** 1653 * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to 1654 * memory controller. 
1655 * @memcg: The memory cgroup that went over limit 1656 */ 1657 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) 1658 { 1659 char *buf; 1660 1661 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1662 K((u64)page_counter_read(&memcg->memory)), 1663 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); 1664 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1665 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", 1666 K((u64)page_counter_read(&memcg->swap)), 1667 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); 1668 else { 1669 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1670 K((u64)page_counter_read(&memcg->memsw)), 1671 K((u64)memcg->memsw.max), memcg->memsw.failcnt); 1672 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1673 K((u64)page_counter_read(&memcg->kmem)), 1674 K((u64)memcg->kmem.max), memcg->kmem.failcnt); 1675 } 1676 1677 pr_info("Memory cgroup stats for "); 1678 pr_cont_cgroup_path(memcg->css.cgroup); 1679 pr_cont(":"); 1680 buf = memory_stat_format(memcg); 1681 if (!buf) 1682 return; 1683 pr_info("%s", buf); 1684 kfree(buf); 1685 } 1686 1687 /* 1688 * Return the memory (and swap, if configured) limit for a memcg. 
1689 */ 1690 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) 1691 { 1692 unsigned long max = READ_ONCE(memcg->memory.max); 1693 1694 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 1695 if (mem_cgroup_swappiness(memcg)) 1696 max += min(READ_ONCE(memcg->swap.max), 1697 (unsigned long)total_swap_pages); 1698 } else { /* v1 */ 1699 if (mem_cgroup_swappiness(memcg)) { 1700 /* Calculate swap excess capacity from memsw limit */ 1701 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; 1702 1703 max += min(swap, (unsigned long)total_swap_pages); 1704 } 1705 } 1706 return max; 1707 } 1708 1709 unsigned long mem_cgroup_size(struct mem_cgroup *memcg) 1710 { 1711 return page_counter_read(&memcg->memory); 1712 } 1713 1714 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1715 int order) 1716 { 1717 struct oom_control oc = { 1718 .zonelist = NULL, 1719 .nodemask = NULL, 1720 .memcg = memcg, 1721 .gfp_mask = gfp_mask, 1722 .order = order, 1723 }; 1724 bool ret = true; 1725 1726 if (mutex_lock_killable(&oom_lock)) 1727 return true; 1728 1729 if (mem_cgroup_margin(memcg) >= (1 << order)) 1730 goto unlock; 1731 1732 /* 1733 * A few threads which were not waiting at mutex_lock_killable() can 1734 * fail to bail out. Therefore, check again after holding oom_lock. 
1735 */ 1736 ret = should_force_charge() || out_of_memory(&oc); 1737 1738 unlock: 1739 mutex_unlock(&oom_lock); 1740 return ret; 1741 } 1742 1743 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1744 pg_data_t *pgdat, 1745 gfp_t gfp_mask, 1746 unsigned long *total_scanned) 1747 { 1748 struct mem_cgroup *victim = NULL; 1749 int total = 0; 1750 int loop = 0; 1751 unsigned long excess; 1752 unsigned long nr_scanned; 1753 struct mem_cgroup_reclaim_cookie reclaim = { 1754 .pgdat = pgdat, 1755 }; 1756 1757 excess = soft_limit_excess(root_memcg); 1758 1759 while (1) { 1760 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1761 if (!victim) { 1762 loop++; 1763 if (loop >= 2) { 1764 /* 1765 * If we have not been able to reclaim 1766 * anything, it might because there are 1767 * no reclaimable pages under this hierarchy 1768 */ 1769 if (!total) 1770 break; 1771 /* 1772 * We want to do more targeted reclaim. 1773 * excess >> 2 is not to excessive so as to 1774 * reclaim too much, nor too less that we keep 1775 * coming back to reclaim from this cgroup 1776 */ 1777 if (total >= (excess >> 2) || 1778 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1779 break; 1780 } 1781 continue; 1782 } 1783 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 1784 pgdat, &nr_scanned); 1785 *total_scanned += nr_scanned; 1786 if (!soft_limit_excess(root_memcg)) 1787 break; 1788 } 1789 mem_cgroup_iter_break(root_memcg, victim); 1790 return total; 1791 } 1792 1793 #ifdef CONFIG_LOCKDEP 1794 static struct lockdep_map memcg_oom_lock_dep_map = { 1795 .name = "memcg_oom_lock", 1796 }; 1797 #endif 1798 1799 static DEFINE_SPINLOCK(memcg_oom_lock); 1800 1801 /* 1802 * Check OOM-Killer is already running under our hierarchy. 1803 * If someone is running, return false. 
1804 */ 1805 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1806 { 1807 struct mem_cgroup *iter, *failed = NULL; 1808 1809 spin_lock(&memcg_oom_lock); 1810 1811 for_each_mem_cgroup_tree(iter, memcg) { 1812 if (iter->oom_lock) { 1813 /* 1814 * this subtree of our hierarchy is already locked 1815 * so we cannot give a lock. 1816 */ 1817 failed = iter; 1818 mem_cgroup_iter_break(memcg, iter); 1819 break; 1820 } else 1821 iter->oom_lock = true; 1822 } 1823 1824 if (failed) { 1825 /* 1826 * OK, we failed to lock the whole subtree so we have 1827 * to clean up what we set up to the failing subtree 1828 */ 1829 for_each_mem_cgroup_tree(iter, memcg) { 1830 if (iter == failed) { 1831 mem_cgroup_iter_break(memcg, iter); 1832 break; 1833 } 1834 iter->oom_lock = false; 1835 } 1836 } else 1837 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1838 1839 spin_unlock(&memcg_oom_lock); 1840 1841 return !failed; 1842 } 1843 1844 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1845 { 1846 struct mem_cgroup *iter; 1847 1848 spin_lock(&memcg_oom_lock); 1849 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1850 for_each_mem_cgroup_tree(iter, memcg) 1851 iter->oom_lock = false; 1852 spin_unlock(&memcg_oom_lock); 1853 } 1854 1855 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1856 { 1857 struct mem_cgroup *iter; 1858 1859 spin_lock(&memcg_oom_lock); 1860 for_each_mem_cgroup_tree(iter, memcg) 1861 iter->under_oom++; 1862 spin_unlock(&memcg_oom_lock); 1863 } 1864 1865 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1866 { 1867 struct mem_cgroup *iter; 1868 1869 /* 1870 * Be careful about under_oom underflows becase a child memcg 1871 * could have been added after mem_cgroup_mark_under_oom. 
1872 */ 1873 spin_lock(&memcg_oom_lock); 1874 for_each_mem_cgroup_tree(iter, memcg) 1875 if (iter->under_oom > 0) 1876 iter->under_oom--; 1877 spin_unlock(&memcg_oom_lock); 1878 } 1879 1880 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1881 1882 struct oom_wait_info { 1883 struct mem_cgroup *memcg; 1884 wait_queue_entry_t wait; 1885 }; 1886 1887 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1888 unsigned mode, int sync, void *arg) 1889 { 1890 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1891 struct mem_cgroup *oom_wait_memcg; 1892 struct oom_wait_info *oom_wait_info; 1893 1894 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1895 oom_wait_memcg = oom_wait_info->memcg; 1896 1897 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1898 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1899 return 0; 1900 return autoremove_wake_function(wait, mode, sync, arg); 1901 } 1902 1903 static void memcg_oom_recover(struct mem_cgroup *memcg) 1904 { 1905 /* 1906 * For the following lockless ->under_oom test, the only required 1907 * guarantee is that it must see the state asserted by an OOM when 1908 * this function is called as a result of userland actions 1909 * triggered by the notification of the OOM. This is trivially 1910 * achieved by invoking mem_cgroup_mark_under_oom() before 1911 * triggering notification. 
1912 */ 1913 if (memcg && memcg->under_oom) 1914 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1915 } 1916 1917 enum oom_status { 1918 OOM_SUCCESS, 1919 OOM_FAILED, 1920 OOM_ASYNC, 1921 OOM_SKIPPED 1922 }; 1923 1924 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1925 { 1926 enum oom_status ret; 1927 bool locked; 1928 1929 if (order > PAGE_ALLOC_COSTLY_ORDER) 1930 return OOM_SKIPPED; 1931 1932 memcg_memory_event(memcg, MEMCG_OOM); 1933 1934 /* 1935 * We are in the middle of the charge context here, so we 1936 * don't want to block when potentially sitting on a callstack 1937 * that holds all kinds of filesystem and mm locks. 1938 * 1939 * cgroup1 allows disabling the OOM killer and waiting for outside 1940 * handling until the charge can succeed; remember the context and put 1941 * the task to sleep at the end of the page fault when all locks are 1942 * released. 1943 * 1944 * On the other hand, in-kernel OOM killer allows for an async victim 1945 * memory reclaim (oom_reaper) and that means that we are not solely 1946 * relying on the oom victim to make a forward progress and we can 1947 * invoke the oom killer here. 1948 * 1949 * Please note that mem_cgroup_out_of_memory might fail to find a 1950 * victim and then we have to bail out from the charge path. 
1951 */ 1952 if (memcg->oom_kill_disable) { 1953 if (!current->in_user_fault) 1954 return OOM_SKIPPED; 1955 css_get(&memcg->css); 1956 current->memcg_in_oom = memcg; 1957 current->memcg_oom_gfp_mask = mask; 1958 current->memcg_oom_order = order; 1959 1960 return OOM_ASYNC; 1961 } 1962 1963 mem_cgroup_mark_under_oom(memcg); 1964 1965 locked = mem_cgroup_oom_trylock(memcg); 1966 1967 if (locked) 1968 mem_cgroup_oom_notify(memcg); 1969 1970 mem_cgroup_unmark_under_oom(memcg); 1971 if (mem_cgroup_out_of_memory(memcg, mask, order)) 1972 ret = OOM_SUCCESS; 1973 else 1974 ret = OOM_FAILED; 1975 1976 if (locked) 1977 mem_cgroup_oom_unlock(memcg); 1978 1979 return ret; 1980 } 1981 1982 /** 1983 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1984 * @handle: actually kill/wait or just clean up the OOM state 1985 * 1986 * This has to be called at the end of a page fault if the memcg OOM 1987 * handler was enabled. 1988 * 1989 * Memcg supports userspace OOM handling where failed allocations must 1990 * sleep on a waitqueue until the userspace task resolves the 1991 * situation. Sleeping directly in the charge context with all kinds 1992 * of locks held is not a good idea, instead we remember an OOM state 1993 * in the task and mem_cgroup_oom_synchronize() has to be called at 1994 * the end of the page fault to complete the OOM handling. 1995 * 1996 * Returns %true if an ongoing memcg OOM situation was detected and 1997 * completed, %false otherwise. 
1998 */ 1999 bool mem_cgroup_oom_synchronize(bool handle) 2000 { 2001 struct mem_cgroup *memcg = current->memcg_in_oom; 2002 struct oom_wait_info owait; 2003 bool locked; 2004 2005 /* OOM is global, do not handle */ 2006 if (!memcg) 2007 return false; 2008 2009 if (!handle) 2010 goto cleanup; 2011 2012 owait.memcg = memcg; 2013 owait.wait.flags = 0; 2014 owait.wait.func = memcg_oom_wake_function; 2015 owait.wait.private = current; 2016 INIT_LIST_HEAD(&owait.wait.entry); 2017 2018 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2019 mem_cgroup_mark_under_oom(memcg); 2020 2021 locked = mem_cgroup_oom_trylock(memcg); 2022 2023 if (locked) 2024 mem_cgroup_oom_notify(memcg); 2025 2026 if (locked && !memcg->oom_kill_disable) { 2027 mem_cgroup_unmark_under_oom(memcg); 2028 finish_wait(&memcg_oom_waitq, &owait.wait); 2029 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, 2030 current->memcg_oom_order); 2031 } else { 2032 schedule(); 2033 mem_cgroup_unmark_under_oom(memcg); 2034 finish_wait(&memcg_oom_waitq, &owait.wait); 2035 } 2036 2037 if (locked) { 2038 mem_cgroup_oom_unlock(memcg); 2039 /* 2040 * There is no guarantee that an OOM-lock contender 2041 * sees the wakeups triggered by the OOM kill 2042 * uncharges. Wake any sleepers explicitely. 2043 */ 2044 memcg_oom_recover(memcg); 2045 } 2046 cleanup: 2047 current->memcg_in_oom = NULL; 2048 css_put(&memcg->css); 2049 return true; 2050 } 2051 2052 /** 2053 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM 2054 * @victim: task to be killed by the OOM killer 2055 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM 2056 * 2057 * Returns a pointer to a memory cgroup, which has to be cleaned up 2058 * by killing all belonging OOM-killable tasks. 2059 * 2060 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. 
2061 */ 2062 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, 2063 struct mem_cgroup *oom_domain) 2064 { 2065 struct mem_cgroup *oom_group = NULL; 2066 struct mem_cgroup *memcg; 2067 2068 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 2069 return NULL; 2070 2071 if (!oom_domain) 2072 oom_domain = root_mem_cgroup; 2073 2074 rcu_read_lock(); 2075 2076 memcg = mem_cgroup_from_task(victim); 2077 if (memcg == root_mem_cgroup) 2078 goto out; 2079 2080 /* 2081 * If the victim task has been asynchronously moved to a different 2082 * memory cgroup, we might end up killing tasks outside oom_domain. 2083 * In this case it's better to ignore memory.group.oom. 2084 */ 2085 if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain))) 2086 goto out; 2087 2088 /* 2089 * Traverse the memory cgroup hierarchy from the victim task's 2090 * cgroup up to the OOMing cgroup (or root) to find the 2091 * highest-level memory cgroup with oom.group set. 2092 */ 2093 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 2094 if (memcg->oom_group) 2095 oom_group = memcg; 2096 2097 if (memcg == oom_domain) 2098 break; 2099 } 2100 2101 if (oom_group) 2102 css_get(&oom_group->css); 2103 out: 2104 rcu_read_unlock(); 2105 2106 return oom_group; 2107 } 2108 2109 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) 2110 { 2111 pr_info("Tasks in "); 2112 pr_cont_cgroup_path(memcg->css.cgroup); 2113 pr_cont(" are going to be killed due to memory.oom.group set\n"); 2114 } 2115 2116 /** 2117 * lock_page_memcg - lock a page->mem_cgroup binding 2118 * @page: the page 2119 * 2120 * This function protects unlocked LRU pages from being moved to 2121 * another cgroup. 2122 * 2123 * It ensures lifetime of the returned memcg. Caller is responsible 2124 * for the lifetime of the page; __unlock_page_memcg() is available 2125 * when @page might get freed inside the locked section. 
2126 */ 2127 struct mem_cgroup *lock_page_memcg(struct page *page) 2128 { 2129 struct page *head = compound_head(page); /* rmap on tail pages */ 2130 struct mem_cgroup *memcg; 2131 unsigned long flags; 2132 2133 /* 2134 * The RCU lock is held throughout the transaction. The fast 2135 * path can get away without acquiring the memcg->move_lock 2136 * because page moving starts with an RCU grace period. 2137 * 2138 * The RCU lock also protects the memcg from being freed when 2139 * the page state that is going to change is the only thing 2140 * preventing the page itself from being freed. E.g. writeback 2141 * doesn't hold a page reference and relies on PG_writeback to 2142 * keep off truncation, migration and so forth. 2143 */ 2144 rcu_read_lock(); 2145 2146 if (mem_cgroup_disabled()) 2147 return NULL; 2148 again: 2149 memcg = head->mem_cgroup; 2150 if (unlikely(!memcg)) 2151 return NULL; 2152 2153 if (atomic_read(&memcg->moving_account) <= 0) 2154 return memcg; 2155 2156 spin_lock_irqsave(&memcg->move_lock, flags); 2157 if (memcg != head->mem_cgroup) { 2158 spin_unlock_irqrestore(&memcg->move_lock, flags); 2159 goto again; 2160 } 2161 2162 /* 2163 * When charge migration first begins, we can have locked and 2164 * unlocked page stat updates happening concurrently. Track 2165 * the task who has the lock for unlock_page_memcg(). 2166 */ 2167 memcg->move_lock_task = current; 2168 memcg->move_lock_flags = flags; 2169 2170 return memcg; 2171 } 2172 EXPORT_SYMBOL(lock_page_memcg); 2173 2174 /** 2175 * __unlock_page_memcg - unlock and unpin a memcg 2176 * @memcg: the memcg 2177 * 2178 * Unlock and unpin a memcg returned by lock_page_memcg(). 
2179 */ 2180 void __unlock_page_memcg(struct mem_cgroup *memcg) 2181 { 2182 if (memcg && memcg->move_lock_task == current) { 2183 unsigned long flags = memcg->move_lock_flags; 2184 2185 memcg->move_lock_task = NULL; 2186 memcg->move_lock_flags = 0; 2187 2188 spin_unlock_irqrestore(&memcg->move_lock, flags); 2189 } 2190 2191 rcu_read_unlock(); 2192 } 2193 2194 /** 2195 * unlock_page_memcg - unlock a page->mem_cgroup binding 2196 * @page: the page 2197 */ 2198 void unlock_page_memcg(struct page *page) 2199 { 2200 struct page *head = compound_head(page); 2201 2202 __unlock_page_memcg(head->mem_cgroup); 2203 } 2204 EXPORT_SYMBOL(unlock_page_memcg); 2205 2206 struct memcg_stock_pcp { 2207 struct mem_cgroup *cached; /* this never be root cgroup */ 2208 unsigned int nr_pages; 2209 2210 #ifdef CONFIG_MEMCG_KMEM 2211 struct obj_cgroup *cached_objcg; 2212 unsigned int nr_bytes; 2213 #endif 2214 2215 struct work_struct work; 2216 unsigned long flags; 2217 #define FLUSHING_CACHED_CHARGE 0 2218 }; 2219 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2220 static DEFINE_MUTEX(percpu_charge_mutex); 2221 2222 #ifdef CONFIG_MEMCG_KMEM 2223 static void drain_obj_stock(struct memcg_stock_pcp *stock); 2224 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2225 struct mem_cgroup *root_memcg); 2226 2227 #else 2228 static inline void drain_obj_stock(struct memcg_stock_pcp *stock) 2229 { 2230 } 2231 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, 2232 struct mem_cgroup *root_memcg) 2233 { 2234 return false; 2235 } 2236 #endif 2237 2238 /** 2239 * consume_stock: Try to consume stocked charge on this cpu. 2240 * @memcg: memcg to consume from. 2241 * @nr_pages: how many pages to charge. 2242 * 2243 * The charges will only happen if @memcg matches the current cpu's memcg 2244 * stock, and at least @nr_pages are available in that stock. Failure to 2245 * service an allocation will refill the stock. 
2246 * 2247 * returns true if successful, false otherwise. 2248 */ 2249 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2250 { 2251 struct memcg_stock_pcp *stock; 2252 unsigned long flags; 2253 bool ret = false; 2254 2255 if (nr_pages > MEMCG_CHARGE_BATCH) 2256 return ret; 2257 2258 local_irq_save(flags); 2259 2260 stock = this_cpu_ptr(&memcg_stock); 2261 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2262 stock->nr_pages -= nr_pages; 2263 ret = true; 2264 } 2265 2266 local_irq_restore(flags); 2267 2268 return ret; 2269 } 2270 2271 /* 2272 * Returns stocks cached in percpu and reset cached information. 2273 */ 2274 static void drain_stock(struct memcg_stock_pcp *stock) 2275 { 2276 struct mem_cgroup *old = stock->cached; 2277 2278 if (!old) 2279 return; 2280 2281 if (stock->nr_pages) { 2282 page_counter_uncharge(&old->memory, stock->nr_pages); 2283 if (do_memsw_account()) 2284 page_counter_uncharge(&old->memsw, stock->nr_pages); 2285 stock->nr_pages = 0; 2286 } 2287 2288 css_put(&old->css); 2289 stock->cached = NULL; 2290 } 2291 2292 static void drain_local_stock(struct work_struct *dummy) 2293 { 2294 struct memcg_stock_pcp *stock; 2295 unsigned long flags; 2296 2297 /* 2298 * The only protection from memory hotplug vs. drain_stock races is 2299 * that we always operate on local CPU stock here with IRQ disabled 2300 */ 2301 local_irq_save(flags); 2302 2303 stock = this_cpu_ptr(&memcg_stock); 2304 drain_obj_stock(stock); 2305 drain_stock(stock); 2306 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2307 2308 local_irq_restore(flags); 2309 } 2310 2311 /* 2312 * Cache charges(val) to local per_cpu area. 2313 * This will be consumed by consume_stock() function, later. 
2314 */ 2315 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2316 { 2317 struct memcg_stock_pcp *stock; 2318 unsigned long flags; 2319 2320 local_irq_save(flags); 2321 2322 stock = this_cpu_ptr(&memcg_stock); 2323 if (stock->cached != memcg) { /* reset if necessary */ 2324 drain_stock(stock); 2325 css_get(&memcg->css); 2326 stock->cached = memcg; 2327 } 2328 stock->nr_pages += nr_pages; 2329 2330 if (stock->nr_pages > MEMCG_CHARGE_BATCH) 2331 drain_stock(stock); 2332 2333 local_irq_restore(flags); 2334 } 2335 2336 /* 2337 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2338 * of the hierarchy under it. 2339 */ 2340 static void drain_all_stock(struct mem_cgroup *root_memcg) 2341 { 2342 int cpu, curcpu; 2343 2344 /* If someone's already draining, avoid adding running more workers. */ 2345 if (!mutex_trylock(&percpu_charge_mutex)) 2346 return; 2347 /* 2348 * Notify other cpus that system-wide "drain" is running 2349 * We do not care about races with the cpu hotplug because cpu down 2350 * as well as workers from this path always operate on the local 2351 * per-cpu data. CPU up doesn't touch memcg_stock at all. 
2352 */ 2353 curcpu = get_cpu(); 2354 for_each_online_cpu(cpu) { 2355 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2356 struct mem_cgroup *memcg; 2357 bool flush = false; 2358 2359 rcu_read_lock(); 2360 memcg = stock->cached; 2361 if (memcg && stock->nr_pages && 2362 mem_cgroup_is_descendant(memcg, root_memcg)) 2363 flush = true; 2364 if (obj_stock_flush_required(stock, root_memcg)) 2365 flush = true; 2366 rcu_read_unlock(); 2367 2368 if (flush && 2369 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2370 if (cpu == curcpu) 2371 drain_local_stock(&stock->work); 2372 else 2373 schedule_work_on(cpu, &stock->work); 2374 } 2375 } 2376 put_cpu(); 2377 mutex_unlock(&percpu_charge_mutex); 2378 } 2379 2380 static int memcg_hotplug_cpu_dead(unsigned int cpu) 2381 { 2382 struct memcg_stock_pcp *stock; 2383 struct mem_cgroup *memcg, *mi; 2384 2385 stock = &per_cpu(memcg_stock, cpu); 2386 drain_stock(stock); 2387 2388 for_each_mem_cgroup(memcg) { 2389 int i; 2390 2391 for (i = 0; i < MEMCG_NR_STAT; i++) { 2392 int nid; 2393 long x; 2394 2395 x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); 2396 if (x) 2397 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2398 atomic_long_add(x, &memcg->vmstats[i]); 2399 2400 if (i >= NR_VM_NODE_STAT_ITEMS) 2401 continue; 2402 2403 for_each_node(nid) { 2404 struct mem_cgroup_per_node *pn; 2405 2406 pn = mem_cgroup_nodeinfo(memcg, nid); 2407 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); 2408 if (x) 2409 do { 2410 atomic_long_add(x, &pn->lruvec_stat[i]); 2411 } while ((pn = parent_nodeinfo(pn, nid))); 2412 } 2413 } 2414 2415 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 2416 long x; 2417 2418 x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); 2419 if (x) 2420 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 2421 atomic_long_add(x, &memcg->vmevents[i]); 2422 } 2423 } 2424 2425 return 0; 2426 } 2427 2428 static unsigned long reclaim_high(struct mem_cgroup *memcg, 2429 unsigned int nr_pages, 2430 gfp_t 
gfp_mask) 2431 { 2432 unsigned long nr_reclaimed = 0; 2433 2434 do { 2435 unsigned long pflags; 2436 2437 if (page_counter_read(&memcg->memory) <= 2438 READ_ONCE(memcg->memory.high)) 2439 continue; 2440 2441 memcg_memory_event(memcg, MEMCG_HIGH); 2442 2443 psi_memstall_enter(&pflags); 2444 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, 2445 gfp_mask, true); 2446 psi_memstall_leave(&pflags); 2447 } while ((memcg = parent_mem_cgroup(memcg)) && 2448 !mem_cgroup_is_root(memcg)); 2449 2450 return nr_reclaimed; 2451 } 2452 2453 static void high_work_func(struct work_struct *work) 2454 { 2455 struct mem_cgroup *memcg; 2456 2457 memcg = container_of(work, struct mem_cgroup, high_work); 2458 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); 2459 } 2460 2461 /* 2462 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is 2463 * enough to still cause a significant slowdown in most cases, while still 2464 * allowing diagnostics and tracing to proceed without becoming stuck. 2465 */ 2466 #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) 2467 2468 /* 2469 * When calculating the delay, we use these either side of the exponentiation to 2470 * maintain precision and scale to a reasonable number of jiffies (see the table 2471 * below. 2472 * 2473 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the 2474 * overage ratio to a delay. 2475 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the 2476 * proposed penalty in order to reduce to a reasonable number of jiffies, and 2477 * to produce a reasonable delay curve. 2478 * 2479 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a 2480 * reasonable delay curve compared to precision-adjusted overage, not 2481 * penalising heavily at first, but still making sure that growth beyond the 2482 * limit penalises misbehaviour cgroups by slowing them down exponentially. 
For 2483 * example, with a high of 100 megabytes: 2484 * 2485 * +-------+------------------------+ 2486 * | usage | time to allocate in ms | 2487 * +-------+------------------------+ 2488 * | 100M | 0 | 2489 * | 101M | 6 | 2490 * | 102M | 25 | 2491 * | 103M | 57 | 2492 * | 104M | 102 | 2493 * | 105M | 159 | 2494 * | 106M | 230 | 2495 * | 107M | 313 | 2496 * | 108M | 409 | 2497 * | 109M | 518 | 2498 * | 110M | 639 | 2499 * | 111M | 774 | 2500 * | 112M | 921 | 2501 * | 113M | 1081 | 2502 * | 114M | 1254 | 2503 * | 115M | 1439 | 2504 * | 116M | 1638 | 2505 * | 117M | 1849 | 2506 * | 118M | 2000 | 2507 * | 119M | 2000 | 2508 * | 120M | 2000 | 2509 * +-------+------------------------+ 2510 */ 2511 #define MEMCG_DELAY_PRECISION_SHIFT 20 2512 #define MEMCG_DELAY_SCALING_SHIFT 14 2513 2514 static u64 calculate_overage(unsigned long usage, unsigned long high) 2515 { 2516 u64 overage; 2517 2518 if (usage <= high) 2519 return 0; 2520 2521 /* 2522 * Prevent division by 0 in overage calculation by acting as if 2523 * it was a threshold of 1 page 2524 */ 2525 high = max(high, 1UL); 2526 2527 overage = usage - high; 2528 overage <<= MEMCG_DELAY_PRECISION_SHIFT; 2529 return div64_u64(overage, high); 2530 } 2531 2532 static u64 mem_find_max_overage(struct mem_cgroup *memcg) 2533 { 2534 u64 overage, max_overage = 0; 2535 2536 do { 2537 overage = calculate_overage(page_counter_read(&memcg->memory), 2538 READ_ONCE(memcg->memory.high)); 2539 max_overage = max(overage, max_overage); 2540 } while ((memcg = parent_mem_cgroup(memcg)) && 2541 !mem_cgroup_is_root(memcg)); 2542 2543 return max_overage; 2544 } 2545 2546 static u64 swap_find_max_overage(struct mem_cgroup *memcg) 2547 { 2548 u64 overage, max_overage = 0; 2549 2550 do { 2551 overage = calculate_overage(page_counter_read(&memcg->swap), 2552 READ_ONCE(memcg->swap.high)); 2553 if (overage) 2554 memcg_memory_event(memcg, MEMCG_SWAP_HIGH); 2555 max_overage = max(overage, max_overage); 2556 } while ((memcg = parent_mem_cgroup(memcg)) 
&& 2557 !mem_cgroup_is_root(memcg)); 2558 2559 return max_overage; 2560 } 2561 2562 /* 2563 * Get the number of jiffies that we should penalise a mischievous cgroup which 2564 * is exceeding its memory.high by checking both it and its ancestors. 2565 */ 2566 static unsigned long calculate_high_delay(struct mem_cgroup *memcg, 2567 unsigned int nr_pages, 2568 u64 max_overage) 2569 { 2570 unsigned long penalty_jiffies; 2571 2572 if (!max_overage) 2573 return 0; 2574 2575 /* 2576 * We use overage compared to memory.high to calculate the number of 2577 * jiffies to sleep (penalty_jiffies). Ideally this value should be 2578 * fairly lenient on small overages, and increasingly harsh when the 2579 * memcg in question makes it clear that it has no intention of stopping 2580 * its crazy behaviour, so we exponentially increase the delay based on 2581 * overage amount. 2582 */ 2583 penalty_jiffies = max_overage * max_overage * HZ; 2584 penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT; 2585 penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT; 2586 2587 /* 2588 * Factor in the task's own contribution to the overage, such that four 2589 * N-sized allocations are throttled approximately the same as one 2590 * 4N-sized allocation. 2591 * 2592 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or 2593 * larger the current charge patch is than that. 2594 */ 2595 return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; 2596 } 2597 2598 /* 2599 * Scheduled by try_charge() to be executed from the userland return path 2600 * and reclaims memory over the high limit. 
*/
void mem_cgroup_handle_over_high(void)
{
	unsigned long penalty_jiffies;
	unsigned long pflags;
	unsigned long nr_reclaimed;
	unsigned int nr_pages = current->memcg_nr_pages_over_high;
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *memcg;
	bool in_retry = false;

	/* Nothing was recorded against this task by try_charge(). */
	if (likely(!nr_pages))
		return;

	/* The reference taken here is dropped at the "out" label. */
	memcg = get_mem_cgroup_from_mm(current->mm);
	current->memcg_nr_pages_over_high = 0;

retry_reclaim:
	/*
	 * The allocating task should reclaim at least the batch size, but for
	 * subsequent retries we only want to do what's necessary to prevent oom
	 * or breaching resource isolation.
	 *
	 * This is distinct from memory.max or page allocator behaviour because
	 * memory.high is currently batched, whereas memory.max and the page
	 * allocator run every time an allocation is made.
	 */
	nr_reclaimed = reclaim_high(memcg,
				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
				    GFP_KERNEL);

	/*
	 * memory.high is breached and reclaim is unable to keep up. Throttle
	 * allocators proactively to slow down excessive growth.
	 */
	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
					       mem_find_max_overage(memcg));

	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
						swap_find_max_overage(memcg));

	/*
	 * Clamp the max delay per usermode return so as to still keep the
	 * application moving forwards and also permit diagnostics, albeit
	 * extremely slowly.
	 */
	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);

	/*
	 * Don't sleep if the amount of jiffies this memcg owes us is so low
	 * that it's not even worth doing, in an attempt to be nice to those who
	 * go only a small amount over their memory.high value and maybe haven't
	 * been aggressively reclaimed enough yet.
	 */
	if (penalty_jiffies <= HZ / 100)
		goto out;

	/*
	 * If reclaim is making forward progress but we're still over
	 * memory.high, we want to encourage that rather than doing allocator
	 * throttling.
	 *
	 * nr_retries is only consumed by rounds that reclaimed nothing.
	 */
	if (nr_reclaimed || nr_retries--) {
		in_retry = true;
		goto retry_reclaim;
	}

	/*
	 * If we exit early, we're guaranteed to die (since
	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
	 * need to account for any ill-begotten jiffies to pay them off later.
	 */
	psi_memstall_enter(&pflags);
	schedule_timeout_killable(penalty_jiffies);
	psi_memstall_leave(&pflags);

out:
	css_put(&memcg->css);
}

/*
 * try_charge - charge @nr_pages to @memcg's page counters
 * @memcg: memcg to charge; charges to the root cgroup succeed trivially
 * @gfp_mask: allocation context, decides whether we may reclaim or fail
 * @nr_pages: number of pages to charge
 *
 * The charge is first attempted from the per-cpu stock, then as a whole
 * batch against the page counters (the surplus goes to the stock), then as
 * exactly @nr_pages. If that fails too, the function reclaims, drains the
 * stocks, retries and finally invokes the memcg OOM killer, unless the
 * context forbids it or is privileged enough to be force-charged.
 *
 * Returns 0 on success (including forced charges above the limit) and
 * -ENOMEM on failure.
 */
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
	int nr_retries = MAX_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	enum oom_status oom_status;
	unsigned long nr_reclaimed;
	bool may_swap = true;
	bool drained = false;
	unsigned long pflags;

	if (mem_cgroup_is_root(memcg))
		return 0;
retry:
	if (consume_stock(memcg, nr_pages))
		return 0;

	/*
	 * With memsw accounting the combined counter is charged first; if
	 * the memory counter then refuses, the memsw charge is rolled back.
	 * On failure @counter points at the counter that hit its limit.
	 */
	if (!do_memsw_account() ||
	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;
	}

	/* The batch did not fit: fall back to charging just what is needed. */
	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Memcg doesn't have a dedicated reserve for atomic
	 * allocations. But like the global atomic pool, we need to
	 * put the burden of reclaim on regular allocation requests
	 * and let these go through as privileged allocations.
	 */
	if (gfp_mask & __GFP_ATOMIC)
		goto force;

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage. Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(should_force_charge()))
		goto force;

	/*
	 * Prevent unbounded recursion when reclaim operations need to
	 * allocate memory. This might exceed the limits temporarily,
	 * but we prefer facilitating memory reclaim and getting back
	 * under the limit over triggering OOM kills in these cases.
	 */
	if (unlikely(current->flags & PF_MEMALLOC))
		goto force;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	/* Reclaim may sleep; contexts that cannot block fail right here. */
	if (!gfpflags_allow_blocking(gfp_mask))
		goto nomem;

	memcg_memory_event(mem_over_limit, MEMCG_MAX);

	psi_memstall_enter(&pflags);
	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);
	psi_memstall_leave(&pflags);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	/* Charges parked in per-cpu stocks count against the limit, too. */
	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages. Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_RETRY_MAYFAIL)
		goto nomem;

	if (gfp_mask & __GFP_NOFAIL)
		goto force;

	if (fatal_signal_pending(current))
		goto force;

	/*
	 * keep retrying as long as the memcg oom killer is able to make
	 * a forward progress or bypass the charge if the oom killer
	 * couldn't make any progress.
	 */
	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
		       get_order(nr_pages * PAGE_SIZE));
	switch (oom_status) {
	case OOM_SUCCESS:
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	case OOM_FAILED:
		goto force;
	default:
		goto nomem;
	}
nomem:
	/* __GFP_NOFAIL charges fall through to the forced charge below. */
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
force:
	/*
	 * The allocation either can't fail or will lead to more memory
	 * being freed very soon.  Allow memory usage go over the limit
	 * temporarily by force charging it.
	 */
	page_counter_charge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_charge(&memcg->memsw, nr_pages);

	return 0;

done_restock:
	/* A whole batch was charged: park the surplus in the per-cpu stock. */
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * If the hierarchy is above the normal consumption range, schedule
	 * reclaim on returning to userland.  We can perform reclaim here
	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
	 * not recorded as it most likely matches current's and won't
	 * change in the meantime.  As high limit is checked again before
	 * reclaim, the cost of mismatch is negligible.
	 */
	do {
		bool mem_high, swap_high;

		mem_high = page_counter_read(&memcg->memory) >
			READ_ONCE(memcg->memory.high);
		swap_high = page_counter_read(&memcg->swap) >
			READ_ONCE(memcg->swap.high);

		/* Don't bother a random interrupted task */
		if (in_interrupt()) {
			if (mem_high) {
				schedule_work(&memcg->high_work);
				break;
			}
			/* "continue" evaluates the loop condition: go to the parent. */
			continue;
		}

		if (mem_high || swap_high) {
			/*
			 * The allocating tasks in this cgroup will need to do
			 * reclaim or be throttled to prevent further growth
			 * of the memory or swap footprints.
			 *
			 * Target some best-effort fairness between the tasks,
			 * and distribute reclaim work and delay penalties
			 * based on how much each task is actually allocating.
			 */
			current->memcg_nr_pages_over_high += batch;
			set_notify_resume(current);
			break;
		}
	} while ((memcg = parent_mem_cgroup(memcg)));

	return 0;
}

#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
/*
 * Undo a charge of @nr_pages taken with try_charge(). The root cgroup is
 * skipped, mirroring the early return in try_charge().
 */
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_uncharge(&memcg->memsw, nr_pages);
}
#endif

/*
 * Bind @page to @memcg. The page must not be bound to a memcg yet.
 */
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
	VM_BUG_ON_PAGE(page->mem_cgroup, page);
	/*
	 * Any of the following ensures page's memcg stability:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - lock_page_memcg()
	 * - exclusive reference
	 */
	page->mem_cgroup = memcg;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Allocate the zeroed vector of obj_cgroup pointers for slab page @page of
 * cache @s (one slot per object) and publish it in page->obj_cgroups with
 * the lowest pointer bit set.
 *
 * Returns 0 on success, also when somebody else installed a vector first,
 * and -ENOMEM if the vector could not be allocated.
 */
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
				 gfp_t gfp)
{
	unsigned int objects = objs_per_slab_page(s, page);
	void *vec;

	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
			   page_to_nid(page));
	if (!vec)
		return -ENOMEM;

	/*
	 * Install the vector only if none is there yet. If we lost the race
	 * our copy is freed; otherwise kmemleak is told about the vector,
	 * which is only reachable through a tagged pointer.
	 */
	if (cmpxchg(&page->obj_cgroups, NULL,
		    (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
		kfree(vec);
	else
		kmemleak_not_leak(vec);

	return 0;
}

/*
 * Returns a pointer to the memory cgroup to which the kernel object is charged.
 *
 * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
 * cgroup_mutex, etc.
 */
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
	struct page *page;

	if (mem_cgroup_disabled())
		return NULL;

	page = virt_to_head_page(p);

	/*
	 * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
	 * or a pointer to obj_cgroup vector. In the latter case the lowest
	 * bit of the pointer is set.
	 * The page->mem_cgroup pointer can be asynchronously changed
	 * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
	 * from a valid memcg pointer to objcg vector or back.
	 */
	if (!page->mem_cgroup)
		return NULL;

	/*
	 * Slab objects are accounted individually, not per-page.
	 * Memcg membership data for each individual object is saved in
	 * the page->obj_cgroups.
	 */
	if (page_has_obj_cgroups(page)) {
		struct obj_cgroup *objcg;
		unsigned int off;

		off = obj_to_index(page->slab_cache, page, p);
		objcg = page_obj_cgroups(page)[off];
		if (objcg)
			return obj_cgroup_memcg(objcg);

		/* An empty slot yields NULL: no memcg recorded for this object. */
		return NULL;
	}

	/* All other pages use page->mem_cgroup */
	return page->mem_cgroup;
}

/*
 * Returns the obj_cgroup of the active memcg, if one is set, or of the
 * current task's memcg otherwise, with a reference held. If a memcg has no
 * obj_cgroup, or its reference cannot be taken, the walk moves on to the
 * parent.
 *
 * Returns NULL if kmem accounting is bypassed or if the walk reaches the
 * root cgroup without obtaining a reference.
 */
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
	struct obj_cgroup *objcg = NULL;
	struct mem_cgroup *memcg;

	if (memcg_kmem_bypass())
		return NULL;

	rcu_read_lock();
	if (unlikely(active_memcg()))
		memcg = active_memcg();
	else
		memcg = mem_cgroup_from_task(current);

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
		objcg = rcu_dereference(memcg->objcg);
		if (objcg && obj_cgroup_tryget(objcg))
			break;
		/* Do not leak an unreferenced pointer out of the loop. */
		objcg = NULL;
	}
	rcu_read_unlock();

	return objcg;
}

/*
 * Allocate an id from memcg_cache_ida and, if the id does not fit below
 * memcg_nr_cache_ids, grow the list_lru arrays first.
 *
 * Returns the new id, or a negative error code, in which case no id stays
 * allocated.
 */
static int memcg_alloc_cache_id(void)
{
	int id, size;
	int err;

	id = ida_simple_get(&memcg_cache_ida,
			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (id < 0)
		return id;

	if (id < memcg_nr_cache_ids)
		return id;

	/*
	 * There's no space for the new id in memcg_caches arrays,
	 * so we have to grow them.
	 */
	down_write(&memcg_cache_ids_sem);

	/* Double the space, clamped to [MIN_SIZE, MAX_SIZE]. */
	size = 2 * (id + 1);
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	err = memcg_update_all_list_lrus(size);
	if (!err)
		memcg_nr_cache_ids = size;

	up_write(&memcg_cache_ids_sem);

	if (err) {
		ida_simple_remove(&memcg_cache_ida, id);
		return err;
	}
	return id;
}

/* Release an id obtained from memcg_alloc_cache_id(). */
static void memcg_free_cache_id(int id)
{
	ida_simple_remove(&memcg_cache_ida, id);
}

/**
 * __memcg_kmem_charge: charge a number of kernel pages to a memcg
 * @memcg: memory cgroup to charge
 * @gfp: reclaim mode
 * @nr_pages: number of pages to charge
 *
 * Returns 0 on success, an error code on failure.
 */
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
			unsigned int nr_pages)
{
	struct page_counter *counter;
	int ret;

	ret = try_charge(memcg, gfp, nr_pages);
	if (ret)
		return ret;

	/* The separate kmem counter is only charged off the default hierarchy. */
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {

		/*
		 * Enforce __GFP_NOFAIL allocation because callers are not
		 * prepared to see failures and likely do not have any failure
		 * handling code.
		 */
		if (gfp & __GFP_NOFAIL) {
			page_counter_charge(&memcg->kmem, nr_pages);
			return 0;
		}
		/* Roll back the charge taken by try_charge() above. */
		cancel_charge(memcg, nr_pages);
		return -ENOMEM;
	}
	return 0;
}

/**
 * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
 * @memcg: memcg to uncharge
 * @nr_pages: number of pages to uncharge
 */
void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
		page_counter_uncharge(&memcg->kmem, nr_pages);

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_uncharge(&memcg->memsw, nr_pages);
}

/**
 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
 * @page: page to charge
 * @gfp: reclaim mode
 * @order: allocation order
 *
 * Returns 0 on success, an error code on failure.
 */
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
	struct mem_cgroup *memcg;
	int ret = 0;

	memcg = get_mem_cgroup_from_current();
	if (memcg && !mem_cgroup_is_root(memcg)) {
		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
		if (!ret) {
			/*
			 * Success: the css reference is not dropped here;
			 * __memcg_kmem_uncharge_page() puts it.
			 */
			page->mem_cgroup = memcg;
			__SetPageKmemcg(page);
			return 0;
		}
		css_put(&memcg->css);
	}
	return ret;
}

/**
 * __memcg_kmem_uncharge_page: uncharge a kmem page
 * @page: page to uncharge
 * @order: allocation order
 */
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
	struct mem_cgroup *memcg = page->mem_cgroup;
	unsigned int nr_pages = 1 << order;

	/* The page is not bound to a memcg: nothing to undo. */
	if (!memcg)
		return;

	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
	__memcg_kmem_uncharge(memcg, nr_pages);
	page->mem_cgroup = NULL;
	css_put(&memcg->css);

	/* slab pages do not have PageKmemcg flag set */
	if (PageKmemcg(page))
		__ClearPageKmemcg(page);
}

/*
 * Try to take @nr_bytes out of this CPU's object stock. Succeeds only if
 * the stock belongs to @objcg and holds at least @nr_bytes.
 *
 * Returns true if the bytes were consumed, false otherwise.
 */
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	local_irq_save(flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
		stock->nr_bytes -= nr_bytes;
		ret = true;
	}

	local_irq_restore(flags);

	return ret;
}

/*
 * Empty the object stock of @stock: whole pages are uncharged from the
 * owning memcg, the sub-page remainder is moved to the obj_cgroup's
 * nr_charged_bytes, and the stock's obj_cgroup reference is dropped.
 */
static void drain_obj_stock(struct memcg_stock_pcp *stock)
{
	struct obj_cgroup *old = stock->cached_objcg;

	if (!old)
		return;

	if (stock->nr_bytes) {
		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);

		if (nr_pages) {
			rcu_read_lock();
			__memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
			rcu_read_unlock();
		}

		/*
		 * The leftover is flushed to the centralized per-memcg value.
		 * On the next attempt to refill obj stock it will be moved
		 * to a per-cpu stock (probably, on an other CPU), see
		 * refill_obj_stock().
		 *
		 * How often it's flushed is a trade-off between the memory
		 * limit enforcement accuracy and potential CPU contention,
		 * so it might be changed in the future.
		 */
		atomic_add(nr_bytes, &old->nr_charged_bytes);
		stock->nr_bytes = 0;
	}

	obj_cgroup_put(old);
	stock->cached_objcg = NULL;
}

/*
 * Tell drain_all_stock() whether @stock caches an obj_cgroup whose memcg
 * lies in the subtree of @root_memcg.
 */
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
	struct mem_cgroup *memcg;

	if (stock->cached_objcg) {
		memcg = obj_cgroup_memcg(stock->cached_objcg);
		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
			return true;
	}

	return false;
}

/*
 * Add @nr_bytes of already charged memory to this CPU's object stock for
 * @objcg. A stock owned by a different obj_cgroup is drained first, and the
 * new owner's nr_charged_bytes are pulled into the stock. Once the stock
 * holds more than one page it is drained again.
 */
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;

	local_irq_save(flags);

	stock = this_cpu_ptr(&memcg_stock);
	if (stock->cached_objcg != objcg) { /* reset if necessary */
		drain_obj_stock(stock);
		obj_cgroup_get(objcg);
		stock->cached_objcg = objcg;
		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
	}
	stock->nr_bytes += nr_bytes;

	if (stock->nr_bytes > PAGE_SIZE)
		drain_obj_stock(stock);

	local_irq_restore(flags);
}

/*
 * Charge @size bytes to @objcg. The per-cpu object stock is tried first;
 * otherwise @size is rounded up to whole pages, those pages are charged to
 * the obj_cgroup's memcg and the unused part of the last page is put into
 * the stock.
 *
 * Returns 0 on success or the error returned by __memcg_kmem_charge().
 */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
{
	struct mem_cgroup *memcg;
	unsigned int nr_pages, nr_bytes;
	int ret;

	if (consume_obj_stock(objcg, size))
		return 0;

	/*
	 * In theory, memcg->nr_charged_bytes can have enough
	 * pre-charged bytes to satisfy the allocation. However,
	 * flushing memcg->nr_charged_bytes requires two atomic
	 * operations, and memcg->nr_charged_bytes can't be big,
	 * so it's better to ignore it and try grab some new pages.
	 * memcg->nr_charged_bytes will be flushed in
	 * refill_obj_stock(), called from this function or
	 * independently later.
	 */
	rcu_read_lock();
retry:
	/* Look the memcg up again whenever taking a reference on it fails. */
	memcg = obj_cgroup_memcg(objcg);
	if (unlikely(!css_tryget(&memcg->css)))
		goto retry;
	rcu_read_unlock();

	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);

	/* A partial trailing page still costs a whole page. */
	if (nr_bytes)
		nr_pages += 1;

	ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
	if (!ret && nr_bytes)
		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);

	css_put(&memcg->css);
	return ret;
}

/*
 * Give @size bytes back to @objcg. The bytes are not uncharged directly;
 * they go into the per-cpu object stock, see refill_obj_stock().
 */
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
	refill_obj_stock(objcg, size);
}

#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * Because tail pages are not marked as "used", set it. We're under
 * pgdat->lru_lock and migration entries setup in all page mappings.
 *
 * Every tail page is bound to the head page's memcg and takes its own
 * css reference on it.
 */
void mem_cgroup_split_huge_fixup(struct page *head)
{
	struct mem_cgroup *memcg = head->mem_cgroup;
	int i;

	if (mem_cgroup_disabled())
		return;

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		css_get(&memcg->css);
		head[i].mem_cgroup = memcg;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge() about
 * both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	unsigned short old_id, new_id;

	old_id = mem_cgroup_id(from);
	new_id = mem_cgroup_id(to);

	/* Only move the record if it still names @from as the owner. */
	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mod_memcg_state(from, MEMCG_SWAP, -1);
		mod_memcg_state(to, MEMCG_SWAP, 1);
		return 0;
	}
	return -EINVAL;
}
#else
/* Without CONFIG_MEMCG_SWAP there is no swap record to move. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
#endif

/* Serializes limit updates and the memory.max <= memsw.max check. */
static DEFINE_MUTEX(memcg_max_mutex);

/*
 * Set the limit of @memcg's memory counter, or of its memsw counter when
 * @memsw is true, to @max pages. While the counter refuses the new limit,
 * the per-cpu stocks are drained once and then pages are reclaimed one at
 * a time before trying again.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -EINVAL if the new
 * limit would break memory.max <= memsw.max, and -EBUSY if reclaim could
 * not free anything.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		/* Remember a raised limit for the OOM recovery call below. */
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		/* Charges cached in per-cpu stocks may be all that is in the way. */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (!try_to_free_mem_cgroup_pages(memcg, 1,
					GFP_KERNEL, !memsw)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

/*
 * Soft limit reclaim for node @pgdat: repeatedly pick a memcg from the
 * node's soft limit tree, reclaim from it and re-insert it according to
 * its remaining excess. The loop ends as soon as something was reclaimed,
 * when the tree has no more candidates, or after too many fruitless rounds.
 *
 * Only order-0 requests are served. The pages scanned are added to
 * *@total_scanned.
 *
 * Returns the number of pages reclaimed.
 */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;
	unsigned long nr_scanned;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node(pgdat->node_id);

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, specially if mem_cgroup's continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		/* Use the candidate chosen by the previous round, if any. */
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, &nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock_irq(&mctz->lock);
		__mem_cgroup_remove_exceeded(mz, mctz);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* Drop the reference held on a candidate that was never used. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
3479 */ 3480 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3481 { 3482 int nr_retries = MAX_RECLAIM_RETRIES; 3483 3484 /* we call try-to-free pages for make this cgroup empty */ 3485 lru_add_drain_all(); 3486 3487 drain_all_stock(memcg); 3488 3489 /* try to free all pages in this cgroup */ 3490 while (nr_retries && page_counter_read(&memcg->memory)) { 3491 int progress; 3492 3493 if (signal_pending(current)) 3494 return -EINTR; 3495 3496 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3497 GFP_KERNEL, true); 3498 if (!progress) { 3499 nr_retries--; 3500 /* maybe some writeback is necessary */ 3501 congestion_wait(BLK_RW_ASYNC, HZ/10); 3502 } 3503 3504 } 3505 3506 return 0; 3507 } 3508 3509 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3510 char *buf, size_t nbytes, 3511 loff_t off) 3512 { 3513 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3514 3515 if (mem_cgroup_is_root(memcg)) 3516 return -EINVAL; 3517 return mem_cgroup_force_empty(memcg) ?: nbytes; 3518 } 3519 3520 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3521 struct cftype *cft) 3522 { 3523 return 1; 3524 } 3525 3526 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3527 struct cftype *cft, u64 val) 3528 { 3529 if (val == 1) 3530 return 0; 3531 3532 pr_warn_once("Non-hierarchical mode is deprecated. 
" 3533 "Please report your usecase to linux-mm@kvack.org if you " 3534 "depend on this functionality.\n"); 3535 3536 return -EINVAL; 3537 } 3538 3539 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3540 { 3541 unsigned long val; 3542 3543 if (mem_cgroup_is_root(memcg)) { 3544 val = memcg_page_state(memcg, NR_FILE_PAGES) + 3545 memcg_page_state(memcg, NR_ANON_MAPPED); 3546 if (swap) 3547 val += memcg_page_state(memcg, MEMCG_SWAP); 3548 } else { 3549 if (!swap) 3550 val = page_counter_read(&memcg->memory); 3551 else 3552 val = page_counter_read(&memcg->memsw); 3553 } 3554 return val; 3555 } 3556 3557 enum { 3558 RES_USAGE, 3559 RES_LIMIT, 3560 RES_MAX_USAGE, 3561 RES_FAILCNT, 3562 RES_SOFT_LIMIT, 3563 }; 3564 3565 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3566 struct cftype *cft) 3567 { 3568 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3569 struct page_counter *counter; 3570 3571 switch (MEMFILE_TYPE(cft->private)) { 3572 case _MEM: 3573 counter = &memcg->memory; 3574 break; 3575 case _MEMSWAP: 3576 counter = &memcg->memsw; 3577 break; 3578 case _KMEM: 3579 counter = &memcg->kmem; 3580 break; 3581 case _TCP: 3582 counter = &memcg->tcpmem; 3583 break; 3584 default: 3585 BUG(); 3586 } 3587 3588 switch (MEMFILE_ATTR(cft->private)) { 3589 case RES_USAGE: 3590 if (counter == &memcg->memory) 3591 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 3592 if (counter == &memcg->memsw) 3593 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 3594 return (u64)page_counter_read(counter) * PAGE_SIZE; 3595 case RES_LIMIT: 3596 return (u64)counter->max * PAGE_SIZE; 3597 case RES_MAX_USAGE: 3598 return (u64)counter->watermark * PAGE_SIZE; 3599 case RES_FAILCNT: 3600 return counter->failcnt; 3601 case RES_SOFT_LIMIT: 3602 return (u64)memcg->soft_limit * PAGE_SIZE; 3603 default: 3604 BUG(); 3605 } 3606 } 3607 3608 static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) 3609 { 3610 unsigned long 
stat[MEMCG_NR_STAT] = {0}; 3611 struct mem_cgroup *mi; 3612 int node, cpu, i; 3613 3614 for_each_online_cpu(cpu) 3615 for (i = 0; i < MEMCG_NR_STAT; i++) 3616 stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); 3617 3618 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3619 for (i = 0; i < MEMCG_NR_STAT; i++) 3620 atomic_long_add(stat[i], &mi->vmstats[i]); 3621 3622 for_each_node(node) { 3623 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 3624 struct mem_cgroup_per_node *pi; 3625 3626 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3627 stat[i] = 0; 3628 3629 for_each_online_cpu(cpu) 3630 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3631 stat[i] += per_cpu( 3632 pn->lruvec_stat_cpu->count[i], cpu); 3633 3634 for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) 3635 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 3636 atomic_long_add(stat[i], &pi->lruvec_stat[i]); 3637 } 3638 } 3639 3640 static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) 3641 { 3642 unsigned long events[NR_VM_EVENT_ITEMS]; 3643 struct mem_cgroup *mi; 3644 int cpu, i; 3645 3646 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3647 events[i] = 0; 3648 3649 for_each_online_cpu(cpu) 3650 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3651 events[i] += per_cpu(memcg->vmstats_percpu->events[i], 3652 cpu); 3653 3654 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) 3655 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 3656 atomic_long_add(events[i], &mi->vmevents[i]); 3657 } 3658 3659 #ifdef CONFIG_MEMCG_KMEM 3660 static int memcg_online_kmem(struct mem_cgroup *memcg) 3661 { 3662 struct obj_cgroup *objcg; 3663 int memcg_id; 3664 3665 if (cgroup_memory_nokmem) 3666 return 0; 3667 3668 BUG_ON(memcg->kmemcg_id >= 0); 3669 BUG_ON(memcg->kmem_state); 3670 3671 memcg_id = memcg_alloc_cache_id(); 3672 if (memcg_id < 0) 3673 return memcg_id; 3674 3675 objcg = obj_cgroup_alloc(); 3676 if (!objcg) { 3677 memcg_free_cache_id(memcg_id); 3678 return -ENOMEM; 3679 } 3680 objcg->memcg = memcg; 3681 
rcu_assign_pointer(memcg->objcg, objcg); 3682 3683 static_branch_enable(&memcg_kmem_enabled_key); 3684 3685 memcg->kmemcg_id = memcg_id; 3686 memcg->kmem_state = KMEM_ONLINE; 3687 3688 return 0; 3689 } 3690 3691 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3692 { 3693 struct cgroup_subsys_state *css; 3694 struct mem_cgroup *parent, *child; 3695 int kmemcg_id; 3696 3697 if (memcg->kmem_state != KMEM_ONLINE) 3698 return; 3699 3700 memcg->kmem_state = KMEM_ALLOCATED; 3701 3702 parent = parent_mem_cgroup(memcg); 3703 if (!parent) 3704 parent = root_mem_cgroup; 3705 3706 memcg_reparent_objcgs(memcg, parent); 3707 3708 kmemcg_id = memcg->kmemcg_id; 3709 BUG_ON(kmemcg_id < 0); 3710 3711 /* 3712 * Change kmemcg_id of this cgroup and all its descendants to the 3713 * parent's id, and then move all entries from this cgroup's list_lrus 3714 * to ones of the parent. After we have finished, all list_lrus 3715 * corresponding to this cgroup are guaranteed to remain empty. The 3716 * ordering is imposed by list_lru_node->lock taken by 3717 * memcg_drain_all_list_lrus(). 
3718 */ 3719 rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ 3720 css_for_each_descendant_pre(css, &memcg->css) { 3721 child = mem_cgroup_from_css(css); 3722 BUG_ON(child->kmemcg_id != kmemcg_id); 3723 child->kmemcg_id = parent->kmemcg_id; 3724 } 3725 rcu_read_unlock(); 3726 3727 memcg_drain_all_list_lrus(kmemcg_id, parent); 3728 3729 memcg_free_cache_id(kmemcg_id); 3730 } 3731 3732 static void memcg_free_kmem(struct mem_cgroup *memcg) 3733 { 3734 /* css_alloc() failed, offlining didn't happen */ 3735 if (unlikely(memcg->kmem_state == KMEM_ONLINE)) 3736 memcg_offline_kmem(memcg); 3737 } 3738 #else 3739 static int memcg_online_kmem(struct mem_cgroup *memcg) 3740 { 3741 return 0; 3742 } 3743 static void memcg_offline_kmem(struct mem_cgroup *memcg) 3744 { 3745 } 3746 static void memcg_free_kmem(struct mem_cgroup *memcg) 3747 { 3748 } 3749 #endif /* CONFIG_MEMCG_KMEM */ 3750 3751 static int memcg_update_kmem_max(struct mem_cgroup *memcg, 3752 unsigned long max) 3753 { 3754 int ret; 3755 3756 mutex_lock(&memcg_max_mutex); 3757 ret = page_counter_set_max(&memcg->kmem, max); 3758 mutex_unlock(&memcg_max_mutex); 3759 return ret; 3760 } 3761 3762 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 3763 { 3764 int ret; 3765 3766 mutex_lock(&memcg_max_mutex); 3767 3768 ret = page_counter_set_max(&memcg->tcpmem, max); 3769 if (ret) 3770 goto out; 3771 3772 if (!memcg->tcpmem_active) { 3773 /* 3774 * The active flag needs to be written after the static_key 3775 * update. This is what guarantees that the socket activation 3776 * function is the last one to run. See mem_cgroup_sk_alloc() 3777 * for details, and note that we don't mark any socket as 3778 * belonging to this memcg until that flag is up. 3779 * 3780 * We need to do this, because static_keys will span multiple 3781 * sites, but we can't control their order. 
If we mark a socket 3782 * as accounted, but the accounting functions are not patched in 3783 * yet, we'll lose accounting. 3784 * 3785 * We never race with the readers in mem_cgroup_sk_alloc(), 3786 * because when this value change, the code to process it is not 3787 * patched in yet. 3788 */ 3789 static_branch_inc(&memcg_sockets_enabled_key); 3790 memcg->tcpmem_active = true; 3791 } 3792 out: 3793 mutex_unlock(&memcg_max_mutex); 3794 return ret; 3795 } 3796 3797 /* 3798 * The user of this function is... 3799 * RES_LIMIT. 3800 */ 3801 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 3802 char *buf, size_t nbytes, loff_t off) 3803 { 3804 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3805 unsigned long nr_pages; 3806 int ret; 3807 3808 buf = strstrip(buf); 3809 ret = page_counter_memparse(buf, "-1", &nr_pages); 3810 if (ret) 3811 return ret; 3812 3813 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3814 case RES_LIMIT: 3815 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3816 ret = -EINVAL; 3817 break; 3818 } 3819 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3820 case _MEM: 3821 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 3822 break; 3823 case _MEMSWAP: 3824 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3825 break; 3826 case _KMEM: 3827 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. 
" 3828 "Please report your usecase to linux-mm@kvack.org if you " 3829 "depend on this functionality.\n"); 3830 ret = memcg_update_kmem_max(memcg, nr_pages); 3831 break; 3832 case _TCP: 3833 ret = memcg_update_tcp_max(memcg, nr_pages); 3834 break; 3835 } 3836 break; 3837 case RES_SOFT_LIMIT: 3838 memcg->soft_limit = nr_pages; 3839 ret = 0; 3840 break; 3841 } 3842 return ret ?: nbytes; 3843 } 3844 3845 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3846 size_t nbytes, loff_t off) 3847 { 3848 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3849 struct page_counter *counter; 3850 3851 switch (MEMFILE_TYPE(of_cft(of)->private)) { 3852 case _MEM: 3853 counter = &memcg->memory; 3854 break; 3855 case _MEMSWAP: 3856 counter = &memcg->memsw; 3857 break; 3858 case _KMEM: 3859 counter = &memcg->kmem; 3860 break; 3861 case _TCP: 3862 counter = &memcg->tcpmem; 3863 break; 3864 default: 3865 BUG(); 3866 } 3867 3868 switch (MEMFILE_ATTR(of_cft(of)->private)) { 3869 case RES_MAX_USAGE: 3870 page_counter_reset_watermark(counter); 3871 break; 3872 case RES_FAILCNT: 3873 counter->failcnt = 0; 3874 break; 3875 default: 3876 BUG(); 3877 } 3878 3879 return nbytes; 3880 } 3881 3882 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 3883 struct cftype *cft) 3884 { 3885 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 3886 } 3887 3888 #ifdef CONFIG_MMU 3889 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3890 struct cftype *cft, u64 val) 3891 { 3892 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3893 3894 if (val & ~MOVE_MASK) 3895 return -EINVAL; 3896 3897 /* 3898 * No kind of locking is needed in here, because ->can_attach() will 3899 * check this value once in the beginning of the process, and then carry 3900 * on with stale data. This means that changes to this value will only 3901 * affect task migrations starting after the change. 
3902 */ 3903 memcg->move_charge_at_immigrate = val; 3904 return 0; 3905 } 3906 #else 3907 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 3908 struct cftype *cft, u64 val) 3909 { 3910 return -ENOSYS; 3911 } 3912 #endif 3913 3914 #ifdef CONFIG_NUMA 3915 3916 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 3917 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 3918 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 3919 3920 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 3921 int nid, unsigned int lru_mask, bool tree) 3922 { 3923 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 3924 unsigned long nr = 0; 3925 enum lru_list lru; 3926 3927 VM_BUG_ON((unsigned)nid >= nr_node_ids); 3928 3929 for_each_lru(lru) { 3930 if (!(BIT(lru) & lru_mask)) 3931 continue; 3932 if (tree) 3933 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 3934 else 3935 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 3936 } 3937 return nr; 3938 } 3939 3940 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 3941 unsigned int lru_mask, 3942 bool tree) 3943 { 3944 unsigned long nr = 0; 3945 enum lru_list lru; 3946 3947 for_each_lru(lru) { 3948 if (!(BIT(lru) & lru_mask)) 3949 continue; 3950 if (tree) 3951 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 3952 else 3953 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 3954 } 3955 return nr; 3956 } 3957 3958 static int memcg_numa_stat_show(struct seq_file *m, void *v) 3959 { 3960 struct numa_stat { 3961 const char *name; 3962 unsigned int lru_mask; 3963 }; 3964 3965 static const struct numa_stat stats[] = { 3966 { "total", LRU_ALL }, 3967 { "file", LRU_ALL_FILE }, 3968 { "anon", LRU_ALL_ANON }, 3969 { "unevictable", BIT(LRU_UNEVICTABLE) }, 3970 }; 3971 const struct numa_stat *stat; 3972 int nid; 3973 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 3974 3975 for (stat = stats; stat < stats + ARRAY_SIZE(stats); 
stat++) { 3976 seq_printf(m, "%s=%lu", stat->name, 3977 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3978 false)); 3979 for_each_node_state(nid, N_MEMORY) 3980 seq_printf(m, " N%d=%lu", nid, 3981 mem_cgroup_node_nr_lru_pages(memcg, nid, 3982 stat->lru_mask, false)); 3983 seq_putc(m, '\n'); 3984 } 3985 3986 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 3987 3988 seq_printf(m, "hierarchical_%s=%lu", stat->name, 3989 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 3990 true)); 3991 for_each_node_state(nid, N_MEMORY) 3992 seq_printf(m, " N%d=%lu", nid, 3993 mem_cgroup_node_nr_lru_pages(memcg, nid, 3994 stat->lru_mask, true)); 3995 seq_putc(m, '\n'); 3996 } 3997 3998 return 0; 3999 } 4000 #endif /* CONFIG_NUMA */ 4001 4002 static const unsigned int memcg1_stats[] = { 4003 NR_FILE_PAGES, 4004 NR_ANON_MAPPED, 4005 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4006 NR_ANON_THPS, 4007 #endif 4008 NR_SHMEM, 4009 NR_FILE_MAPPED, 4010 NR_FILE_DIRTY, 4011 NR_WRITEBACK, 4012 MEMCG_SWAP, 4013 }; 4014 4015 static const char *const memcg1_stat_names[] = { 4016 "cache", 4017 "rss", 4018 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4019 "rss_huge", 4020 #endif 4021 "shmem", 4022 "mapped_file", 4023 "dirty", 4024 "writeback", 4025 "swap", 4026 }; 4027 4028 /* Universal VM events cgroup1 shows, original sort order */ 4029 static const unsigned int memcg1_events[] = { 4030 PGPGIN, 4031 PGPGOUT, 4032 PGFAULT, 4033 PGMAJFAULT, 4034 }; 4035 4036 static int memcg_stat_show(struct seq_file *m, void *v) 4037 { 4038 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 4039 unsigned long memory, memsw; 4040 struct mem_cgroup *mi; 4041 unsigned int i; 4042 4043 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 4044 4045 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4046 unsigned long nr; 4047 4048 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4049 continue; 4050 nr = memcg_page_state_local(memcg, memcg1_stats[i]); 4051 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4052 if 
(memcg1_stats[i] == NR_ANON_THPS) 4053 nr *= HPAGE_PMD_NR; 4054 #endif 4055 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); 4056 } 4057 4058 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4059 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), 4060 memcg_events_local(memcg, memcg1_events[i])); 4061 4062 for (i = 0; i < NR_LRU_LISTS; i++) 4063 seq_printf(m, "%s %lu\n", lru_list_name(i), 4064 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 4065 PAGE_SIZE); 4066 4067 /* Hierarchical information */ 4068 memory = memsw = PAGE_COUNTER_MAX; 4069 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4070 memory = min(memory, READ_ONCE(mi->memory.max)); 4071 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 4072 } 4073 seq_printf(m, "hierarchical_memory_limit %llu\n", 4074 (u64)memory * PAGE_SIZE); 4075 if (do_memsw_account()) 4076 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4077 (u64)memsw * PAGE_SIZE); 4078 4079 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 4080 unsigned long nr; 4081 4082 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) 4083 continue; 4084 nr = memcg_page_state(memcg, memcg1_stats[i]); 4085 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4086 if (memcg1_stats[i] == NR_ANON_THPS) 4087 nr *= HPAGE_PMD_NR; 4088 #endif 4089 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], 4090 (u64)nr * PAGE_SIZE); 4091 } 4092 4093 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 4094 seq_printf(m, "total_%s %llu\n", 4095 vm_event_name(memcg1_events[i]), 4096 (u64)memcg_events(memcg, memcg1_events[i])); 4097 4098 for (i = 0; i < NR_LRU_LISTS; i++) 4099 seq_printf(m, "total_%s %llu\n", lru_list_name(i), 4100 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 4101 PAGE_SIZE); 4102 4103 #ifdef CONFIG_DEBUG_VM 4104 { 4105 pg_data_t *pgdat; 4106 struct mem_cgroup_per_node *mz; 4107 unsigned long anon_cost = 0; 4108 unsigned long file_cost = 0; 4109 4110 for_each_online_pgdat(pgdat) { 4111 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); 4112 4113 
anon_cost += mz->lruvec.anon_cost; 4114 file_cost += mz->lruvec.file_cost; 4115 } 4116 seq_printf(m, "anon_cost %lu\n", anon_cost); 4117 seq_printf(m, "file_cost %lu\n", file_cost); 4118 } 4119 #endif 4120 4121 return 0; 4122 } 4123 4124 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4125 struct cftype *cft) 4126 { 4127 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4128 4129 return mem_cgroup_swappiness(memcg); 4130 } 4131 4132 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4133 struct cftype *cft, u64 val) 4134 { 4135 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4136 4137 if (val > 100) 4138 return -EINVAL; 4139 4140 if (css->parent) 4141 memcg->swappiness = val; 4142 else 4143 vm_swappiness = val; 4144 4145 return 0; 4146 } 4147 4148 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4149 { 4150 struct mem_cgroup_threshold_ary *t; 4151 unsigned long usage; 4152 int i; 4153 4154 rcu_read_lock(); 4155 if (!swap) 4156 t = rcu_dereference(memcg->thresholds.primary); 4157 else 4158 t = rcu_dereference(memcg->memsw_thresholds.primary); 4159 4160 if (!t) 4161 goto unlock; 4162 4163 usage = mem_cgroup_usage(memcg, swap); 4164 4165 /* 4166 * current_threshold points to threshold just below or equal to usage. 4167 * If it's not true, a threshold was crossed after last 4168 * call of __mem_cgroup_threshold(). 4169 */ 4170 i = t->current_threshold; 4171 4172 /* 4173 * Iterate backward over array of thresholds starting from 4174 * current_threshold and check if a threshold is crossed. 4175 * If none of thresholds below usage is crossed, we read 4176 * only one element of the array here. 
4177 */ 4178 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4179 eventfd_signal(t->entries[i].eventfd, 1); 4180 4181 /* i = current_threshold + 1 */ 4182 i++; 4183 4184 /* 4185 * Iterate forward over array of thresholds starting from 4186 * current_threshold+1 and check if a threshold is crossed. 4187 * If none of thresholds above usage is crossed, we read 4188 * only one element of the array here. 4189 */ 4190 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4191 eventfd_signal(t->entries[i].eventfd, 1); 4192 4193 /* Update current_threshold */ 4194 t->current_threshold = i - 1; 4195 unlock: 4196 rcu_read_unlock(); 4197 } 4198 4199 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4200 { 4201 while (memcg) { 4202 __mem_cgroup_threshold(memcg, false); 4203 if (do_memsw_account()) 4204 __mem_cgroup_threshold(memcg, true); 4205 4206 memcg = parent_mem_cgroup(memcg); 4207 } 4208 } 4209 4210 static int compare_thresholds(const void *a, const void *b) 4211 { 4212 const struct mem_cgroup_threshold *_a = a; 4213 const struct mem_cgroup_threshold *_b = b; 4214 4215 if (_a->threshold > _b->threshold) 4216 return 1; 4217 4218 if (_a->threshold < _b->threshold) 4219 return -1; 4220 4221 return 0; 4222 } 4223 4224 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4225 { 4226 struct mem_cgroup_eventfd_list *ev; 4227 4228 spin_lock(&memcg_oom_lock); 4229 4230 list_for_each_entry(ev, &memcg->oom_notify, list) 4231 eventfd_signal(ev->eventfd, 1); 4232 4233 spin_unlock(&memcg_oom_lock); 4234 return 0; 4235 } 4236 4237 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4238 { 4239 struct mem_cgroup *iter; 4240 4241 for_each_mem_cgroup_tree(iter, memcg) 4242 mem_cgroup_oom_notify_cb(iter); 4243 } 4244 4245 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4246 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4247 { 4248 struct mem_cgroup_thresholds *thresholds; 4249 struct 
mem_cgroup_threshold_ary *new; 4250 unsigned long threshold; 4251 unsigned long usage; 4252 int i, size, ret; 4253 4254 ret = page_counter_memparse(args, "-1", &threshold); 4255 if (ret) 4256 return ret; 4257 4258 mutex_lock(&memcg->thresholds_lock); 4259 4260 if (type == _MEM) { 4261 thresholds = &memcg->thresholds; 4262 usage = mem_cgroup_usage(memcg, false); 4263 } else if (type == _MEMSWAP) { 4264 thresholds = &memcg->memsw_thresholds; 4265 usage = mem_cgroup_usage(memcg, true); 4266 } else 4267 BUG(); 4268 4269 /* Check if a threshold crossed before adding a new one */ 4270 if (thresholds->primary) 4271 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4272 4273 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4274 4275 /* Allocate memory for new array of thresholds */ 4276 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 4277 if (!new) { 4278 ret = -ENOMEM; 4279 goto unlock; 4280 } 4281 new->size = size; 4282 4283 /* Copy thresholds (if any) to new array */ 4284 if (thresholds->primary) 4285 memcpy(new->entries, thresholds->primary->entries, 4286 flex_array_size(new, entries, size - 1)); 4287 4288 /* Add new threshold */ 4289 new->entries[size - 1].eventfd = eventfd; 4290 new->entries[size - 1].threshold = threshold; 4291 4292 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4293 sort(new->entries, size, sizeof(*new->entries), 4294 compare_thresholds, NULL); 4295 4296 /* Find current threshold */ 4297 new->current_threshold = -1; 4298 for (i = 0; i < size; i++) { 4299 if (new->entries[i].threshold <= usage) { 4300 /* 4301 * new->current_threshold will not be used until 4302 * rcu_assign_pointer(), so it's safe to increment 4303 * it here. 
4304 */ 4305 ++new->current_threshold; 4306 } else 4307 break; 4308 } 4309 4310 /* Free old spare buffer and save old primary buffer as spare */ 4311 kfree(thresholds->spare); 4312 thresholds->spare = thresholds->primary; 4313 4314 rcu_assign_pointer(thresholds->primary, new); 4315 4316 /* To be sure that nobody uses thresholds */ 4317 synchronize_rcu(); 4318 4319 unlock: 4320 mutex_unlock(&memcg->thresholds_lock); 4321 4322 return ret; 4323 } 4324 4325 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4326 struct eventfd_ctx *eventfd, const char *args) 4327 { 4328 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4329 } 4330 4331 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4332 struct eventfd_ctx *eventfd, const char *args) 4333 { 4334 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4335 } 4336 4337 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4338 struct eventfd_ctx *eventfd, enum res_type type) 4339 { 4340 struct mem_cgroup_thresholds *thresholds; 4341 struct mem_cgroup_threshold_ary *new; 4342 unsigned long usage; 4343 int i, j, size, entries; 4344 4345 mutex_lock(&memcg->thresholds_lock); 4346 4347 if (type == _MEM) { 4348 thresholds = &memcg->thresholds; 4349 usage = mem_cgroup_usage(memcg, false); 4350 } else if (type == _MEMSWAP) { 4351 thresholds = &memcg->memsw_thresholds; 4352 usage = mem_cgroup_usage(memcg, true); 4353 } else 4354 BUG(); 4355 4356 if (!thresholds->primary) 4357 goto unlock; 4358 4359 /* Check if a threshold crossed before removing */ 4360 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4361 4362 /* Calculate new number of threshold */ 4363 size = entries = 0; 4364 for (i = 0; i < thresholds->primary->size; i++) { 4365 if (thresholds->primary->entries[i].eventfd != eventfd) 4366 size++; 4367 else 4368 entries++; 4369 } 4370 4371 new = thresholds->spare; 4372 4373 /* If no items related to eventfd have been cleared, 
nothing to do */ 4374 if (!entries) 4375 goto unlock; 4376 4377 /* Set thresholds array to NULL if we don't have thresholds */ 4378 if (!size) { 4379 kfree(new); 4380 new = NULL; 4381 goto swap_buffers; 4382 } 4383 4384 new->size = size; 4385 4386 /* Copy thresholds and find current threshold */ 4387 new->current_threshold = -1; 4388 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4389 if (thresholds->primary->entries[i].eventfd == eventfd) 4390 continue; 4391 4392 new->entries[j] = thresholds->primary->entries[i]; 4393 if (new->entries[j].threshold <= usage) { 4394 /* 4395 * new->current_threshold will not be used 4396 * until rcu_assign_pointer(), so it's safe to increment 4397 * it here. 4398 */ 4399 ++new->current_threshold; 4400 } 4401 j++; 4402 } 4403 4404 swap_buffers: 4405 /* Swap primary and spare array */ 4406 thresholds->spare = thresholds->primary; 4407 4408 rcu_assign_pointer(thresholds->primary, new); 4409 4410 /* To be sure that nobody uses thresholds */ 4411 synchronize_rcu(); 4412 4413 /* If all events are unregistered, free the spare array */ 4414 if (!new) { 4415 kfree(thresholds->spare); 4416 thresholds->spare = NULL; 4417 } 4418 unlock: 4419 mutex_unlock(&memcg->thresholds_lock); 4420 } 4421 4422 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4423 struct eventfd_ctx *eventfd) 4424 { 4425 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4426 } 4427 4428 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4429 struct eventfd_ctx *eventfd) 4430 { 4431 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4432 } 4433 4434 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4435 struct eventfd_ctx *eventfd, const char *args) 4436 { 4437 struct mem_cgroup_eventfd_list *event; 4438 4439 event = kmalloc(sizeof(*event), GFP_KERNEL); 4440 if (!event) 4441 return -ENOMEM; 4442 4443 spin_lock(&memcg_oom_lock); 4444 4445 event->eventfd = eventfd; 4446 
list_add(&event->list, &memcg->oom_notify); 4447 4448 /* already in OOM ? */ 4449 if (memcg->under_oom) 4450 eventfd_signal(eventfd, 1); 4451 spin_unlock(&memcg_oom_lock); 4452 4453 return 0; 4454 } 4455 4456 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4457 struct eventfd_ctx *eventfd) 4458 { 4459 struct mem_cgroup_eventfd_list *ev, *tmp; 4460 4461 spin_lock(&memcg_oom_lock); 4462 4463 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4464 if (ev->eventfd == eventfd) { 4465 list_del(&ev->list); 4466 kfree(ev); 4467 } 4468 } 4469 4470 spin_unlock(&memcg_oom_lock); 4471 } 4472 4473 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4474 { 4475 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 4476 4477 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4478 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 4479 seq_printf(sf, "oom_kill %lu\n", 4480 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 4481 return 0; 4482 } 4483 4484 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4485 struct cftype *cft, u64 val) 4486 { 4487 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4488 4489 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4490 if (!css->parent || !((val == 0) || (val == 1))) 4491 return -EINVAL; 4492 4493 memcg->oom_kill_disable = val; 4494 if (!val) 4495 memcg_oom_recover(memcg); 4496 4497 return 0; 4498 } 4499 4500 #ifdef CONFIG_CGROUP_WRITEBACK 4501 4502 #include <trace/events/writeback.h> 4503 4504 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4505 { 4506 return wb_domain_init(&memcg->cgwb_domain, gfp); 4507 } 4508 4509 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4510 { 4511 wb_domain_exit(&memcg->cgwb_domain); 4512 } 4513 4514 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4515 { 4516 wb_domain_size_changed(&memcg->cgwb_domain); 4517 } 4518 4519 struct wb_domain 
*mem_cgroup_wb_domain(struct bdi_writeback *wb) 4520 { 4521 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4522 4523 if (!memcg->css.parent) 4524 return NULL; 4525 4526 return &memcg->cgwb_domain; 4527 } 4528 4529 /* 4530 * idx can be of type enum memcg_stat_item or node_stat_item. 4531 * Keep in sync with memcg_exact_page(). 4532 */ 4533 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) 4534 { 4535 long x = atomic_long_read(&memcg->vmstats[idx]); 4536 int cpu; 4537 4538 for_each_online_cpu(cpu) 4539 x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; 4540 if (x < 0) 4541 x = 0; 4542 return x; 4543 } 4544 4545 /** 4546 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg 4547 * @wb: bdi_writeback in question 4548 * @pfilepages: out parameter for number of file pages 4549 * @pheadroom: out parameter for number of allocatable pages according to memcg 4550 * @pdirty: out parameter for number of dirty pages 4551 * @pwriteback: out parameter for number of pages under writeback 4552 * 4553 * Determine the numbers of file, headroom, dirty, and writeback pages in 4554 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom 4555 * is a bit more involved. 4556 * 4557 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the 4558 * headroom is calculated as the lowest headroom of itself and the 4559 * ancestors. Note that this doesn't consider the actual amount of 4560 * available memory in the system. The caller should further cap 4561 * *@pheadroom accordingly. 
4562 */ 4563 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, 4564 unsigned long *pheadroom, unsigned long *pdirty, 4565 unsigned long *pwriteback) 4566 { 4567 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4568 struct mem_cgroup *parent; 4569 4570 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); 4571 4572 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); 4573 *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + 4574 memcg_exact_page_state(memcg, NR_ACTIVE_FILE); 4575 *pheadroom = PAGE_COUNTER_MAX; 4576 4577 while ((parent = parent_mem_cgroup(memcg))) { 4578 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), 4579 READ_ONCE(memcg->memory.high)); 4580 unsigned long used = page_counter_read(&memcg->memory); 4581 4582 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); 4583 memcg = parent; 4584 } 4585 } 4586 4587 /* 4588 * Foreign dirty flushing 4589 * 4590 * There's an inherent mismatch between memcg and writeback. The former 4591 * trackes ownership per-page while the latter per-inode. This was a 4592 * deliberate design decision because honoring per-page ownership in the 4593 * writeback path is complicated, may lead to higher CPU and IO overheads 4594 * and deemed unnecessary given that write-sharing an inode across 4595 * different cgroups isn't a common use-case. 4596 * 4597 * Combined with inode majority-writer ownership switching, this works well 4598 * enough in most cases but there are some pathological cases. For 4599 * example, let's say there are two cgroups A and B which keep writing to 4600 * different but confined parts of the same inode. B owns the inode and 4601 * A's memory is limited far below B's. A's dirty ratio can rise enough to 4602 * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid 4603 * triggering background writeback. A will be slowed down without a way to 4604 * make writeback of the dirty pages happen. 
4605 * 4606 * Conditions like the above can lead to a cgroup getting repatedly and 4607 * severely throttled after making some progress after each 4608 * dirty_expire_interval while the underyling IO device is almost 4609 * completely idle. 4610 * 4611 * Solving this problem completely requires matching the ownership tracking 4612 * granularities between memcg and writeback in either direction. However, 4613 * the more egregious behaviors can be avoided by simply remembering the 4614 * most recent foreign dirtying events and initiating remote flushes on 4615 * them when local writeback isn't enough to keep the memory clean enough. 4616 * 4617 * The following two functions implement such mechanism. When a foreign 4618 * page - a page whose memcg and writeback ownerships don't match - is 4619 * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning 4620 * bdi_writeback on the page owning memcg. When balance_dirty_pages() 4621 * decides that the memcg needs to sleep due to high dirty ratio, it calls 4622 * mem_cgroup_flush_foreign() which queues writeback on the recorded 4623 * foreign bdi_writebacks which haven't expired. Both the numbers of 4624 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are 4625 * limited to MEMCG_CGWB_FRN_CNT. 4626 * 4627 * The mechanism only remembers IDs and doesn't hold any object references. 4628 * As being wrong occasionally doesn't matter, updates and accesses to the 4629 * records are lockless and racy. 4630 */ 4631 void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, 4632 struct bdi_writeback *wb) 4633 { 4634 struct mem_cgroup *memcg = page->mem_cgroup; 4635 struct memcg_cgwb_frn *frn; 4636 u64 now = get_jiffies_64(); 4637 u64 oldest_at = now; 4638 int oldest = -1; 4639 int i; 4640 4641 trace_track_foreign_dirty(page, wb); 4642 4643 /* 4644 * Pick the slot to use. If there is already a slot for @wb, keep 4645 * using it. If not replace the oldest one which isn't being 4646 * written out. 
4647 */ 4648 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4649 frn = &memcg->cgwb_frn[i]; 4650 if (frn->bdi_id == wb->bdi->id && 4651 frn->memcg_id == wb->memcg_css->id) 4652 break; 4653 if (time_before64(frn->at, oldest_at) && 4654 atomic_read(&frn->done.cnt) == 1) { 4655 oldest = i; 4656 oldest_at = frn->at; 4657 } 4658 } 4659 4660 if (i < MEMCG_CGWB_FRN_CNT) { 4661 /* 4662 * Re-using an existing one. Update timestamp lazily to 4663 * avoid making the cacheline hot. We want them to be 4664 * reasonably up-to-date and significantly shorter than 4665 * dirty_expire_interval as that's what expires the record. 4666 * Use the shorter of 1s and dirty_expire_interval / 8. 4667 */ 4668 unsigned long update_intv = 4669 min_t(unsigned long, HZ, 4670 msecs_to_jiffies(dirty_expire_interval * 10) / 8); 4671 4672 if (time_before64(frn->at, now - update_intv)) 4673 frn->at = now; 4674 } else if (oldest >= 0) { 4675 /* replace the oldest free one */ 4676 frn = &memcg->cgwb_frn[oldest]; 4677 frn->bdi_id = wb->bdi->id; 4678 frn->memcg_id = wb->memcg_css->id; 4679 frn->at = now; 4680 } 4681 } 4682 4683 /* issue foreign writeback flushes for recorded foreign dirtying events */ 4684 void mem_cgroup_flush_foreign(struct bdi_writeback *wb) 4685 { 4686 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); 4687 unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); 4688 u64 now = jiffies_64; 4689 int i; 4690 4691 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { 4692 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; 4693 4694 /* 4695 * If the record is older than dirty_expire_interval, 4696 * writeback on it has already started. No need to kick it 4697 * off again. Also, don't start a new one if there's 4698 * already one in flight. 
4699 */ 4700 if (time_after64(frn->at, now - intv) && 4701 atomic_read(&frn->done.cnt) == 1) { 4702 frn->at = 0; 4703 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); 4704 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, 4705 WB_REASON_FOREIGN_FLUSH, 4706 &frn->done); 4707 } 4708 } 4709 } 4710 4711 #else /* CONFIG_CGROUP_WRITEBACK */ 4712 4713 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) 4714 { 4715 return 0; 4716 } 4717 4718 static void memcg_wb_domain_exit(struct mem_cgroup *memcg) 4719 { 4720 } 4721 4722 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) 4723 { 4724 } 4725 4726 #endif /* CONFIG_CGROUP_WRITEBACK */ 4727 4728 /* 4729 * DO NOT USE IN NEW FILES. 4730 * 4731 * "cgroup.event_control" implementation. 4732 * 4733 * This is way over-engineered. It tries to support fully configurable 4734 * events for each user. Such level of flexibility is completely 4735 * unnecessary especially in the light of the planned unified hierarchy. 4736 * 4737 * Please deprecate this and replace with something simpler if at all 4738 * possible. 4739 */ 4740 4741 /* 4742 * Unregister event and free resources. 4743 * 4744 * Gets called from workqueue. 4745 */ 4746 static void memcg_event_remove(struct work_struct *work) 4747 { 4748 struct mem_cgroup_event *event = 4749 container_of(work, struct mem_cgroup_event, remove); 4750 struct mem_cgroup *memcg = event->memcg; 4751 4752 remove_wait_queue(event->wqh, &event->wait); 4753 4754 event->unregister_event(memcg, event->eventfd); 4755 4756 /* Notify userspace the event is going away. */ 4757 eventfd_signal(event->eventfd, 1); 4758 4759 eventfd_ctx_put(event->eventfd); 4760 kfree(event); 4761 css_put(&memcg->css); 4762 } 4763 4764 /* 4765 * Gets called on EPOLLHUP on eventfd when user closes it. 4766 * 4767 * Called with wqh->lock held and interrupts disabled. 
4768 */ 4769 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 4770 int sync, void *key) 4771 { 4772 struct mem_cgroup_event *event = 4773 container_of(wait, struct mem_cgroup_event, wait); 4774 struct mem_cgroup *memcg = event->memcg; 4775 __poll_t flags = key_to_poll(key); 4776 4777 if (flags & EPOLLHUP) { 4778 /* 4779 * If the event has been detached at cgroup removal, we 4780 * can simply return knowing the other side will cleanup 4781 * for us. 4782 * 4783 * We can't race against event freeing since the other 4784 * side will require wqh->lock via remove_wait_queue(), 4785 * which we hold. 4786 */ 4787 spin_lock(&memcg->event_list_lock); 4788 if (!list_empty(&event->list)) { 4789 list_del_init(&event->list); 4790 /* 4791 * We are in atomic context, but cgroup_event_remove() 4792 * may sleep, so we have to call it in workqueue. 4793 */ 4794 schedule_work(&event->remove); 4795 } 4796 spin_unlock(&memcg->event_list_lock); 4797 } 4798 4799 return 0; 4800 } 4801 4802 static void memcg_event_ptable_queue_proc(struct file *file, 4803 wait_queue_head_t *wqh, poll_table *pt) 4804 { 4805 struct mem_cgroup_event *event = 4806 container_of(pt, struct mem_cgroup_event, pt); 4807 4808 event->wqh = wqh; 4809 add_wait_queue(wqh, &event->wait); 4810 } 4811 4812 /* 4813 * DO NOT USE IN NEW FILES. 4814 * 4815 * Parse input and register new cgroup event handler. 4816 * 4817 * Input must be in format '<event_fd> <control_fd> <args>'. 4818 * Interpretation of args is defined by control file implementation. 
4819 */ 4820 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4821 char *buf, size_t nbytes, loff_t off) 4822 { 4823 struct cgroup_subsys_state *css = of_css(of); 4824 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4825 struct mem_cgroup_event *event; 4826 struct cgroup_subsys_state *cfile_css; 4827 unsigned int efd, cfd; 4828 struct fd efile; 4829 struct fd cfile; 4830 const char *name; 4831 char *endp; 4832 int ret; 4833 4834 buf = strstrip(buf); 4835 4836 efd = simple_strtoul(buf, &endp, 10); 4837 if (*endp != ' ') 4838 return -EINVAL; 4839 buf = endp + 1; 4840 4841 cfd = simple_strtoul(buf, &endp, 10); 4842 if ((*endp != ' ') && (*endp != '\0')) 4843 return -EINVAL; 4844 buf = endp + 1; 4845 4846 event = kzalloc(sizeof(*event), GFP_KERNEL); 4847 if (!event) 4848 return -ENOMEM; 4849 4850 event->memcg = memcg; 4851 INIT_LIST_HEAD(&event->list); 4852 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4853 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4854 INIT_WORK(&event->remove, memcg_event_remove); 4855 4856 efile = fdget(efd); 4857 if (!efile.file) { 4858 ret = -EBADF; 4859 goto out_kfree; 4860 } 4861 4862 event->eventfd = eventfd_ctx_fileget(efile.file); 4863 if (IS_ERR(event->eventfd)) { 4864 ret = PTR_ERR(event->eventfd); 4865 goto out_put_efile; 4866 } 4867 4868 cfile = fdget(cfd); 4869 if (!cfile.file) { 4870 ret = -EBADF; 4871 goto out_put_eventfd; 4872 } 4873 4874 /* the process need read permission on control file */ 4875 /* AV: shouldn't we check that it's been opened for read instead? */ 4876 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4877 if (ret < 0) 4878 goto out_put_cfile; 4879 4880 /* 4881 * Determine the event callbacks and set them in @event. This used 4882 * to be done via struct cftype but cgroup core no longer knows 4883 * about these events. The following is crude but the whole thing 4884 * is for compatibility anyway. 4885 * 4886 * DO NOT ADD NEW FILES. 
4887 */ 4888 name = cfile.file->f_path.dentry->d_name.name; 4889 4890 if (!strcmp(name, "memory.usage_in_bytes")) { 4891 event->register_event = mem_cgroup_usage_register_event; 4892 event->unregister_event = mem_cgroup_usage_unregister_event; 4893 } else if (!strcmp(name, "memory.oom_control")) { 4894 event->register_event = mem_cgroup_oom_register_event; 4895 event->unregister_event = mem_cgroup_oom_unregister_event; 4896 } else if (!strcmp(name, "memory.pressure_level")) { 4897 event->register_event = vmpressure_register_event; 4898 event->unregister_event = vmpressure_unregister_event; 4899 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4900 event->register_event = memsw_cgroup_usage_register_event; 4901 event->unregister_event = memsw_cgroup_usage_unregister_event; 4902 } else { 4903 ret = -EINVAL; 4904 goto out_put_cfile; 4905 } 4906 4907 /* 4908 * Verify @cfile should belong to @css. Also, remaining events are 4909 * automatically removed on cgroup destruction but the removal is 4910 * asynchronous, so take an extra ref on @css. 
4911 */ 4912 cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, 4913 &memory_cgrp_subsys); 4914 ret = -EINVAL; 4915 if (IS_ERR(cfile_css)) 4916 goto out_put_cfile; 4917 if (cfile_css != css) { 4918 css_put(cfile_css); 4919 goto out_put_cfile; 4920 } 4921 4922 ret = event->register_event(memcg, event->eventfd, buf); 4923 if (ret) 4924 goto out_put_css; 4925 4926 vfs_poll(efile.file, &event->pt); 4927 4928 spin_lock(&memcg->event_list_lock); 4929 list_add(&event->list, &memcg->event_list); 4930 spin_unlock(&memcg->event_list_lock); 4931 4932 fdput(cfile); 4933 fdput(efile); 4934 4935 return nbytes; 4936 4937 out_put_css: 4938 css_put(css); 4939 out_put_cfile: 4940 fdput(cfile); 4941 out_put_eventfd: 4942 eventfd_ctx_put(event->eventfd); 4943 out_put_efile: 4944 fdput(efile); 4945 out_kfree: 4946 kfree(event); 4947 4948 return ret; 4949 } 4950 4951 static struct cftype mem_cgroup_legacy_files[] = { 4952 { 4953 .name = "usage_in_bytes", 4954 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4955 .read_u64 = mem_cgroup_read_u64, 4956 }, 4957 { 4958 .name = "max_usage_in_bytes", 4959 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4960 .write = mem_cgroup_reset, 4961 .read_u64 = mem_cgroup_read_u64, 4962 }, 4963 { 4964 .name = "limit_in_bytes", 4965 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4966 .write = mem_cgroup_write, 4967 .read_u64 = mem_cgroup_read_u64, 4968 }, 4969 { 4970 .name = "soft_limit_in_bytes", 4971 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4972 .write = mem_cgroup_write, 4973 .read_u64 = mem_cgroup_read_u64, 4974 }, 4975 { 4976 .name = "failcnt", 4977 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4978 .write = mem_cgroup_reset, 4979 .read_u64 = mem_cgroup_read_u64, 4980 }, 4981 { 4982 .name = "stat", 4983 .seq_show = memcg_stat_show, 4984 }, 4985 { 4986 .name = "force_empty", 4987 .write = mem_cgroup_force_empty_write, 4988 }, 4989 { 4990 .name = "use_hierarchy", 4991 .write_u64 = mem_cgroup_hierarchy_write, 4992 .read_u64 
= mem_cgroup_hierarchy_read, 4993 }, 4994 { 4995 .name = "cgroup.event_control", /* XXX: for compat */ 4996 .write = memcg_write_event_control, 4997 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 4998 }, 4999 { 5000 .name = "swappiness", 5001 .read_u64 = mem_cgroup_swappiness_read, 5002 .write_u64 = mem_cgroup_swappiness_write, 5003 }, 5004 { 5005 .name = "move_charge_at_immigrate", 5006 .read_u64 = mem_cgroup_move_charge_read, 5007 .write_u64 = mem_cgroup_move_charge_write, 5008 }, 5009 { 5010 .name = "oom_control", 5011 .seq_show = mem_cgroup_oom_control_read, 5012 .write_u64 = mem_cgroup_oom_control_write, 5013 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5014 }, 5015 { 5016 .name = "pressure_level", 5017 }, 5018 #ifdef CONFIG_NUMA 5019 { 5020 .name = "numa_stat", 5021 .seq_show = memcg_numa_stat_show, 5022 }, 5023 #endif 5024 { 5025 .name = "kmem.limit_in_bytes", 5026 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5027 .write = mem_cgroup_write, 5028 .read_u64 = mem_cgroup_read_u64, 5029 }, 5030 { 5031 .name = "kmem.usage_in_bytes", 5032 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5033 .read_u64 = mem_cgroup_read_u64, 5034 }, 5035 { 5036 .name = "kmem.failcnt", 5037 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5038 .write = mem_cgroup_reset, 5039 .read_u64 = mem_cgroup_read_u64, 5040 }, 5041 { 5042 .name = "kmem.max_usage_in_bytes", 5043 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5044 .write = mem_cgroup_reset, 5045 .read_u64 = mem_cgroup_read_u64, 5046 }, 5047 #if defined(CONFIG_MEMCG_KMEM) && \ 5048 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5049 { 5050 .name = "kmem.slabinfo", 5051 .seq_show = memcg_slab_show, 5052 }, 5053 #endif 5054 { 5055 .name = "kmem.tcp.limit_in_bytes", 5056 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 5057 .write = mem_cgroup_write, 5058 .read_u64 = mem_cgroup_read_u64, 5059 }, 5060 { 5061 .name = "kmem.tcp.usage_in_bytes", 5062 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 5063 .read_u64 = 
mem_cgroup_read_u64, 5064 }, 5065 { 5066 .name = "kmem.tcp.failcnt", 5067 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 5068 .write = mem_cgroup_reset, 5069 .read_u64 = mem_cgroup_read_u64, 5070 }, 5071 { 5072 .name = "kmem.tcp.max_usage_in_bytes", 5073 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 5074 .write = mem_cgroup_reset, 5075 .read_u64 = mem_cgroup_read_u64, 5076 }, 5077 { }, /* terminate */ 5078 }; 5079 5080 /* 5081 * Private memory cgroup IDR 5082 * 5083 * Swap-out records and page cache shadow entries need to store memcg 5084 * references in constrained space, so we maintain an ID space that is 5085 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of 5086 * memory-controlled cgroups to 64k. 5087 * 5088 * However, there usually are many references to the offline CSS after 5089 * the cgroup has been destroyed, such as page cache or reclaimable 5090 * slab objects, that don't need to hang on to the ID. We want to keep 5091 * those dead CSS from occupying IDs, or we might quickly exhaust the 5092 * relatively small ID space and prevent the creation of new cgroups 5093 * even when there are much fewer than 64k cgroups - possibly none. 5094 * 5095 * Maintain a private 16-bit ID space for memcg, and allow the ID to 5096 * be freed and recycled when it's no longer needed, which is usually 5097 * when the CSS is offlined. 5098 * 5099 * The only exception to that are records of swapped out tmpfs/shmem 5100 * pages that need to be attributed to live ancestors on swapin. But 5101 * those references are manageable from userspace. 
5102 */ 5103 5104 static DEFINE_IDR(mem_cgroup_idr); 5105 5106 static void mem_cgroup_id_remove(struct mem_cgroup *memcg) 5107 { 5108 if (memcg->id.id > 0) { 5109 idr_remove(&mem_cgroup_idr, memcg->id.id); 5110 memcg->id.id = 0; 5111 } 5112 } 5113 5114 static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, 5115 unsigned int n) 5116 { 5117 refcount_add(n, &memcg->id.ref); 5118 } 5119 5120 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) 5121 { 5122 if (refcount_sub_and_test(n, &memcg->id.ref)) { 5123 mem_cgroup_id_remove(memcg); 5124 5125 /* Memcg ID pins CSS */ 5126 css_put(&memcg->css); 5127 } 5128 } 5129 5130 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 5131 { 5132 mem_cgroup_id_put_many(memcg, 1); 5133 } 5134 5135 /** 5136 * mem_cgroup_from_id - look up a memcg from a memcg id 5137 * @id: the memcg id to look up 5138 * 5139 * Caller must hold rcu_read_lock(). 5140 */ 5141 struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 5142 { 5143 WARN_ON_ONCE(!rcu_read_lock_held()); 5144 return idr_find(&mem_cgroup_idr, id); 5145 } 5146 5147 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5148 { 5149 struct mem_cgroup_per_node *pn; 5150 int tmp = node; 5151 /* 5152 * This routine is called against possible nodes. 5153 * But it's BUG to call kmalloc() against offline node. 5154 * 5155 * TODO: this routine can waste much memory for nodes which will 5156 * never be onlined. It's better to use memory hotplug callback 5157 * function. 
5158 */ 5159 if (!node_state(node, N_NORMAL_MEMORY)) 5160 tmp = -1; 5161 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5162 if (!pn) 5163 return 1; 5164 5165 pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, 5166 GFP_KERNEL_ACCOUNT); 5167 if (!pn->lruvec_stat_local) { 5168 kfree(pn); 5169 return 1; 5170 } 5171 5172 pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, 5173 GFP_KERNEL_ACCOUNT); 5174 if (!pn->lruvec_stat_cpu) { 5175 free_percpu(pn->lruvec_stat_local); 5176 kfree(pn); 5177 return 1; 5178 } 5179 5180 lruvec_init(&pn->lruvec); 5181 pn->usage_in_excess = 0; 5182 pn->on_tree = false; 5183 pn->memcg = memcg; 5184 5185 memcg->nodeinfo[node] = pn; 5186 return 0; 5187 } 5188 5189 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) 5190 { 5191 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; 5192 5193 if (!pn) 5194 return; 5195 5196 free_percpu(pn->lruvec_stat_cpu); 5197 free_percpu(pn->lruvec_stat_local); 5198 kfree(pn); 5199 } 5200 5201 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5202 { 5203 int node; 5204 5205 for_each_node(node) 5206 free_mem_cgroup_per_node_info(memcg, node); 5207 free_percpu(memcg->vmstats_percpu); 5208 free_percpu(memcg->vmstats_local); 5209 kfree(memcg); 5210 } 5211 5212 static void mem_cgroup_free(struct mem_cgroup *memcg) 5213 { 5214 memcg_wb_domain_exit(memcg); 5215 /* 5216 * Flush percpu vmstats and vmevents to guarantee the value correctness 5217 * on parent's and all ancestor levels. 
5218 */ 5219 memcg_flush_percpu_vmstats(memcg); 5220 memcg_flush_percpu_vmevents(memcg); 5221 __mem_cgroup_free(memcg); 5222 } 5223 5224 static struct mem_cgroup *mem_cgroup_alloc(void) 5225 { 5226 struct mem_cgroup *memcg; 5227 unsigned int size; 5228 int node; 5229 int __maybe_unused i; 5230 long error = -ENOMEM; 5231 5232 size = sizeof(struct mem_cgroup); 5233 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5234 5235 memcg = kzalloc(size, GFP_KERNEL); 5236 if (!memcg) 5237 return ERR_PTR(error); 5238 5239 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 5240 1, MEM_CGROUP_ID_MAX, 5241 GFP_KERNEL); 5242 if (memcg->id.id < 0) { 5243 error = memcg->id.id; 5244 goto fail; 5245 } 5246 5247 memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5248 GFP_KERNEL_ACCOUNT); 5249 if (!memcg->vmstats_local) 5250 goto fail; 5251 5252 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, 5253 GFP_KERNEL_ACCOUNT); 5254 if (!memcg->vmstats_percpu) 5255 goto fail; 5256 5257 for_each_node(node) 5258 if (alloc_mem_cgroup_per_node_info(memcg, node)) 5259 goto fail; 5260 5261 if (memcg_wb_domain_init(memcg, GFP_KERNEL)) 5262 goto fail; 5263 5264 INIT_WORK(&memcg->high_work, high_work_func); 5265 INIT_LIST_HEAD(&memcg->oom_notify); 5266 mutex_init(&memcg->thresholds_lock); 5267 spin_lock_init(&memcg->move_lock); 5268 vmpressure_init(&memcg->vmpressure); 5269 INIT_LIST_HEAD(&memcg->event_list); 5270 spin_lock_init(&memcg->event_list_lock); 5271 memcg->socket_pressure = jiffies; 5272 #ifdef CONFIG_MEMCG_KMEM 5273 memcg->kmemcg_id = -1; 5274 INIT_LIST_HEAD(&memcg->objcg_list); 5275 #endif 5276 #ifdef CONFIG_CGROUP_WRITEBACK 5277 INIT_LIST_HEAD(&memcg->cgwb_list); 5278 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5279 memcg->cgwb_frn[i].done = 5280 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5281 #endif 5282 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5283 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); 5284 
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); 5285 memcg->deferred_split_queue.split_queue_len = 0; 5286 #endif 5287 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5288 return memcg; 5289 fail: 5290 mem_cgroup_id_remove(memcg); 5291 __mem_cgroup_free(memcg); 5292 return ERR_PTR(error); 5293 } 5294 5295 static struct cgroup_subsys_state * __ref 5296 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5297 { 5298 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5299 struct mem_cgroup *memcg, *old_memcg; 5300 long error = -ENOMEM; 5301 5302 old_memcg = set_active_memcg(parent); 5303 memcg = mem_cgroup_alloc(); 5304 set_active_memcg(old_memcg); 5305 if (IS_ERR(memcg)) 5306 return ERR_CAST(memcg); 5307 5308 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5309 memcg->soft_limit = PAGE_COUNTER_MAX; 5310 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5311 if (parent) { 5312 memcg->swappiness = mem_cgroup_swappiness(parent); 5313 memcg->oom_kill_disable = parent->oom_kill_disable; 5314 5315 page_counter_init(&memcg->memory, &parent->memory); 5316 page_counter_init(&memcg->swap, &parent->swap); 5317 page_counter_init(&memcg->kmem, &parent->kmem); 5318 page_counter_init(&memcg->tcpmem, &parent->tcpmem); 5319 } else { 5320 page_counter_init(&memcg->memory, NULL); 5321 page_counter_init(&memcg->swap, NULL); 5322 page_counter_init(&memcg->kmem, NULL); 5323 page_counter_init(&memcg->tcpmem, NULL); 5324 5325 root_mem_cgroup = memcg; 5326 return &memcg->css; 5327 } 5328 5329 /* The following stuff does not apply to the root */ 5330 error = memcg_online_kmem(memcg); 5331 if (error) 5332 goto fail; 5333 5334 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) 5335 static_branch_inc(&memcg_sockets_enabled_key); 5336 5337 return &memcg->css; 5338 fail: 5339 mem_cgroup_id_remove(memcg); 5340 mem_cgroup_free(memcg); 5341 return ERR_PTR(error); 5342 } 5343 5344 static int mem_cgroup_css_online(struct 
cgroup_subsys_state *css) 5345 { 5346 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5347 5348 /* 5349 * A memcg must be visible for memcg_expand_shrinker_maps() 5350 * by the time the maps are allocated. So, we allocate maps 5351 * here, when for_each_mem_cgroup() can't skip it. 5352 */ 5353 if (memcg_alloc_shrinker_maps(memcg)) { 5354 mem_cgroup_id_remove(memcg); 5355 return -ENOMEM; 5356 } 5357 5358 /* Online state pins memcg ID, memcg ID pins CSS */ 5359 refcount_set(&memcg->id.ref, 1); 5360 css_get(css); 5361 return 0; 5362 } 5363 5364 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5365 { 5366 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5367 struct mem_cgroup_event *event, *tmp; 5368 5369 /* 5370 * Unregister events and notify userspace. 5371 * Notify userspace about cgroup removing only after rmdir of cgroup 5372 * directory to avoid race between userspace and kernelspace. 5373 */ 5374 spin_lock(&memcg->event_list_lock); 5375 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5376 list_del_init(&event->list); 5377 schedule_work(&event->remove); 5378 } 5379 spin_unlock(&memcg->event_list_lock); 5380 5381 page_counter_set_min(&memcg->memory, 0); 5382 page_counter_set_low(&memcg->memory, 0); 5383 5384 memcg_offline_kmem(memcg); 5385 wb_memcg_offline(memcg); 5386 5387 drain_all_stock(memcg); 5388 5389 mem_cgroup_id_put(memcg); 5390 } 5391 5392 static void mem_cgroup_css_released(struct cgroup_subsys_state *css) 5393 { 5394 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5395 5396 invalidate_reclaim_iterators(memcg); 5397 } 5398 5399 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5400 { 5401 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5402 int __maybe_unused i; 5403 5404 #ifdef CONFIG_CGROUP_WRITEBACK 5405 for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) 5406 wb_wait_for_completion(&memcg->cgwb_frn[i].done); 5407 #endif 5408 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && 
!cgroup_memory_nosocket) 5409 static_branch_dec(&memcg_sockets_enabled_key); 5410 5411 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) 5412 static_branch_dec(&memcg_sockets_enabled_key); 5413 5414 vmpressure_cleanup(&memcg->vmpressure); 5415 cancel_work_sync(&memcg->high_work); 5416 mem_cgroup_remove_from_trees(memcg); 5417 memcg_free_shrinker_maps(memcg); 5418 memcg_free_kmem(memcg); 5419 mem_cgroup_free(memcg); 5420 } 5421 5422 /** 5423 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5424 * @css: the target css 5425 * 5426 * Reset the states of the mem_cgroup associated with @css. This is 5427 * invoked when the userland requests disabling on the default hierarchy 5428 * but the memcg is pinned through dependency. The memcg should stop 5429 * applying policies and should revert to the vanilla state as it may be 5430 * made visible again. 5431 * 5432 * The current implementation only resets the essential configurations. 5433 * This needs to be expanded to cover all the visible parts. 5434 */ 5435 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5436 { 5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5438 5439 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); 5440 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); 5441 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); 5442 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); 5443 page_counter_set_min(&memcg->memory, 0); 5444 page_counter_set_low(&memcg->memory, 0); 5445 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); 5446 memcg->soft_limit = PAGE_COUNTER_MAX; 5447 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); 5448 memcg_wb_domain_size_changed(memcg); 5449 } 5450 5451 #ifdef CONFIG_MMU 5452 /* Handlers for move charge at task migration. 
*/ 5453 static int mem_cgroup_do_precharge(unsigned long count) 5454 { 5455 int ret; 5456 5457 /* Try a single bulk charge without reclaim first, kswapd may wake */ 5458 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 5459 if (!ret) { 5460 mc.precharge += count; 5461 return ret; 5462 } 5463 5464 /* Try charges one by one with reclaim, but do not retry */ 5465 while (count--) { 5466 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 5467 if (ret) 5468 return ret; 5469 mc.precharge++; 5470 cond_resched(); 5471 } 5472 return 0; 5473 } 5474 5475 union mc_target { 5476 struct page *page; 5477 swp_entry_t ent; 5478 }; 5479 5480 enum mc_target_type { 5481 MC_TARGET_NONE = 0, 5482 MC_TARGET_PAGE, 5483 MC_TARGET_SWAP, 5484 MC_TARGET_DEVICE, 5485 }; 5486 5487 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5488 unsigned long addr, pte_t ptent) 5489 { 5490 struct page *page = vm_normal_page(vma, addr, ptent); 5491 5492 if (!page || !page_mapped(page)) 5493 return NULL; 5494 if (PageAnon(page)) { 5495 if (!(mc.flags & MOVE_ANON)) 5496 return NULL; 5497 } else { 5498 if (!(mc.flags & MOVE_FILE)) 5499 return NULL; 5500 } 5501 if (!get_page_unless_zero(page)) 5502 return NULL; 5503 5504 return page; 5505 } 5506 5507 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 5508 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5509 pte_t ptent, swp_entry_t *entry) 5510 { 5511 struct page *page = NULL; 5512 swp_entry_t ent = pte_to_swp_entry(ptent); 5513 5514 if (!(mc.flags & MOVE_ANON)) 5515 return NULL; 5516 5517 /* 5518 * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 5519 * a device and because they are not accessible by CPU they are store 5520 * as special swap entry in the CPU page table. 
5521 */ 5522 if (is_device_private_entry(ent)) { 5523 page = device_private_entry_to_page(ent); 5524 /* 5525 * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 5526 * a refcount of 1 when free (unlike normal page) 5527 */ 5528 if (!page_ref_add_unless(page, 1, 1)) 5529 return NULL; 5530 return page; 5531 } 5532 5533 if (non_swap_entry(ent)) 5534 return NULL; 5535 5536 /* 5537 * Because lookup_swap_cache() updates some statistics counter, 5538 * we call find_get_page() with swapper_space directly. 5539 */ 5540 page = find_get_page(swap_address_space(ent), swp_offset(ent)); 5541 entry->val = ent.val; 5542 5543 return page; 5544 } 5545 #else 5546 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5547 pte_t ptent, swp_entry_t *entry) 5548 { 5549 return NULL; 5550 } 5551 #endif 5552 5553 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5554 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5555 { 5556 if (!vma->vm_file) /* anonymous vma */ 5557 return NULL; 5558 if (!(mc.flags & MOVE_FILE)) 5559 return NULL; 5560 5561 /* page is moved even if it's not RSS of this task(page-faulted). */ 5562 /* shmem/tmpfs may report page out on swap: account for that too. */ 5563 return find_get_incore_page(vma->vm_file->f_mapping, 5564 linear_page_index(vma, addr)); 5565 } 5566 5567 /** 5568 * mem_cgroup_move_account - move account of the page 5569 * @page: the page 5570 * @compound: charge the page as compound or small page 5571 * @from: mem_cgroup which the page is moved from. 5572 * @to: mem_cgroup which the page is moved to. @from != @to. 5573 * 5574 * The caller must make sure the page is not on LRU (isolate_page() is useful.) 5575 * 5576 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 5577 * from old cgroup. 
*/
static int mem_cgroup_move_account(struct page *page,
				   bool compound,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
	/*
	 * Returns 0 on success, -EBUSY if the page lock could not be taken
	 * and -EINVAL if the page is not (or no longer) charged to @from.
	 */
	struct lruvec *from_vec, *to_vec;
	struct pglist_data *pgdat;
	/* A compound move transfers the whole THP, otherwise a single page */
	unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
	int ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON(compound && !PageTransHuge(page));

	/*
	 * Prevent mem_cgroup_migrate() from looking at
	 * page->mem_cgroup of its source page while we change it.
	 */
	ret = -EBUSY;
	if (!trylock_page(page))
		goto out;

	ret = -EINVAL;
	if (page->mem_cgroup != from)
		goto out_unlock;

	/* Both lruvecs live on the page's node, one per memcg */
	pgdat = page_pgdat(page);
	from_vec = mem_cgroup_lruvec(from, pgdat);
	to_vec = mem_cgroup_lruvec(to, pgdat);

	lock_page_memcg(page);

	/*
	 * Transfer every per-lruvec counter this page currently contributes
	 * to: subtract from @from's lruvec, add to @to's lruvec.
	 */
	if (PageAnon(page)) {
		if (page_mapped(page)) {
			__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
			__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
			if (PageTransHuge(page)) {
				__mod_lruvec_state(from_vec, NR_ANON_THPS,
						   -nr_pages);
				__mod_lruvec_state(to_vec, NR_ANON_THPS,
						   nr_pages);
			}

		}
	} else {
		__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
		__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);

		if (PageSwapBacked(page)) {
			__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
			__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
		}

		if (page_mapped(page)) {
			__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
			__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
		}

		if (PageDirty(page)) {
			struct address_space *mapping = page_mapping(page);

			/* Dirty pages are only moved for writeback-capable mappings */
			if (mapping_can_writeback(mapping)) {
				__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
						   -nr_pages);
				__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
						   nr_pages);
			}
		}
	}

	if (PageWriteback(page)) {
		__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
		__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
	}

	/*
	 * All state has been migrated, let's switch to the new memcg.
	 *
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, isolated, and locked: we can't race
	 * with (un)charging, migration, LRU putback, or anything else
	 * that would rely on a stable page->mem_cgroup.
	 *
	 * Note that lock_page_memcg is a memcg lock, not a page lock,
	 * to save space. As soon as we switch page->mem_cgroup to a
	 * new memcg that isn't locked, the above state can change
	 * concurrently again. Make sure we're truly done with it.
	 */
	smp_mb();

	/* The page's css reference moves along with the page */
	css_get(&to->css);
	css_put(&from->css);

	page->mem_cgroup = to;

	/* The lock was taken on @from, so it must be released on @from */
	__unlock_page_memcg(from);

	ret = 0;

	/* Statistics and event checks are updated with local IRQs disabled */
	local_irq_disable();
	mem_cgroup_charge_statistics(to, page, nr_pages);
	memcg_check_events(to, page);
	mem_cgroup_charge_statistics(from, page, -nr_pages);
	memcg_check_events(from, page);
	local_irq_enable();
out_unlock:
	unlock_page(page);
out:
	return ret;
}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer the target page or swap ent will be stored(can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. if @target is not NULL, the page is stored in target->page
 *     with extra refcnt got(Callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration.
if @target is not NULL, the entry is stored
 *     in target->ent.
 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is
 *     MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the lru).
 *     For now such a page is charged like a regular page would be, as for
 *     all intents and purposes it is just special memory taking the place
 *     of a regular page.
 *
 *     See Documentation/vm/hmm.txt and include/linux/hmm.h
 *
 * Called with pte lock held.
 */

static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	/* Pick the lookup helper that matches the state of the pte */
	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, ptent, &ent);
	else if (pte_none(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only loose check w/o serialization.
		 * mem_cgroup_move_account() checks the page is valid or
		 * not under LRU exclusion.
		 */
		if (page->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (is_device_private_page(page))
				ret = MC_TARGET_DEVICE;
			if (target)
				target->page = page;
		}
		/* Keep the reference only when the page is handed to the caller */
		if (!ret || !target)
			put_page(page);
	}
	/*
	 * There is a swap entry and a page doesn't exist or isn't charged.
	 * But we cannot move a tail-page in a THP.
	 */
	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 *
 * Returns MC_TARGET_PAGE when the huge page is charged to mc.from and
 * MOVE_ANON is enabled; in that case, and if @target is not NULL, a page
 * reference is taken and the page is stored in target->page.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	if (unlikely(is_swap_pmd(pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(pmd));
		return ret;
	}
	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page->mem_cgroup == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			get_page(page);
			target->page = page;
		}
	}
	return ret;
}
#else
/* Without THP support there is never a huge-page target. */
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

/*
 * pmd_entry callback of the counting walk: add the number of movable
 * targets found under @pmd to mc.precharge.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/*
		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
		 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
		 * this might change.
		 */
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops precharge_walk_ops = {
	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
};

/*
 * Walk the whole address space of @mm under the mmap read lock and return
 * how many pages' worth of targets were counted.  mc.precharge is used as
 * the scratch counter during the walk and is reset to 0 before returning.
 */
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;

	mmap_read_lock(mm);
	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
	mmap_read_unlock(mm);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

/*
 * Count the movable targets of @mm, mark current as the moving task and
 * precharge mc.to for that many pages.  Returns the result of
 * mem_cgroup_do_precharge().
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		mem_cgroup_id_put_many(mc.from, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

/*
 * Finish (or abort) a charge move: drop leftover charges, reset the
 * from/to/mm fields of mc under mc.lock and release the mm reference.
 */
static void mem_cgroup_clear_mc(void)
{
	struct mm_struct *mm = mc.mm;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	mc.mm = NULL;
	spin_unlock(&mc.lock);

	mmput(mm);
}

/*
 * ->can_attach() callback: if the destination memcg has
 * move_charge_at_immigrate set and the migrating leader owns its mm, set up
 * the move context in mc and precharge the destination.  Returns 0 or the
 * precharge error.
 */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
	struct mem_cgroup *from;
	struct task_struct *leader, *p;
	struct mm_struct *mm;
	unsigned long move_flags;
	int ret = 0;

	/* charge immigration isn't supported on the default hierarchy */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return 0;

	/*
	 * Multi-process migrations only happen on the default hierarchy
	 * where charge immigration is not used.  Perform charge
	 * immigration if @tset contains a leader and whine if there are
	 * multiple.
	 */
	p = NULL;
	cgroup_taskset_for_each_leader(leader, css, tset) {
		WARN_ON_ONCE(p);
		p = leader;
		memcg = mem_cgroup_from_css(css);
	}
	if (!p)
		return 0;

	/*
	 * We are now committed to this value whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (!move_flags)
		return 0;

	from = mem_cgroup_from_task(p);

	VM_BUG_ON(from == memcg);

	mm = get_task_mm(p);
	if (!mm)
		return 0;
	/* We move charges only when we move a owner of the mm */
	if (mm->owner == p) {
		VM_BUG_ON(mc.from);
		VM_BUG_ON(mc.to);
		VM_BUG_ON(mc.precharge);
		VM_BUG_ON(mc.moved_charge);
		VM_BUG_ON(mc.moved_swap);

		spin_lock(&mc.lock);
		mc.mm = mm;
		mc.from = from;
		mc.to = memcg;
		mc.flags = move_flags;
		spin_unlock(&mc.lock);
		/* We set mc.moving_task later */

		ret = mem_cgroup_precharge_mc(mm);
		if (ret)
			mem_cgroup_clear_mc();
	} else {
		mmput(mm);
	}
	return ret;
}

/* ->cancel_attach() callback: undo a move set up by can_attach(), if any. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

/*
 * pmd_entry callback of the moving walk: move the charge of every target
 * under @pmd from mc.from to mc.to, consuming mc.precharge.  Returns 0, or
 * the error from mem_cgroup_do_precharge() when the precharge ran out and
 * could not be refilled.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* A huge page is only moved if a full THP of precharge is left */
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		} else if (target_type == MC_TARGET_DEVICE) {
			page = target.page;
			if (!mem_cgroup_move_account(page, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		bool device = false;
		swp_entry_t ent;

		/* Out of precharge: leave the loop and try to refill below */
		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			fallthrough;
		case MC_TARGET_PAGE:
			page = target.page;
			/*
			 * We can have a part of the split pmd here. Moving it
			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in original
			 * memcg. There should be somebody mapping the head.
			 */
			if (PageTransCompound(page))
				goto put;
			if (!device && isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fixup other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry	= mem_cgroup_move_charge_pte_range,
};

/* Walk mc.mm and move the charges of all targets from mc.from to mc.to. */
static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();
	/*
	 * Signal lock_page_memcg() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone who is holding the mmap_lock might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
			NULL);

	mmap_read_unlock(mc.mm);
	atomic_dec(&mc.from->moving_account);
}

/* ->post_attach() callback: carry out and then tear down a pending move. */
static void mem_cgroup_move_task(void)
{
	if (mc.to) {
		mem_cgroup_move_charge();
		mem_cgroup_clear_mc();
	}
}
#else	/* !CONFIG_MMU */
/* Charge moving is compiled out without an MMU: the callbacks are no-ops. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(void)
{
}
#endif

/*
 * Print a page-counter tunable: "max" for PAGE_COUNTER_MAX, otherwise the
 * value converted from pages to bytes.  Always returns 0.
 */
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
	if (value == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);

	return 0;
}

/* "current" file: the memory counter of the cgroup, in bytes. */
static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

/* "min" file, read side. */
static int memory_min_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}

/*
 * "min" file, write side: parse the value ("max" is accepted) and install
 * it.  Returns @nbytes on success or the parse error.
 */
static ssize_t memory_min_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long min;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &min);
	if (err)
		return err;

	page_counter_set_min(&memcg->memory, min);

	return nbytes;
}

/* "low" file, read side. */
static int memory_low_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}

/*
 * "low" file, write side: parse the value ("max" is accepted) and install
 * it.  Returns @nbytes on success or the parse error.
 */
static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long low;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &low);
	if (err)
		return err;

	page_counter_set_low(&memcg->memory, low);

	return nbytes;
}

/* "high" file, read side. */
static int memory_high_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}

/*
 * "high" file, write side: reclaim towards the new value first, then
 * install it.  Returns @nbytes on success or the parse error.
 */
static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	/*
	 * Loop until usage is within the new limit, a signal is pending,
	 * or reclaim has made no progress and the retries are used up.
	 */
	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);
		unsigned long reclaimed;

		if (nr_pages <= high)
			break;

		if (signal_pending(current))
			break;

		/* Drain the stocks once before resorting to reclaim */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
							 GFP_KERNEL, true);

		if (!reclaimed && !nr_retries--)
			break;
	}

	page_counter_set_high(&memcg->memory, high);

	memcg_wb_domain_size_changed(memcg);

	return nbytes;
}

/* "max" file, read side. */
static int memory_max_show(struct seq_file *m, void *v)
{
	return seq_puts_memcg_tunable(m,
		READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}

/*
 * "max" file, write side: install the new limit first, then push usage
 * below it by draining stocks, reclaiming and finally invoking the memcg
 * OOM killer.  Returns @nbytes on success or the parse error.
 */
static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
	bool drained = false;
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	xchg(&memcg->memory.max, max);

	for (;;) {
		unsigned long nr_pages = page_counter_read(&memcg->memory);

		if (nr_pages <= max)
			break;

		if (signal_pending(current))
			break;

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		/* Only reclaim attempts that free nothing use up a retry */
		if (nr_reclaims) {
			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
							  GFP_KERNEL, true))
				nr_reclaims--;
			continue;
		}

		memcg_memory_event(memcg, MEMCG_OOM);
		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
			break;
	}

	memcg_wb_domain_size_changed(memcg);
	return nbytes;
}

/* Print the low/high/max/oom/oom_kill counters from @events. */
static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
{
	seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
	seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
	seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
	seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
	seq_printf(m, "oom_kill %lu\n",
		   atomic_long_read(&events[MEMCG_OOM_KILL]));
}

/* "events" file: counters from memcg->memory_events. */
static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events);
	return 0;
}

/* "events.local" file: counters from memcg->memory_events_local. */
static int memory_events_local_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	__memory_events_show(m, memcg->memory_events_local);
	return 0;
}

/*
 * "stat" file: emit the buffer built by memory_stat_format() and free it.
 * Returns -ENOMEM if no buffer could be obtained.
 */
static int memory_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	char *buf;

	buf = memory_stat_format(memcg);
	if (!buf)
		return -ENOMEM;
	seq_puts(m, buf);
	kfree(buf);
	return 0;
}

#ifdef CONFIG_NUMA
/*
 * "numa_stat" file: one line per node-level entry of memory_stats[], with
 * an " N<nid>=<value>" field for every node that has memory.
 */
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
	int i;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
		int nid;

		/* Skip entries that are not node stat items */
		if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
			continue;

		seq_printf(m, "%s", memory_stats[i].name);
		for_each_node_state(nid, N_MEMORY) {
			u64 size;
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
			size = lruvec_page_state(lruvec, memory_stats[i].idx);
			size *= memory_stats[i].ratio;
			seq_printf(m, " N%d=%llu", nid, size);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif

/* "oom.group" file, read side. */
static int memory_oom_group_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	seq_printf(m, "%d\n", memcg->oom_group);

	return 0;
}

/*
 * "oom.group" file, write side: only 0 and 1 are accepted.  Returns
 * @nbytes on success, -EINVAL for any other value, or the kstrtoint()
 * error.
 */
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int ret, oom_group;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	ret = kstrtoint(buf, 0, &oom_group);
	if (ret)
		return ret;

	if (oom_group != 0 && oom_group != 1)
		return -EINVAL;

	memcg->oom_group = oom_group;

	return nbytes;
}

/* Control files registered as .dfl_cftypes of memory_cgrp_subsys below. */
static struct cftype memory_files[] = {
	{
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		.name = "min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_min_show,
		.write = memory_min_write,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		.name = "events.local",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_local_file),
		.seq_show = memory_events_local_show,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memory_numa_stat_show,
	},
#endif
	{
		.name = "oom.group",
		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
		.seq_show = memory_oom_group_show,
		.write = memory_oom_group_write,
	},
	{ }	/* terminate */
};

/* The memory controller's cgroup subsystem descriptor. */
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/*
 * This function calculates an individual cgroup's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2.
To enable safe delegation of the protection configuration, at 6520 * subsequent levels the effective protection is capped to the 6521 * parent's effective protection. 6522 * 6523 * 3. To make complex and dynamic subtrees easier to configure, the 6524 * user is allowed to overcommit the declared protection at a given 6525 * level. If that is the case, the parent's effective protection is 6526 * distributed to the children in proportion to how much protection 6527 * they have declared and how much of it they are utilizing. 6528 * 6529 * This makes distribution proportional, but also work-conserving: 6530 * if one cgroup claims much more protection than it uses memory, 6531 * the unused remainder is available to its siblings. 6532 * 6533 * 4. Conversely, when the declared protection is undercommitted at a 6534 * given level, the distribution of the larger parental protection 6535 * budget is NOT proportional. A cgroup's protection from a sibling 6536 * is capped to its own memory.min/low setting. 6537 * 6538 * 5. However, to allow protecting recursive subtrees from each other 6539 * without having to declare each individual cgroup's fixed share 6540 * of the ancestor's claim to protection, any unutilized - 6541 * "floating" - protection from up the tree is distributed in 6542 * proportion to each cgroup's *usage*. This makes the protection 6543 * neutral wrt sibling cgroups and lets them compete freely over 6544 * the shared parental protection budget, but it protects the 6545 * subtree as a whole from neighboring subtrees. 6546 * 6547 * Note that 4. and 5. are not in conflict: 4. is about protecting 6548 * against immediate siblings whereas 5. is about protecting against 6549 * neighboring subtrees. 
6550 */ 6551 static unsigned long effective_protection(unsigned long usage, 6552 unsigned long parent_usage, 6553 unsigned long setting, 6554 unsigned long parent_effective, 6555 unsigned long siblings_protected) 6556 { 6557 unsigned long protected; 6558 unsigned long ep; 6559 6560 protected = min(usage, setting); 6561 /* 6562 * If all cgroups at this level combined claim and use more 6563 * protection then what the parent affords them, distribute 6564 * shares in proportion to utilization. 6565 * 6566 * We are using actual utilization rather than the statically 6567 * claimed protection in order to be work-conserving: claimed 6568 * but unused protection is available to siblings that would 6569 * otherwise get a smaller chunk than what they claimed. 6570 */ 6571 if (siblings_protected > parent_effective) 6572 return protected * parent_effective / siblings_protected; 6573 6574 /* 6575 * Ok, utilized protection of all children is within what the 6576 * parent affords them, so we know whatever this child claims 6577 * and utilizes is effectively protected. 6578 * 6579 * If there is unprotected usage beyond this value, reclaim 6580 * will apply pressure in proportion to that amount. 6581 * 6582 * If there is unutilized protection, the cgroup will be fully 6583 * shielded from reclaim, but we do return a smaller value for 6584 * protection than what the group could enjoy in theory. This 6585 * is okay. With the overcommit distribution above, effective 6586 * protection is always dependent on how memory is actually 6587 * consumed among the siblings anyway. 6588 */ 6589 ep = protected; 6590 6591 /* 6592 * If the children aren't claiming (all of) the protection 6593 * afforded to them by the parent, distribute the remainder in 6594 * proportion to the (unprotected) memory of each cgroup. That 6595 * way, cgroups that aren't explicitly prioritized wrt each 6596 * other compete freely over the allowance, but they are 6597 * collectively protected from neighboring trees. 
6598 * 6599 * We're using unprotected memory for the weight so that if 6600 * some cgroups DO claim explicit protection, we don't protect 6601 * the same bytes twice. 6602 * 6603 * Check both usage and parent_usage against the respective 6604 * protected values. One should imply the other, but they 6605 * aren't read atomically - make sure the division is sane. 6606 */ 6607 if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) 6608 return ep; 6609 if (parent_effective > siblings_protected && 6610 parent_usage > siblings_protected && 6611 usage > protected) { 6612 unsigned long unclaimed; 6613 6614 unclaimed = parent_effective - siblings_protected; 6615 unclaimed *= usage - protected; 6616 unclaimed /= parent_usage - siblings_protected; 6617 6618 ep += unclaimed; 6619 } 6620 6621 return ep; 6622 } 6623 6624 /** 6625 * mem_cgroup_protected - check if memory consumption is in the normal range 6626 * @root: the top ancestor of the sub-tree being checked 6627 * @memcg: the memory cgroup to check 6628 * 6629 * WARNING: This function is not stateless! It can only be used as part 6630 * of a top-down tree iteration, not for isolated queries. 6631 */ 6632 void mem_cgroup_calculate_protection(struct mem_cgroup *root, 6633 struct mem_cgroup *memcg) 6634 { 6635 unsigned long usage, parent_usage; 6636 struct mem_cgroup *parent; 6637 6638 if (mem_cgroup_disabled()) 6639 return; 6640 6641 if (!root) 6642 root = root_mem_cgroup; 6643 6644 /* 6645 * Effective values of the reclaim targets are ignored so they 6646 * can be stale. Have a look at mem_cgroup_protection for more 6647 * details. 6648 * TODO: calculation should be more robust so that we do not need 6649 * that special casing. 
6650 */ 6651 if (memcg == root) 6652 return; 6653 6654 usage = page_counter_read(&memcg->memory); 6655 if (!usage) 6656 return; 6657 6658 parent = parent_mem_cgroup(memcg); 6659 /* No parent means a non-hierarchical mode on v1 memcg */ 6660 if (!parent) 6661 return; 6662 6663 if (parent == root) { 6664 memcg->memory.emin = READ_ONCE(memcg->memory.min); 6665 memcg->memory.elow = READ_ONCE(memcg->memory.low); 6666 return; 6667 } 6668 6669 parent_usage = page_counter_read(&parent->memory); 6670 6671 WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, 6672 READ_ONCE(memcg->memory.min), 6673 READ_ONCE(parent->memory.emin), 6674 atomic_long_read(&parent->memory.children_min_usage))); 6675 6676 WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, 6677 READ_ONCE(memcg->memory.low), 6678 READ_ONCE(parent->memory.elow), 6679 atomic_long_read(&parent->memory.children_low_usage))); 6680 } 6681 6682 /** 6683 * mem_cgroup_charge - charge a newly allocated page to a cgroup 6684 * @page: page to charge 6685 * @mm: mm context of the victim 6686 * @gfp_mask: reclaim mode 6687 * 6688 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6689 * pages according to @gfp_mask if necessary. 6690 * 6691 * Returns 0 on success. Otherwise, an error code is returned. 6692 */ 6693 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 6694 { 6695 unsigned int nr_pages = thp_nr_pages(page); 6696 struct mem_cgroup *memcg = NULL; 6697 int ret = 0; 6698 6699 if (mem_cgroup_disabled()) 6700 goto out; 6701 6702 if (PageSwapCache(page)) { 6703 swp_entry_t ent = { .val = page_private(page), }; 6704 unsigned short id; 6705 6706 /* 6707 * Every swap fault against a single page tries to charge the 6708 * page, bail as early as possible. shmem_unuse() encounters 6709 * already charged pages, too. 
page->mem_cgroup is protected 6710 * by the page lock, which serializes swap cache removal, which 6711 * in turn serializes uncharging. 6712 */ 6713 VM_BUG_ON_PAGE(!PageLocked(page), page); 6714 if (compound_head(page)->mem_cgroup) 6715 goto out; 6716 6717 id = lookup_swap_cgroup_id(ent); 6718 rcu_read_lock(); 6719 memcg = mem_cgroup_from_id(id); 6720 if (memcg && !css_tryget_online(&memcg->css)) 6721 memcg = NULL; 6722 rcu_read_unlock(); 6723 } 6724 6725 if (!memcg) 6726 memcg = get_mem_cgroup_from_mm(mm); 6727 6728 ret = try_charge(memcg, gfp_mask, nr_pages); 6729 if (ret) 6730 goto out_put; 6731 6732 css_get(&memcg->css); 6733 commit_charge(page, memcg); 6734 6735 local_irq_disable(); 6736 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6737 memcg_check_events(memcg, page); 6738 local_irq_enable(); 6739 6740 if (PageSwapCache(page)) { 6741 swp_entry_t entry = { .val = page_private(page) }; 6742 /* 6743 * The swap entry might not get freed for a long time, 6744 * let's not wait for it. The page already received a 6745 * memory+swap charge, drop the swap entry duplicate. 
6746 */ 6747 mem_cgroup_uncharge_swap(entry, nr_pages); 6748 } 6749 6750 out_put: 6751 css_put(&memcg->css); 6752 out: 6753 return ret; 6754 } 6755 6756 struct uncharge_gather { 6757 struct mem_cgroup *memcg; 6758 unsigned long nr_pages; 6759 unsigned long pgpgout; 6760 unsigned long nr_kmem; 6761 struct page *dummy_page; 6762 }; 6763 6764 static inline void uncharge_gather_clear(struct uncharge_gather *ug) 6765 { 6766 memset(ug, 0, sizeof(*ug)); 6767 } 6768 6769 static void uncharge_batch(const struct uncharge_gather *ug) 6770 { 6771 unsigned long flags; 6772 6773 if (!mem_cgroup_is_root(ug->memcg)) { 6774 page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); 6775 if (do_memsw_account()) 6776 page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); 6777 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) 6778 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); 6779 memcg_oom_recover(ug->memcg); 6780 } 6781 6782 local_irq_save(flags); 6783 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); 6784 __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); 6785 memcg_check_events(ug->memcg, ug->dummy_page); 6786 local_irq_restore(flags); 6787 6788 /* drop reference from uncharge_page */ 6789 css_put(&ug->memcg->css); 6790 } 6791 6792 static void uncharge_page(struct page *page, struct uncharge_gather *ug) 6793 { 6794 unsigned long nr_pages; 6795 6796 VM_BUG_ON_PAGE(PageLRU(page), page); 6797 6798 if (!page->mem_cgroup) 6799 return; 6800 6801 /* 6802 * Nobody should be changing or seriously looking at 6803 * page->mem_cgroup at this point, we have fully 6804 * exclusive access to the page. 
6805 */ 6806 6807 if (ug->memcg != page->mem_cgroup) { 6808 if (ug->memcg) { 6809 uncharge_batch(ug); 6810 uncharge_gather_clear(ug); 6811 } 6812 ug->memcg = page->mem_cgroup; 6813 6814 /* pairs with css_put in uncharge_batch */ 6815 css_get(&ug->memcg->css); 6816 } 6817 6818 nr_pages = compound_nr(page); 6819 ug->nr_pages += nr_pages; 6820 6821 if (!PageKmemcg(page)) { 6822 ug->pgpgout++; 6823 } else { 6824 ug->nr_kmem += nr_pages; 6825 __ClearPageKmemcg(page); 6826 } 6827 6828 ug->dummy_page = page; 6829 page->mem_cgroup = NULL; 6830 css_put(&ug->memcg->css); 6831 } 6832 6833 static void uncharge_list(struct list_head *page_list) 6834 { 6835 struct uncharge_gather ug; 6836 struct list_head *next; 6837 6838 uncharge_gather_clear(&ug); 6839 6840 /* 6841 * Note that the list can be a single page->lru; hence the 6842 * do-while loop instead of a simple list_for_each_entry(). 6843 */ 6844 next = page_list->next; 6845 do { 6846 struct page *page; 6847 6848 page = list_entry(next, struct page, lru); 6849 next = page->lru.next; 6850 6851 uncharge_page(page, &ug); 6852 } while (next != page_list); 6853 6854 if (ug.memcg) 6855 uncharge_batch(&ug); 6856 } 6857 6858 /** 6859 * mem_cgroup_uncharge - uncharge a page 6860 * @page: page to uncharge 6861 * 6862 * Uncharge a page previously charged with mem_cgroup_charge(). 6863 */ 6864 void mem_cgroup_uncharge(struct page *page) 6865 { 6866 struct uncharge_gather ug; 6867 6868 if (mem_cgroup_disabled()) 6869 return; 6870 6871 /* Don't touch page->lru of any random page, pre-check: */ 6872 if (!page->mem_cgroup) 6873 return; 6874 6875 uncharge_gather_clear(&ug); 6876 uncharge_page(page, &ug); 6877 uncharge_batch(&ug); 6878 } 6879 6880 /** 6881 * mem_cgroup_uncharge_list - uncharge a list of page 6882 * @page_list: list of pages to uncharge 6883 * 6884 * Uncharge a list of pages previously charged with 6885 * mem_cgroup_charge(). 
6886 */ 6887 void mem_cgroup_uncharge_list(struct list_head *page_list) 6888 { 6889 if (mem_cgroup_disabled()) 6890 return; 6891 6892 if (!list_empty(page_list)) 6893 uncharge_list(page_list); 6894 } 6895 6896 /** 6897 * mem_cgroup_migrate - charge a page's replacement 6898 * @oldpage: currently circulating page 6899 * @newpage: replacement page 6900 * 6901 * Charge @newpage as a replacement page for @oldpage. @oldpage will 6902 * be uncharged upon free. 6903 * 6904 * Both pages must be locked, @newpage->mapping must be set up. 6905 */ 6906 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) 6907 { 6908 struct mem_cgroup *memcg; 6909 unsigned int nr_pages; 6910 unsigned long flags; 6911 6912 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6913 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6914 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6915 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6916 newpage); 6917 6918 if (mem_cgroup_disabled()) 6919 return; 6920 6921 /* Page cache replacement: new page already charged? */ 6922 if (newpage->mem_cgroup) 6923 return; 6924 6925 memcg = oldpage->mem_cgroup; 6926 if (!memcg) 6927 return; 6928 6929 /* Force-charge the new page. 
The old one will be freed soon */ 6930 nr_pages = thp_nr_pages(newpage); 6931 6932 page_counter_charge(&memcg->memory, nr_pages); 6933 if (do_memsw_account()) 6934 page_counter_charge(&memcg->memsw, nr_pages); 6935 6936 css_get(&memcg->css); 6937 commit_charge(newpage, memcg); 6938 6939 local_irq_save(flags); 6940 mem_cgroup_charge_statistics(memcg, newpage, nr_pages); 6941 memcg_check_events(memcg, newpage); 6942 local_irq_restore(flags); 6943 } 6944 6945 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); 6946 EXPORT_SYMBOL(memcg_sockets_enabled_key); 6947 6948 void mem_cgroup_sk_alloc(struct sock *sk) 6949 { 6950 struct mem_cgroup *memcg; 6951 6952 if (!mem_cgroup_sockets_enabled) 6953 return; 6954 6955 /* Do not associate the sock with unrelated interrupted task's memcg. */ 6956 if (in_interrupt()) 6957 return; 6958 6959 rcu_read_lock(); 6960 memcg = mem_cgroup_from_task(current); 6961 if (memcg == root_mem_cgroup) 6962 goto out; 6963 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) 6964 goto out; 6965 if (css_tryget(&memcg->css)) 6966 sk->sk_memcg = memcg; 6967 out: 6968 rcu_read_unlock(); 6969 } 6970 6971 void mem_cgroup_sk_free(struct sock *sk) 6972 { 6973 if (sk->sk_memcg) 6974 css_put(&sk->sk_memcg->css); 6975 } 6976 6977 /** 6978 * mem_cgroup_charge_skmem - charge socket memory 6979 * @memcg: memcg to charge 6980 * @nr_pages: number of pages to charge 6981 * 6982 * Charges @nr_pages to @memcg. Returns %true if the charge fit within 6983 * @memcg's configured limit, %false if the charge had to be forced. 
6984 */ 6985 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 6986 { 6987 gfp_t gfp_mask = GFP_KERNEL; 6988 6989 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 6990 struct page_counter *fail; 6991 6992 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 6993 memcg->tcpmem_pressure = 0; 6994 return true; 6995 } 6996 page_counter_charge(&memcg->tcpmem, nr_pages); 6997 memcg->tcpmem_pressure = 1; 6998 return false; 6999 } 7000 7001 /* Don't block in the packet receive path */ 7002 if (in_softirq()) 7003 gfp_mask = GFP_NOWAIT; 7004 7005 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); 7006 7007 if (try_charge(memcg, gfp_mask, nr_pages) == 0) 7008 return true; 7009 7010 try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); 7011 return false; 7012 } 7013 7014 /** 7015 * mem_cgroup_uncharge_skmem - uncharge socket memory 7016 * @memcg: memcg to uncharge 7017 * @nr_pages: number of pages to uncharge 7018 */ 7019 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 7020 { 7021 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 7022 page_counter_uncharge(&memcg->tcpmem, nr_pages); 7023 return; 7024 } 7025 7026 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); 7027 7028 refill_stock(memcg, nr_pages); 7029 } 7030 7031 static int __init cgroup_memory(char *s) 7032 { 7033 char *token; 7034 7035 while ((token = strsep(&s, ",")) != NULL) { 7036 if (!*token) 7037 continue; 7038 if (!strcmp(token, "nosocket")) 7039 cgroup_memory_nosocket = true; 7040 if (!strcmp(token, "nokmem")) 7041 cgroup_memory_nokmem = true; 7042 } 7043 return 0; 7044 } 7045 __setup("cgroup.memory=", cgroup_memory); 7046 7047 /* 7048 * subsys_initcall() for memory controller. 
7049 * 7050 * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this 7051 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but 7052 * basically everything that doesn't depend on a specific mem_cgroup structure 7053 * should be initialized from here. 7054 */ 7055 static int __init mem_cgroup_init(void) 7056 { 7057 int cpu, node; 7058 7059 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 7060 memcg_hotplug_cpu_dead); 7061 7062 for_each_possible_cpu(cpu) 7063 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, 7064 drain_local_stock); 7065 7066 for_each_node(node) { 7067 struct mem_cgroup_tree_per_node *rtpn; 7068 7069 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, 7070 node_online(node) ? node : NUMA_NO_NODE); 7071 7072 rtpn->rb_root = RB_ROOT; 7073 rtpn->rb_rightmost = NULL; 7074 spin_lock_init(&rtpn->lock); 7075 soft_limit_tree.rb_tree_per_node[node] = rtpn; 7076 } 7077 7078 return 0; 7079 } 7080 subsys_initcall(mem_cgroup_init); 7081 7082 #ifdef CONFIG_MEMCG_SWAP 7083 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) 7084 { 7085 while (!refcount_inc_not_zero(&memcg->id.ref)) { 7086 /* 7087 * The root cgroup cannot be destroyed, so it's refcount must 7088 * always be >= 1. 7089 */ 7090 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { 7091 VM_BUG_ON(1); 7092 break; 7093 } 7094 memcg = parent_mem_cgroup(memcg); 7095 if (!memcg) 7096 memcg = root_mem_cgroup; 7097 } 7098 return memcg; 7099 } 7100 7101 /** 7102 * mem_cgroup_swapout - transfer a memsw charge to swap 7103 * @page: page whose memsw charge to transfer 7104 * @entry: swap entry to move the charge to 7105 * 7106 * Transfer the memsw charge of @page to @entry. 
7107 */ 7108 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 7109 { 7110 struct mem_cgroup *memcg, *swap_memcg; 7111 unsigned int nr_entries; 7112 unsigned short oldid; 7113 7114 VM_BUG_ON_PAGE(PageLRU(page), page); 7115 VM_BUG_ON_PAGE(page_count(page), page); 7116 7117 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7118 return; 7119 7120 memcg = page->mem_cgroup; 7121 7122 /* Readahead page, never charged */ 7123 if (!memcg) 7124 return; 7125 7126 /* 7127 * In case the memcg owning these pages has been offlined and doesn't 7128 * have an ID allocated to it anymore, charge the closest online 7129 * ancestor for the swap instead and transfer the memory+swap charge. 7130 */ 7131 swap_memcg = mem_cgroup_id_get_online(memcg); 7132 nr_entries = thp_nr_pages(page); 7133 /* Get references for the tail pages, too */ 7134 if (nr_entries > 1) 7135 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7136 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7137 nr_entries); 7138 VM_BUG_ON_PAGE(oldid, page); 7139 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7140 7141 page->mem_cgroup = NULL; 7142 7143 if (!mem_cgroup_is_root(memcg)) 7144 page_counter_uncharge(&memcg->memory, nr_entries); 7145 7146 if (!cgroup_memory_noswap && memcg != swap_memcg) { 7147 if (!mem_cgroup_is_root(swap_memcg)) 7148 page_counter_charge(&swap_memcg->memsw, nr_entries); 7149 page_counter_uncharge(&memcg->memsw, nr_entries); 7150 } 7151 7152 /* 7153 * Interrupts should be disabled here because the caller holds the 7154 * i_pages lock which is taken with interrupts-off. It is 7155 * important here to have the interrupts disabled because it is the 7156 * only synchronisation we have for updating the per-CPU variables. 
7157 */ 7158 VM_BUG_ON(!irqs_disabled()); 7159 mem_cgroup_charge_statistics(memcg, page, -nr_entries); 7160 memcg_check_events(memcg, page); 7161 7162 css_put(&memcg->css); 7163 } 7164 7165 /** 7166 * mem_cgroup_try_charge_swap - try charging swap space for a page 7167 * @page: page being added to swap 7168 * @entry: swap entry to charge 7169 * 7170 * Try to charge @page's memcg for the swap space at @entry. 7171 * 7172 * Returns 0 on success, -ENOMEM on failure. 7173 */ 7174 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 7175 { 7176 unsigned int nr_pages = thp_nr_pages(page); 7177 struct page_counter *counter; 7178 struct mem_cgroup *memcg; 7179 unsigned short oldid; 7180 7181 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7182 return 0; 7183 7184 memcg = page->mem_cgroup; 7185 7186 /* Readahead page, never charged */ 7187 if (!memcg) 7188 return 0; 7189 7190 if (!entry.val) { 7191 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7192 return 0; 7193 } 7194 7195 memcg = mem_cgroup_id_get_online(memcg); 7196 7197 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && 7198 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { 7199 memcg_memory_event(memcg, MEMCG_SWAP_MAX); 7200 memcg_memory_event(memcg, MEMCG_SWAP_FAIL); 7201 mem_cgroup_id_put(memcg); 7202 return -ENOMEM; 7203 } 7204 7205 /* Get references for the tail pages, too */ 7206 if (nr_pages > 1) 7207 mem_cgroup_id_get_many(memcg, nr_pages - 1); 7208 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); 7209 VM_BUG_ON_PAGE(oldid, page); 7210 mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); 7211 7212 return 0; 7213 } 7214 7215 /** 7216 * mem_cgroup_uncharge_swap - uncharge swap space 7217 * @entry: swap entry to uncharge 7218 * @nr_pages: the amount of swap space to uncharge 7219 */ 7220 void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) 7221 { 7222 struct mem_cgroup *memcg; 7223 unsigned short id; 7224 7225 id = swap_cgroup_record(entry, 0, 
nr_pages); 7226 rcu_read_lock(); 7227 memcg = mem_cgroup_from_id(id); 7228 if (memcg) { 7229 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { 7230 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7231 page_counter_uncharge(&memcg->swap, nr_pages); 7232 else 7233 page_counter_uncharge(&memcg->memsw, nr_pages); 7234 } 7235 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); 7236 mem_cgroup_id_put_many(memcg, nr_pages); 7237 } 7238 rcu_read_unlock(); 7239 } 7240 7241 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) 7242 { 7243 long nr_swap_pages = get_nr_swap_pages(); 7244 7245 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7246 return nr_swap_pages; 7247 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) 7248 nr_swap_pages = min_t(long, nr_swap_pages, 7249 READ_ONCE(memcg->swap.max) - 7250 page_counter_read(&memcg->swap)); 7251 return nr_swap_pages; 7252 } 7253 7254 bool mem_cgroup_swap_full(struct page *page) 7255 { 7256 struct mem_cgroup *memcg; 7257 7258 VM_BUG_ON_PAGE(!PageLocked(page), page); 7259 7260 if (vm_swap_full()) 7261 return true; 7262 if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7263 return false; 7264 7265 memcg = page->mem_cgroup; 7266 if (!memcg) 7267 return false; 7268 7269 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { 7270 unsigned long usage = page_counter_read(&memcg->swap); 7271 7272 if (usage * 2 >= READ_ONCE(memcg->swap.high) || 7273 usage * 2 >= READ_ONCE(memcg->swap.max)) 7274 return true; 7275 } 7276 7277 return false; 7278 } 7279 7280 static int __init setup_swap_account(char *s) 7281 { 7282 if (!strcmp(s, "1")) 7283 cgroup_memory_noswap = false; 7284 else if (!strcmp(s, "0")) 7285 cgroup_memory_noswap = true; 7286 return 1; 7287 } 7288 __setup("swapaccount=", setup_swap_account); 7289 7290 static u64 swap_current_read(struct cgroup_subsys_state *css, 7291 struct cftype *cft) 7292 { 7293 struct mem_cgroup *memcg = 
mem_cgroup_from_css(css); 7294 7295 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 7296 } 7297 7298 static int swap_high_show(struct seq_file *m, void *v) 7299 { 7300 return seq_puts_memcg_tunable(m, 7301 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); 7302 } 7303 7304 static ssize_t swap_high_write(struct kernfs_open_file *of, 7305 char *buf, size_t nbytes, loff_t off) 7306 { 7307 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7308 unsigned long high; 7309 int err; 7310 7311 buf = strstrip(buf); 7312 err = page_counter_memparse(buf, "max", &high); 7313 if (err) 7314 return err; 7315 7316 page_counter_set_high(&memcg->swap, high); 7317 7318 return nbytes; 7319 } 7320 7321 static int swap_max_show(struct seq_file *m, void *v) 7322 { 7323 return seq_puts_memcg_tunable(m, 7324 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); 7325 } 7326 7327 static ssize_t swap_max_write(struct kernfs_open_file *of, 7328 char *buf, size_t nbytes, loff_t off) 7329 { 7330 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 7331 unsigned long max; 7332 int err; 7333 7334 buf = strstrip(buf); 7335 err = page_counter_memparse(buf, "max", &max); 7336 if (err) 7337 return err; 7338 7339 xchg(&memcg->swap.max, max); 7340 7341 return nbytes; 7342 } 7343 7344 static int swap_events_show(struct seq_file *m, void *v) 7345 { 7346 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 7347 7348 seq_printf(m, "high %lu\n", 7349 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); 7350 seq_printf(m, "max %lu\n", 7351 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); 7352 seq_printf(m, "fail %lu\n", 7353 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); 7354 7355 return 0; 7356 } 7357 7358 static struct cftype swap_files[] = { 7359 { 7360 .name = "swap.current", 7361 .flags = CFTYPE_NOT_ON_ROOT, 7362 .read_u64 = swap_current_read, 7363 }, 7364 { 7365 .name = "swap.high", 7366 .flags = CFTYPE_NOT_ON_ROOT, 7367 .seq_show = swap_high_show, 7368 .write = 
swap_high_write, 7369 }, 7370 { 7371 .name = "swap.max", 7372 .flags = CFTYPE_NOT_ON_ROOT, 7373 .seq_show = swap_max_show, 7374 .write = swap_max_write, 7375 }, 7376 { 7377 .name = "swap.events", 7378 .flags = CFTYPE_NOT_ON_ROOT, 7379 .file_offset = offsetof(struct mem_cgroup, swap_events_file), 7380 .seq_show = swap_events_show, 7381 }, 7382 { } /* terminate */ 7383 }; 7384 7385 static struct cftype memsw_files[] = { 7386 { 7387 .name = "memsw.usage_in_bytes", 7388 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 7389 .read_u64 = mem_cgroup_read_u64, 7390 }, 7391 { 7392 .name = "memsw.max_usage_in_bytes", 7393 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 7394 .write = mem_cgroup_reset, 7395 .read_u64 = mem_cgroup_read_u64, 7396 }, 7397 { 7398 .name = "memsw.limit_in_bytes", 7399 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 7400 .write = mem_cgroup_write, 7401 .read_u64 = mem_cgroup_read_u64, 7402 }, 7403 { 7404 .name = "memsw.failcnt", 7405 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 7406 .write = mem_cgroup_reset, 7407 .read_u64 = mem_cgroup_read_u64, 7408 }, 7409 { }, /* terminate */ 7410 }; 7411 7412 /* 7413 * If mem_cgroup_swap_init() is implemented as a subsys_initcall() 7414 * instead of a core_initcall(), this could mean cgroup_memory_noswap still 7415 * remains set to false even when memcg is disabled via "cgroup_disable=memory" 7416 * boot parameter. This may result in premature OOPS inside 7417 * mem_cgroup_get_nr_swap_pages() function in corner cases. 
7418 */ 7419 static int __init mem_cgroup_swap_init(void) 7420 { 7421 /* No memory control -> no swap control */ 7422 if (mem_cgroup_disabled()) 7423 cgroup_memory_noswap = true; 7424 7425 if (cgroup_memory_noswap) 7426 return 0; 7427 7428 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); 7429 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); 7430 7431 return 0; 7432 } 7433 core_initcall(mem_cgroup_swap_init); 7434 7435 #endif /* CONFIG_MEMCG_SWAP */ 7436