1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include <linux/memcontrol.h> 4 #include <linux/swap.h> 5 #include <linux/mm_inline.h> 6 #include <linux/pagewalk.h> 7 #include <linux/backing-dev.h> 8 #include <linux/swap_cgroup.h> 9 #include <linux/eventfd.h> 10 #include <linux/poll.h> 11 #include <linux/sort.h> 12 #include <linux/file.h> 13 #include <linux/seq_buf.h> 14 15 #include "internal.h" 16 #include "swap.h" 17 #include "memcontrol-v1.h" 18 19 /* 20 * Cgroups above their limits are maintained in a RB-Tree, independent of 21 * their hierarchy representation 22 */ 23 24 struct mem_cgroup_tree_per_node { 25 struct rb_root rb_root; 26 struct rb_node *rb_rightmost; 27 spinlock_t lock; 28 }; 29 30 struct mem_cgroup_tree { 31 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 32 }; 33 34 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 35 36 /* 37 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft 38 * limit reclaim to prevent infinite loops, if they ever occur. 39 */ 40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 42 43 /* for OOM */ 44 struct mem_cgroup_eventfd_list { 45 struct list_head list; 46 struct eventfd_ctx *eventfd; 47 }; 48 49 /* 50 * cgroup_event represents events which userspace want to receive. 51 */ 52 struct mem_cgroup_event { 53 /* 54 * memcg which the event belongs to. 55 */ 56 struct mem_cgroup *memcg; 57 /* 58 * eventfd to signal userspace about the event. 59 */ 60 struct eventfd_ctx *eventfd; 61 /* 62 * Each of these stored in a list by the cgroup. 63 */ 64 struct list_head list; 65 /* 66 * register_event() callback will be used to add new userspace 67 * waiter for changes related to this event. Use eventfd_signal() 68 * on eventfd to send notification to userspace. 
69 */ 70 int (*register_event)(struct mem_cgroup *memcg, 71 struct eventfd_ctx *eventfd, const char *args); 72 /* 73 * unregister_event() callback will be called when userspace closes 74 * the eventfd or on cgroup removing. This callback must be set, 75 * if you want provide notification functionality. 76 */ 77 void (*unregister_event)(struct mem_cgroup *memcg, 78 struct eventfd_ctx *eventfd); 79 /* 80 * All fields below needed to unregister event when 81 * userspace closes eventfd. 82 */ 83 poll_table pt; 84 wait_queue_head_t *wqh; 85 wait_queue_entry_t wait; 86 struct work_struct remove; 87 }; 88 89 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 90 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 91 #define MEMFILE_ATTR(val) ((val) & 0xffff) 92 93 enum { 94 RES_USAGE, 95 RES_LIMIT, 96 RES_MAX_USAGE, 97 RES_FAILCNT, 98 RES_SOFT_LIMIT, 99 }; 100 101 #ifdef CONFIG_LOCKDEP 102 static struct lockdep_map memcg_oom_lock_dep_map = { 103 .name = "memcg_oom_lock", 104 }; 105 #endif 106 107 DEFINE_SPINLOCK(memcg_oom_lock); 108 109 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 110 struct mem_cgroup_tree_per_node *mctz, 111 unsigned long new_usage_in_excess) 112 { 113 struct rb_node **p = &mctz->rb_root.rb_node; 114 struct rb_node *parent = NULL; 115 struct mem_cgroup_per_node *mz_node; 116 bool rightmost = true; 117 118 if (mz->on_tree) 119 return; 120 121 mz->usage_in_excess = new_usage_in_excess; 122 if (!mz->usage_in_excess) 123 return; 124 while (*p) { 125 parent = *p; 126 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 127 tree_node); 128 if (mz->usage_in_excess < mz_node->usage_in_excess) { 129 p = &(*p)->rb_left; 130 rightmost = false; 131 } else { 132 p = &(*p)->rb_right; 133 } 134 } 135 136 if (rightmost) 137 mctz->rb_rightmost = &mz->tree_node; 138 139 rb_link_node(&mz->tree_node, parent, p); 140 rb_insert_color(&mz->tree_node, &mctz->rb_root); 141 mz->on_tree = true; 142 } 143 144 static void 
__mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 145 struct mem_cgroup_tree_per_node *mctz) 146 { 147 if (!mz->on_tree) 148 return; 149 150 if (&mz->tree_node == mctz->rb_rightmost) 151 mctz->rb_rightmost = rb_prev(&mz->tree_node); 152 153 rb_erase(&mz->tree_node, &mctz->rb_root); 154 mz->on_tree = false; 155 } 156 157 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 158 struct mem_cgroup_tree_per_node *mctz) 159 { 160 unsigned long flags; 161 162 spin_lock_irqsave(&mctz->lock, flags); 163 __mem_cgroup_remove_exceeded(mz, mctz); 164 spin_unlock_irqrestore(&mctz->lock, flags); 165 } 166 167 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 168 { 169 unsigned long nr_pages = page_counter_read(&memcg->memory); 170 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 171 unsigned long excess = 0; 172 173 if (nr_pages > soft_limit) 174 excess = nr_pages - soft_limit; 175 176 return excess; 177 } 178 179 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid) 180 { 181 unsigned long excess; 182 struct mem_cgroup_per_node *mz; 183 struct mem_cgroup_tree_per_node *mctz; 184 185 if (lru_gen_enabled()) { 186 if (soft_limit_excess(memcg)) 187 lru_gen_soft_reclaim(memcg, nid); 188 return; 189 } 190 191 mctz = soft_limit_tree.rb_tree_per_node[nid]; 192 if (!mctz) 193 return; 194 /* 195 * Necessary to update all ancestors when hierarchy is used. 196 * because their event counter is not touched. 197 */ 198 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 199 mz = memcg->nodeinfo[nid]; 200 excess = soft_limit_excess(memcg); 201 /* 202 * We have to update the tree if mz is on RB-tree or 203 * mem is over its softlimit. 204 */ 205 if (excess || mz->on_tree) { 206 unsigned long flags; 207 208 spin_lock_irqsave(&mctz->lock, flags); 209 /* if on-tree, remove it */ 210 if (mz->on_tree) 211 __mem_cgroup_remove_exceeded(mz, mctz); 212 /* 213 * Insert again. mz->usage_in_excess will be updated. 
214 * If excess is 0, no tree ops. 215 */ 216 __mem_cgroup_insert_exceeded(mz, mctz, excess); 217 spin_unlock_irqrestore(&mctz->lock, flags); 218 } 219 } 220 } 221 222 void memcg1_remove_from_trees(struct mem_cgroup *memcg) 223 { 224 struct mem_cgroup_tree_per_node *mctz; 225 struct mem_cgroup_per_node *mz; 226 int nid; 227 228 for_each_node(nid) { 229 mz = memcg->nodeinfo[nid]; 230 mctz = soft_limit_tree.rb_tree_per_node[nid]; 231 if (mctz) 232 mem_cgroup_remove_exceeded(mz, mctz); 233 } 234 } 235 236 static struct mem_cgroup_per_node * 237 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 238 { 239 struct mem_cgroup_per_node *mz; 240 241 retry: 242 mz = NULL; 243 if (!mctz->rb_rightmost) 244 goto done; /* Nothing to reclaim from */ 245 246 mz = rb_entry(mctz->rb_rightmost, 247 struct mem_cgroup_per_node, tree_node); 248 /* 249 * Remove the node now but someone else can add it back, 250 * we will to add it back at the end of reclaim to its correct 251 * position in the tree. 
252 */ 253 __mem_cgroup_remove_exceeded(mz, mctz); 254 if (!soft_limit_excess(mz->memcg) || 255 !css_tryget(&mz->memcg->css)) 256 goto retry; 257 done: 258 return mz; 259 } 260 261 static struct mem_cgroup_per_node * 262 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 263 { 264 struct mem_cgroup_per_node *mz; 265 266 spin_lock_irq(&mctz->lock); 267 mz = __mem_cgroup_largest_soft_limit_node(mctz); 268 spin_unlock_irq(&mctz->lock); 269 return mz; 270 } 271 272 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 273 pg_data_t *pgdat, 274 gfp_t gfp_mask, 275 unsigned long *total_scanned) 276 { 277 struct mem_cgroup *victim = NULL; 278 int total = 0; 279 int loop = 0; 280 unsigned long excess; 281 unsigned long nr_scanned; 282 struct mem_cgroup_reclaim_cookie reclaim = { 283 .pgdat = pgdat, 284 }; 285 286 excess = soft_limit_excess(root_memcg); 287 288 while (1) { 289 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 290 if (!victim) { 291 loop++; 292 if (loop >= 2) { 293 /* 294 * If we have not been able to reclaim 295 * anything, it might because there are 296 * no reclaimable pages under this hierarchy 297 */ 298 if (!total) 299 break; 300 /* 301 * We want to do more targeted reclaim. 
302 * excess >> 2 is not to excessive so as to 303 * reclaim too much, nor too less that we keep 304 * coming back to reclaim from this cgroup 305 */ 306 if (total >= (excess >> 2) || 307 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 308 break; 309 } 310 continue; 311 } 312 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 313 pgdat, &nr_scanned); 314 *total_scanned += nr_scanned; 315 if (!soft_limit_excess(root_memcg)) 316 break; 317 } 318 mem_cgroup_iter_break(root_memcg, victim); 319 return total; 320 } 321 322 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, 323 gfp_t gfp_mask, 324 unsigned long *total_scanned) 325 { 326 unsigned long nr_reclaimed = 0; 327 struct mem_cgroup_per_node *mz, *next_mz = NULL; 328 unsigned long reclaimed; 329 int loop = 0; 330 struct mem_cgroup_tree_per_node *mctz; 331 unsigned long excess; 332 333 if (lru_gen_enabled()) 334 return 0; 335 336 if (order > 0) 337 return 0; 338 339 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 340 341 /* 342 * Do not even bother to check the largest node if the root 343 * is empty. Do it lockless to prevent lock bouncing. Races 344 * are acceptable as soft limit is best effort anyway. 
345 */ 346 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 347 return 0; 348 349 /* 350 * This loop can run a while, specially if mem_cgroup's continuously 351 * keep exceeding their soft limit and putting the system under 352 * pressure 353 */ 354 do { 355 if (next_mz) 356 mz = next_mz; 357 else 358 mz = mem_cgroup_largest_soft_limit_node(mctz); 359 if (!mz) 360 break; 361 362 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 363 gfp_mask, total_scanned); 364 nr_reclaimed += reclaimed; 365 spin_lock_irq(&mctz->lock); 366 367 /* 368 * If we failed to reclaim anything from this memory cgroup 369 * it is time to move on to the next cgroup 370 */ 371 next_mz = NULL; 372 if (!reclaimed) 373 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 374 375 excess = soft_limit_excess(mz->memcg); 376 /* 377 * One school of thought says that we should not add 378 * back the node to the tree if reclaim returns 0. 379 * But our reclaim could return 0, simply because due 380 * to priority we are exposing a smaller subset of 381 * memory to reclaim from. Consider this as a longer 382 * term TODO. 383 */ 384 /* If excess == 0, no tree ops */ 385 __mem_cgroup_insert_exceeded(mz, mctz, excess); 386 spin_unlock_irq(&mctz->lock); 387 css_put(&mz->memcg->css); 388 loop++; 389 /* 390 * Could not reclaim anything and there are no more 391 * mem cgroups to try or we seem to be looping without 392 * reclaiming anything. 
393 */ 394 if (!nr_reclaimed && 395 (next_mz == NULL || 396 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 397 break; 398 } while (!nr_reclaimed); 399 if (next_mz) 400 css_put(&next_mz->memcg->css); 401 return nr_reclaimed; 402 } 403 404 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 405 struct cftype *cft) 406 { 407 return 0; 408 } 409 410 #ifdef CONFIG_MMU 411 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 412 struct cftype *cft, u64 val) 413 { 414 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 415 "Please report your usecase to linux-mm@kvack.org if you " 416 "depend on this functionality.\n"); 417 418 if (val != 0) 419 return -EINVAL; 420 return 0; 421 } 422 #else 423 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 424 struct cftype *cft, u64 val) 425 { 426 return -ENOSYS; 427 } 428 #endif 429 430 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 431 { 432 unsigned long val; 433 434 if (mem_cgroup_is_root(memcg)) { 435 /* 436 * Approximate root's usage from global state. This isn't 437 * perfect, but the root usage was always an approximation. 
438 */ 439 val = global_node_page_state(NR_FILE_PAGES) + 440 global_node_page_state(NR_ANON_MAPPED); 441 if (swap) 442 val += total_swap_pages - get_nr_swap_pages(); 443 } else { 444 if (!swap) 445 val = page_counter_read(&memcg->memory); 446 else 447 val = page_counter_read(&memcg->memsw); 448 } 449 return val; 450 } 451 452 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 453 { 454 struct mem_cgroup_threshold_ary *t; 455 unsigned long usage; 456 int i; 457 458 rcu_read_lock(); 459 if (!swap) 460 t = rcu_dereference(memcg->thresholds.primary); 461 else 462 t = rcu_dereference(memcg->memsw_thresholds.primary); 463 464 if (!t) 465 goto unlock; 466 467 usage = mem_cgroup_usage(memcg, swap); 468 469 /* 470 * current_threshold points to threshold just below or equal to usage. 471 * If it's not true, a threshold was crossed after last 472 * call of __mem_cgroup_threshold(). 473 */ 474 i = t->current_threshold; 475 476 /* 477 * Iterate backward over array of thresholds starting from 478 * current_threshold and check if a threshold is crossed. 479 * If none of thresholds below usage is crossed, we read 480 * only one element of the array here. 481 */ 482 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 483 eventfd_signal(t->entries[i].eventfd); 484 485 /* i = current_threshold + 1 */ 486 i++; 487 488 /* 489 * Iterate forward over array of thresholds starting from 490 * current_threshold+1 and check if a threshold is crossed. 491 * If none of thresholds above usage is crossed, we read 492 * only one element of the array here. 
493 */ 494 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 495 eventfd_signal(t->entries[i].eventfd); 496 497 /* Update current_threshold */ 498 t->current_threshold = i - 1; 499 unlock: 500 rcu_read_unlock(); 501 } 502 503 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 504 { 505 while (memcg) { 506 __mem_cgroup_threshold(memcg, false); 507 if (do_memsw_account()) 508 __mem_cgroup_threshold(memcg, true); 509 510 memcg = parent_mem_cgroup(memcg); 511 } 512 } 513 514 /* Cgroup1: threshold notifications & softlimit tree updates */ 515 516 /* 517 * Per memcg event counter is incremented at every pagein/pageout. With THP, 518 * it will be incremented by the number of pages. This counter is used 519 * to trigger some periodic events. This is straightforward and better 520 * than using jiffies etc. to handle periodic memcg event. 521 */ 522 enum mem_cgroup_events_target { 523 MEM_CGROUP_TARGET_THRESH, 524 MEM_CGROUP_TARGET_SOFTLIMIT, 525 MEM_CGROUP_NTARGETS, 526 }; 527 528 struct memcg1_events_percpu { 529 unsigned long nr_page_events; 530 unsigned long targets[MEM_CGROUP_NTARGETS]; 531 }; 532 533 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) 534 { 535 /* pagein of a big page is an event. 
So, ignore page size */ 536 if (nr_pages > 0) 537 count_memcg_events(memcg, PGPGIN, 1); 538 else { 539 count_memcg_events(memcg, PGPGOUT, 1); 540 nr_pages = -nr_pages; /* for event */ 541 } 542 543 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages); 544 } 545 546 #define THRESHOLDS_EVENTS_TARGET 128 547 #define SOFTLIMIT_EVENTS_TARGET 1024 548 549 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg, 550 enum mem_cgroup_events_target target) 551 { 552 unsigned long val, next; 553 554 val = __this_cpu_read(memcg->events_percpu->nr_page_events); 555 next = __this_cpu_read(memcg->events_percpu->targets[target]); 556 /* from time_after() in jiffies.h */ 557 if ((long)(next - val) < 0) { 558 switch (target) { 559 case MEM_CGROUP_TARGET_THRESH: 560 next = val + THRESHOLDS_EVENTS_TARGET; 561 break; 562 case MEM_CGROUP_TARGET_SOFTLIMIT: 563 next = val + SOFTLIMIT_EVENTS_TARGET; 564 break; 565 default: 566 break; 567 } 568 __this_cpu_write(memcg->events_percpu->targets[target], next); 569 return true; 570 } 571 return false; 572 } 573 574 /* 575 * Check events in order. 
576 * 577 */ 578 static void memcg1_check_events(struct mem_cgroup *memcg, int nid) 579 { 580 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 581 return; 582 583 /* threshold event is triggered in finer grain than soft limit */ 584 if (unlikely(memcg1_event_ratelimit(memcg, 585 MEM_CGROUP_TARGET_THRESH))) { 586 bool do_softlimit; 587 588 do_softlimit = memcg1_event_ratelimit(memcg, 589 MEM_CGROUP_TARGET_SOFTLIMIT); 590 mem_cgroup_threshold(memcg); 591 if (unlikely(do_softlimit)) 592 memcg1_update_tree(memcg, nid); 593 } 594 } 595 596 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 597 { 598 unsigned long flags; 599 600 local_irq_save(flags); 601 memcg1_charge_statistics(memcg, folio_nr_pages(folio)); 602 memcg1_check_events(memcg, folio_nid(folio)); 603 local_irq_restore(flags); 604 } 605 606 /** 607 * memcg1_swapout - transfer a memsw charge to swap 608 * @folio: folio whose memsw charge to transfer 609 * @entry: swap entry to move the charge to 610 * 611 * Transfer the memsw charge of @folio to @entry. 612 */ 613 void memcg1_swapout(struct folio *folio, swp_entry_t entry) 614 { 615 struct mem_cgroup *memcg, *swap_memcg; 616 unsigned int nr_entries; 617 618 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 619 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 620 621 if (mem_cgroup_disabled()) 622 return; 623 624 if (!do_memsw_account()) 625 return; 626 627 memcg = folio_memcg(folio); 628 629 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 630 if (!memcg) 631 return; 632 633 /* 634 * In case the memcg owning these pages has been offlined and doesn't 635 * have an ID allocated to it anymore, charge the closest online 636 * ancestor for the swap instead and transfer the memory+swap charge. 
637 */ 638 swap_memcg = mem_cgroup_private_id_get_online(memcg); 639 nr_entries = folio_nr_pages(folio); 640 /* Get references for the tail pages, too */ 641 if (nr_entries > 1) 642 mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); 643 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 644 645 swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); 646 647 folio_unqueue_deferred_split(folio); 648 folio->memcg_data = 0; 649 650 if (!mem_cgroup_is_root(memcg)) 651 page_counter_uncharge(&memcg->memory, nr_entries); 652 653 if (memcg != swap_memcg) { 654 if (!mem_cgroup_is_root(swap_memcg)) 655 page_counter_charge(&swap_memcg->memsw, nr_entries); 656 page_counter_uncharge(&memcg->memsw, nr_entries); 657 } 658 659 /* 660 * Interrupts should be disabled here because the caller holds the 661 * i_pages lock which is taken with interrupts-off. It is 662 * important here to have the interrupts disabled because it is the 663 * only synchronisation we have for updating the per-CPU variables. 664 */ 665 preempt_disable_nested(); 666 VM_WARN_ON_IRQS_ENABLED(); 667 memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); 668 preempt_enable_nested(); 669 memcg1_check_events(memcg, folio_nid(folio)); 670 671 css_put(&memcg->css); 672 } 673 674 /* 675 * memcg1_swapin - uncharge swap slot 676 * @entry: the first swap entry for which the pages are charged 677 * @nr_pages: number of pages which will be uncharged 678 * 679 * Call this function after successfully adding the charged page to swapcache. 680 * 681 * Note: This function assumes the page for which swap slot is being uncharged 682 * is order 0 page. 683 */ 684 void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) 685 { 686 /* 687 * Cgroup1's unified memory+swap counter has been charged with the 688 * new swapcache page, finish the transfer by uncharging the swap 689 * slot. 
The swap slot would also get uncharged when it dies, but 690 * it can stick around indefinitely and we'd count the page twice 691 * the entire time. 692 * 693 * Cgroup2 has separate resource counters for memory and swap, 694 * so this is a non-issue here. Memory and swap charge lifetimes 695 * correspond 1:1 to page and swap slot lifetimes: we charge the 696 * page to memory here, and uncharge swap when the slot is freed. 697 */ 698 if (do_memsw_account()) { 699 /* 700 * The swap entry might not get freed for a long time, 701 * let's not wait for it. The page already received a 702 * memory+swap charge, drop the swap entry duplicate. 703 */ 704 mem_cgroup_uncharge_swap(entry, nr_pages); 705 } 706 } 707 708 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 709 unsigned long nr_memory, int nid) 710 { 711 unsigned long flags; 712 713 local_irq_save(flags); 714 count_memcg_events(memcg, PGPGOUT, pgpgout); 715 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); 716 memcg1_check_events(memcg, nid); 717 local_irq_restore(flags); 718 } 719 720 static int compare_thresholds(const void *a, const void *b) 721 { 722 const struct mem_cgroup_threshold *_a = a; 723 const struct mem_cgroup_threshold *_b = b; 724 725 if (_a->threshold > _b->threshold) 726 return 1; 727 728 if (_a->threshold < _b->threshold) 729 return -1; 730 731 return 0; 732 } 733 734 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 735 { 736 struct mem_cgroup_eventfd_list *ev; 737 738 spin_lock(&memcg_oom_lock); 739 740 list_for_each_entry(ev, &memcg->oom_notify, list) 741 eventfd_signal(ev->eventfd); 742 743 spin_unlock(&memcg_oom_lock); 744 return 0; 745 } 746 747 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 748 { 749 struct mem_cgroup *iter; 750 751 for_each_mem_cgroup_tree(iter, memcg) 752 mem_cgroup_oom_notify_cb(iter); 753 } 754 755 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 756 struct eventfd_ctx *eventfd, const 
char *args, enum res_type type) 757 { 758 struct mem_cgroup_thresholds *thresholds; 759 struct mem_cgroup_threshold_ary *new; 760 unsigned long threshold; 761 unsigned long usage; 762 int i, size, ret; 763 764 ret = page_counter_memparse(args, "-1", &threshold); 765 if (ret) 766 return ret; 767 768 mutex_lock(&memcg->thresholds_lock); 769 770 if (type == _MEM) { 771 thresholds = &memcg->thresholds; 772 usage = mem_cgroup_usage(memcg, false); 773 } else if (type == _MEMSWAP) { 774 thresholds = &memcg->memsw_thresholds; 775 usage = mem_cgroup_usage(memcg, true); 776 } else 777 BUG(); 778 779 /* Check if a threshold crossed before adding a new one */ 780 if (thresholds->primary) 781 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 782 783 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 784 785 /* Allocate memory for new array of thresholds */ 786 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL_ACCOUNT); 787 if (!new) { 788 ret = -ENOMEM; 789 goto unlock; 790 } 791 new->size = size; 792 793 /* Copy thresholds (if any) to new array */ 794 if (thresholds->primary) 795 memcpy(new->entries, thresholds->primary->entries, 796 flex_array_size(new, entries, size - 1)); 797 798 /* Add new threshold */ 799 new->entries[size - 1].eventfd = eventfd; 800 new->entries[size - 1].threshold = threshold; 801 802 /* Sort thresholds. Registering of new threshold isn't time-critical */ 803 sort(new->entries, size, sizeof(*new->entries), 804 compare_thresholds, NULL); 805 806 /* Find current threshold */ 807 new->current_threshold = -1; 808 for (i = 0; i < size; i++) { 809 if (new->entries[i].threshold <= usage) { 810 /* 811 * new->current_threshold will not be used until 812 * rcu_assign_pointer(), so it's safe to increment 813 * it here. 
814 */ 815 ++new->current_threshold; 816 } else 817 break; 818 } 819 820 /* Free old spare buffer and save old primary buffer as spare */ 821 kfree(thresholds->spare); 822 thresholds->spare = thresholds->primary; 823 824 rcu_assign_pointer(thresholds->primary, new); 825 826 /* To be sure that nobody uses thresholds */ 827 synchronize_rcu(); 828 829 unlock: 830 mutex_unlock(&memcg->thresholds_lock); 831 832 return ret; 833 } 834 835 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 836 struct eventfd_ctx *eventfd, const char *args) 837 { 838 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 839 } 840 841 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 842 struct eventfd_ctx *eventfd, const char *args) 843 { 844 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 845 } 846 847 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 848 struct eventfd_ctx *eventfd, enum res_type type) 849 { 850 struct mem_cgroup_thresholds *thresholds; 851 struct mem_cgroup_threshold_ary *new; 852 unsigned long usage; 853 int i, j, size, entries; 854 855 mutex_lock(&memcg->thresholds_lock); 856 857 if (type == _MEM) { 858 thresholds = &memcg->thresholds; 859 usage = mem_cgroup_usage(memcg, false); 860 } else if (type == _MEMSWAP) { 861 thresholds = &memcg->memsw_thresholds; 862 usage = mem_cgroup_usage(memcg, true); 863 } else 864 BUG(); 865 866 if (!thresholds->primary) 867 goto unlock; 868 869 /* Check if a threshold crossed before removing */ 870 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 871 872 /* Calculate new number of threshold */ 873 size = entries = 0; 874 for (i = 0; i < thresholds->primary->size; i++) { 875 if (thresholds->primary->entries[i].eventfd != eventfd) 876 size++; 877 else 878 entries++; 879 } 880 881 new = thresholds->spare; 882 883 /* If no items related to eventfd have been cleared, nothing to do */ 884 if (!entries) 885 goto unlock; 886 887 /* Set 
thresholds array to NULL if we don't have thresholds */ 888 if (!size) { 889 kfree(new); 890 new = NULL; 891 goto swap_buffers; 892 } 893 894 new->size = size; 895 896 /* Copy thresholds and find current threshold */ 897 new->current_threshold = -1; 898 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 899 if (thresholds->primary->entries[i].eventfd == eventfd) 900 continue; 901 902 new->entries[j] = thresholds->primary->entries[i]; 903 if (new->entries[j].threshold <= usage) { 904 /* 905 * new->current_threshold will not be used 906 * until rcu_assign_pointer(), so it's safe to increment 907 * it here. 908 */ 909 ++new->current_threshold; 910 } 911 j++; 912 } 913 914 swap_buffers: 915 /* Swap primary and spare array */ 916 thresholds->spare = thresholds->primary; 917 918 rcu_assign_pointer(thresholds->primary, new); 919 920 /* To be sure that nobody uses thresholds */ 921 synchronize_rcu(); 922 923 /* If all events are unregistered, free the spare array */ 924 if (!new) { 925 kfree(thresholds->spare); 926 thresholds->spare = NULL; 927 } 928 unlock: 929 mutex_unlock(&memcg->thresholds_lock); 930 } 931 932 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 933 struct eventfd_ctx *eventfd) 934 { 935 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 936 } 937 938 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 939 struct eventfd_ctx *eventfd) 940 { 941 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 942 } 943 944 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 945 struct eventfd_ctx *eventfd, const char *args) 946 { 947 struct mem_cgroup_eventfd_list *event; 948 949 event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); 950 if (!event) 951 return -ENOMEM; 952 953 spin_lock(&memcg_oom_lock); 954 955 event->eventfd = eventfd; 956 list_add(&event->list, &memcg->oom_notify); 957 958 /* already in OOM ? 
*/ 959 if (memcg->under_oom) 960 eventfd_signal(eventfd); 961 spin_unlock(&memcg_oom_lock); 962 963 return 0; 964 } 965 966 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 967 struct eventfd_ctx *eventfd) 968 { 969 struct mem_cgroup_eventfd_list *ev, *tmp; 970 971 spin_lock(&memcg_oom_lock); 972 973 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 974 if (ev->eventfd == eventfd) { 975 list_del(&ev->list); 976 kfree(ev); 977 } 978 } 979 980 spin_unlock(&memcg_oom_lock); 981 } 982 983 /* 984 * DO NOT USE IN NEW FILES. 985 * 986 * "cgroup.event_control" implementation. 987 * 988 * This is way over-engineered. It tries to support fully configurable 989 * events for each user. Such level of flexibility is completely 990 * unnecessary especially in the light of the planned unified hierarchy. 991 * 992 * Please deprecate this and replace with something simpler if at all 993 * possible. 994 */ 995 996 /* 997 * Unregister event and free resources. 998 * 999 * Gets called from workqueue. 1000 */ 1001 static void memcg_event_remove(struct work_struct *work) 1002 { 1003 struct mem_cgroup_event *event = 1004 container_of(work, struct mem_cgroup_event, remove); 1005 struct mem_cgroup *memcg = event->memcg; 1006 1007 remove_wait_queue(event->wqh, &event->wait); 1008 1009 event->unregister_event(memcg, event->eventfd); 1010 1011 /* Notify userspace the event is going away. */ 1012 eventfd_signal(event->eventfd); 1013 1014 eventfd_ctx_put(event->eventfd); 1015 kfree(event); 1016 css_put(&memcg->css); 1017 } 1018 1019 /* 1020 * Gets called on EPOLLHUP on eventfd when user closes it. 1021 * 1022 * Called with wqh->lock held and interrupts disabled. 
1023 */ 1024 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, 1025 int sync, void *key) 1026 { 1027 struct mem_cgroup_event *event = 1028 container_of(wait, struct mem_cgroup_event, wait); 1029 struct mem_cgroup *memcg = event->memcg; 1030 __poll_t flags = key_to_poll(key); 1031 1032 if (flags & EPOLLHUP) { 1033 /* 1034 * If the event has been detached at cgroup removal, we 1035 * can simply return knowing the other side will cleanup 1036 * for us. 1037 * 1038 * We can't race against event freeing since the other 1039 * side will require wqh->lock via remove_wait_queue(), 1040 * which we hold. 1041 */ 1042 spin_lock(&memcg->event_list_lock); 1043 if (!list_empty(&event->list)) { 1044 list_del_init(&event->list); 1045 /* 1046 * We are in atomic context, but cgroup_event_remove() 1047 * may sleep, so we have to call it in workqueue. 1048 */ 1049 schedule_work(&event->remove); 1050 } 1051 spin_unlock(&memcg->event_list_lock); 1052 } 1053 1054 return 0; 1055 } 1056 1057 static void memcg_event_ptable_queue_proc(struct file *file, 1058 wait_queue_head_t *wqh, poll_table *pt) 1059 { 1060 struct mem_cgroup_event *event = 1061 container_of(pt, struct mem_cgroup_event, pt); 1062 1063 event->wqh = wqh; 1064 add_wait_queue(wqh, &event->wait); 1065 } 1066 1067 /* 1068 * DO NOT USE IN NEW FILES. 1069 * 1070 * Parse input and register new cgroup event handler. 1071 * 1072 * Input must be in format '<event_fd> <control_fd> <args>'. 1073 * Interpretation of args is defined by control file implementation. 
1074 */ 1075 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 1076 char *buf, size_t nbytes, loff_t off) 1077 { 1078 struct cgroup_subsys_state *css = of_css(of); 1079 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1080 struct mem_cgroup_event *event; 1081 struct cgroup_subsys_state *cfile_css; 1082 unsigned int efd, cfd; 1083 struct dentry *cdentry; 1084 const char *name; 1085 char *endp; 1086 int ret; 1087 1088 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 1089 return -EOPNOTSUPP; 1090 1091 buf = strstrip(buf); 1092 1093 efd = simple_strtoul(buf, &endp, 10); 1094 if (*endp != ' ') 1095 return -EINVAL; 1096 buf = endp + 1; 1097 1098 cfd = simple_strtoul(buf, &endp, 10); 1099 if (*endp == '\0') 1100 buf = endp; 1101 else if (*endp == ' ') 1102 buf = endp + 1; 1103 else 1104 return -EINVAL; 1105 1106 CLASS(fd, efile)(efd); 1107 if (fd_empty(efile)) 1108 return -EBADF; 1109 1110 CLASS(fd, cfile)(cfd); 1111 1112 event = kzalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); 1113 if (!event) 1114 return -ENOMEM; 1115 1116 event->memcg = memcg; 1117 INIT_LIST_HEAD(&event->list); 1118 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 1119 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 1120 INIT_WORK(&event->remove, memcg_event_remove); 1121 1122 event->eventfd = eventfd_ctx_fileget(fd_file(efile)); 1123 if (IS_ERR(event->eventfd)) { 1124 ret = PTR_ERR(event->eventfd); 1125 goto out_kfree; 1126 } 1127 1128 if (fd_empty(cfile)) { 1129 ret = -EBADF; 1130 goto out_put_eventfd; 1131 } 1132 1133 /* the process need read permission on control file */ 1134 /* AV: shouldn't we check that it's been opened for read instead? */ 1135 ret = file_permission(fd_file(cfile), MAY_READ); 1136 if (ret < 0) 1137 goto out_put_eventfd; 1138 1139 /* 1140 * The control file must be a regular cgroup1 file. As a regular cgroup 1141 * file can't be renamed, it's safe to access its name afterwards. 
1142 */ 1143 cdentry = fd_file(cfile)->f_path.dentry; 1144 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 1145 ret = -EINVAL; 1146 goto out_put_eventfd; 1147 } 1148 1149 /* 1150 * Determine the event callbacks and set them in @event. This used 1151 * to be done via struct cftype but cgroup core no longer knows 1152 * about these events. The following is crude but the whole thing 1153 * is for compatibility anyway. 1154 * 1155 * DO NOT ADD NEW FILES. 1156 */ 1157 name = cdentry->d_name.name; 1158 1159 if (!strcmp(name, "memory.usage_in_bytes")) { 1160 event->register_event = mem_cgroup_usage_register_event; 1161 event->unregister_event = mem_cgroup_usage_unregister_event; 1162 } else if (!strcmp(name, "memory.oom_control")) { 1163 pr_warn_once("oom_control is deprecated and will be removed. " 1164 "Please report your usecase to linux-mm-@kvack.org" 1165 " if you depend on this functionality.\n"); 1166 event->register_event = mem_cgroup_oom_register_event; 1167 event->unregister_event = mem_cgroup_oom_unregister_event; 1168 } else if (!strcmp(name, "memory.pressure_level")) { 1169 pr_warn_once("pressure_level is deprecated and will be removed. " 1170 "Please report your usecase to linux-mm-@kvack.org " 1171 "if you depend on this functionality.\n"); 1172 event->register_event = vmpressure_register_event; 1173 event->unregister_event = vmpressure_unregister_event; 1174 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 1175 event->register_event = memsw_cgroup_usage_register_event; 1176 event->unregister_event = memsw_cgroup_usage_unregister_event; 1177 } else { 1178 ret = -EINVAL; 1179 goto out_put_eventfd; 1180 } 1181 1182 /* 1183 * Verify @cfile should belong to @css. Also, remaining events are 1184 * automatically removed on cgroup destruction but the removal is 1185 * asynchronous, so take an extra ref on @css. 
1186 */ 1187 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 1188 &memory_cgrp_subsys); 1189 ret = -EINVAL; 1190 if (IS_ERR(cfile_css)) 1191 goto out_put_eventfd; 1192 if (cfile_css != css) 1193 goto out_put_css; 1194 1195 ret = event->register_event(memcg, event->eventfd, buf); 1196 if (ret) 1197 goto out_put_css; 1198 1199 vfs_poll(fd_file(efile), &event->pt); 1200 1201 spin_lock_irq(&memcg->event_list_lock); 1202 list_add(&event->list, &memcg->event_list); 1203 spin_unlock_irq(&memcg->event_list_lock); 1204 return nbytes; 1205 1206 out_put_css: 1207 css_put(cfile_css); 1208 out_put_eventfd: 1209 eventfd_ctx_put(event->eventfd); 1210 out_kfree: 1211 kfree(event); 1212 return ret; 1213 } 1214 1215 void memcg1_memcg_init(struct mem_cgroup *memcg) 1216 { 1217 INIT_LIST_HEAD(&memcg->oom_notify); 1218 mutex_init(&memcg->thresholds_lock); 1219 INIT_LIST_HEAD(&memcg->event_list); 1220 spin_lock_init(&memcg->event_list_lock); 1221 } 1222 1223 void memcg1_css_offline(struct mem_cgroup *memcg) 1224 { 1225 struct mem_cgroup_event *event, *tmp; 1226 1227 /* 1228 * Unregister events and notify userspace. 1229 * Notify userspace about cgroup removing only after rmdir of cgroup 1230 * directory to avoid race between userspace and kernelspace. 1231 */ 1232 spin_lock_irq(&memcg->event_list_lock); 1233 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 1234 list_del_init(&event->list); 1235 schedule_work(&event->remove); 1236 } 1237 spin_unlock_irq(&memcg->event_list_lock); 1238 } 1239 1240 /* 1241 * Check OOM-Killer is already running under our hierarchy. 1242 * If someone is running, return false. 1243 */ 1244 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1245 { 1246 struct mem_cgroup *iter, *failed = NULL; 1247 1248 spin_lock(&memcg_oom_lock); 1249 1250 for_each_mem_cgroup_tree(iter, memcg) { 1251 if (iter->oom_lock) { 1252 /* 1253 * this subtree of our hierarchy is already locked 1254 * so we cannot give a lock. 
1255 */ 1256 failed = iter; 1257 mem_cgroup_iter_break(memcg, iter); 1258 break; 1259 } 1260 iter->oom_lock = true; 1261 } 1262 1263 if (failed) { 1264 /* 1265 * OK, we failed to lock the whole subtree so we have 1266 * to clean up what we set up to the failing subtree 1267 */ 1268 for_each_mem_cgroup_tree(iter, memcg) { 1269 if (iter == failed) { 1270 mem_cgroup_iter_break(memcg, iter); 1271 break; 1272 } 1273 iter->oom_lock = false; 1274 } 1275 } else 1276 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1277 1278 spin_unlock(&memcg_oom_lock); 1279 1280 return !failed; 1281 } 1282 1283 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1284 { 1285 struct mem_cgroup *iter; 1286 1287 spin_lock(&memcg_oom_lock); 1288 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1289 for_each_mem_cgroup_tree(iter, memcg) 1290 iter->oom_lock = false; 1291 spin_unlock(&memcg_oom_lock); 1292 } 1293 1294 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1295 { 1296 struct mem_cgroup *iter; 1297 1298 spin_lock(&memcg_oom_lock); 1299 for_each_mem_cgroup_tree(iter, memcg) 1300 iter->under_oom++; 1301 spin_unlock(&memcg_oom_lock); 1302 } 1303 1304 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1305 { 1306 struct mem_cgroup *iter; 1307 1308 /* 1309 * Be careful about under_oom underflows because a child memcg 1310 * could have been added after mem_cgroup_mark_under_oom. 
1311 */ 1312 spin_lock(&memcg_oom_lock); 1313 for_each_mem_cgroup_tree(iter, memcg) 1314 if (iter->under_oom > 0) 1315 iter->under_oom--; 1316 spin_unlock(&memcg_oom_lock); 1317 } 1318 1319 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1320 1321 struct oom_wait_info { 1322 struct mem_cgroup *memcg; 1323 wait_queue_entry_t wait; 1324 }; 1325 1326 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1327 unsigned int mode, int sync, void *arg) 1328 { 1329 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1330 struct mem_cgroup *oom_wait_memcg; 1331 struct oom_wait_info *oom_wait_info; 1332 1333 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1334 oom_wait_memcg = oom_wait_info->memcg; 1335 1336 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1337 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1338 return 0; 1339 return autoremove_wake_function(wait, mode, sync, arg); 1340 } 1341 1342 void memcg1_oom_recover(struct mem_cgroup *memcg) 1343 { 1344 /* 1345 * For the following lockless ->under_oom test, the only required 1346 * guarantee is that it must see the state asserted by an OOM when 1347 * this function is called as a result of userland actions 1348 * triggered by the notification of the OOM. This is trivially 1349 * achieved by invoking mem_cgroup_mark_under_oom() before 1350 * triggering notification. 1351 */ 1352 if (memcg && memcg->under_oom) 1353 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1354 } 1355 1356 /** 1357 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1358 * @handle: actually kill/wait or just clean up the OOM state 1359 * 1360 * This has to be called at the end of a page fault if the memcg OOM 1361 * handler was enabled. 1362 * 1363 * Memcg supports userspace OOM handling where failed allocations must 1364 * sleep on a waitqueue until the userspace task resolves the 1365 * situation. 
Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	/* Caller only wants the per-task OOM state dropped. */
	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	/*
	 * Queue ourselves before marking the hierarchy and notifying, so
	 * that a wake-up issued in response to the notification finds this
	 * entry on the wait queue.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	/* Only the task that got the hierarchy lock sends the notification. */
	if (locked)
		mem_cgroup_oom_notify(memcg);

	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	/* Drop the reference taken when ->memcg_in_oom was recorded. */
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}


/*
 * Prepare for OOM handling in the charge path of @memcg.
 *
 * Returns false when the OOM killer is disabled for @memcg; for a user
 * fault the memcg is then recorded in current->memcg_in_oom (with a css
 * reference) for mem_cgroup_oom_synchronize(). Otherwise returns true and
 * stores in @locked whether the OOM lock on the hierarchy was obtained.
 */
bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	*locked = mem_cgroup_oom_trylock(memcg);

	if (*locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	return true;
}

/* Release the OOM lock on @memcg's hierarchy if @locked says it was taken. */
void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
	if (locked)
		mem_cgroup_oom_unlock(memcg);
}

/* Serialises updates of the memory, memsw and tcpmem limits. */
static DEFINE_MUTEX(memcg_max_mutex);

/*
 * Set the memory (@memsw false) or memory+swap (@memsw true) limit of
 * @memcg to @max pages.
 *
 * While page_counter_set_max() fails, the loop first drains the charge
 * stocks once and then reclaims from the memcg until the limit can be
 * set or reclaim makes no progress.
 *
 * Returns 0 on success, -EINTR if a signal is pending, -EINVAL if the new
 * value would break memory.max <= memsw.max, or -EBUSY if reclaim freed
 * nothing.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		/* First failure: drain the stocks once and retry. */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		/* Swap is only allowed when resizing the plain memory limit. */
		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	/* A raised limit may let tasks waiting on an OOM make progress. */
	if (!ret && enlarge)
		memcg1_oom_recover(memcg);

	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 *
 * Returns 0 when the loop ends (counter empty or retries exhausted) and
 * -EINTR if a signal is pending.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		if (signal_pending(current))
			return -EINTR;

		/* Only reclaim passes that free nothing consume a retry. */
		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
						  MEMCG_RECLAIM_MAY_SWAP, NULL))
			nr_retries--;
	}

	return 0;
}

/*
 * Write handler for "force_empty". Rejected with -EINVAL on the root
 * memcg; otherwise returns the error from mem_cgroup_force_empty() or
 * @nbytes.
 */
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

/* Read handler for "use_hierarchy": always reports 1. */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}

/*
 * Write handler for "use_hierarchy": writing 1 is accepted as a no-op,
 * any other value warns once and fails with -EINVAL.
 */
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	if (val == 1)
		return 0;

	pr_warn_once("Non-hierarchical mode is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	return -EINVAL;
}

/*
 * Shared read handler for the numeric resource files. The counter is
 * selected by MEMFILE_TYPE(cft->private) and the attribute by
 * MEMFILE_ATTR(cft->private). Page-based values are returned multiplied
 * by PAGE_SIZE; the fail count is returned as is.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/* memory and memsw usage go through mem_cgroup_usage() */
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}

/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
1618 */ 1619 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 1620 __always_unused void *v) 1621 { 1622 return -EINVAL; 1623 } 1624 1625 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 1626 { 1627 int ret; 1628 1629 mutex_lock(&memcg_max_mutex); 1630 1631 ret = page_counter_set_max(&memcg->tcpmem, max); 1632 if (ret) 1633 goto out; 1634 1635 if (!memcg->tcpmem_active) { 1636 /* 1637 * The active flag needs to be written after the static_key 1638 * update. This is what guarantees that the socket activation 1639 * function is the last one to run. See mem_cgroup_sk_alloc() 1640 * for details, and note that we don't mark any socket as 1641 * belonging to this memcg until that flag is up. 1642 * 1643 * We need to do this, because static_keys will span multiple 1644 * sites, but we can't control their order. If we mark a socket 1645 * as accounted, but the accounting functions are not patched in 1646 * yet, we'll lose accounting. 1647 * 1648 * We never race with the readers in mem_cgroup_sk_alloc(), 1649 * because when this value change, the code to process it is not 1650 * patched in yet. 1651 */ 1652 static_branch_inc(&memcg_sockets_enabled_key); 1653 memcg->tcpmem_active = true; 1654 } 1655 out: 1656 mutex_unlock(&memcg_max_mutex); 1657 return ret; 1658 } 1659 1660 /* 1661 * The user of this function is... 1662 * RES_LIMIT. 
1663 */ 1664 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 1665 char *buf, size_t nbytes, loff_t off) 1666 { 1667 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1668 unsigned long nr_pages; 1669 int ret; 1670 1671 buf = strstrip(buf); 1672 ret = page_counter_memparse(buf, "-1", &nr_pages); 1673 if (ret) 1674 return ret; 1675 1676 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1677 case RES_LIMIT: 1678 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 1679 ret = -EINVAL; 1680 break; 1681 } 1682 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1683 case _MEM: 1684 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 1685 break; 1686 case _MEMSWAP: 1687 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 1688 break; 1689 case _KMEM: 1690 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 1691 "Writing any value to this file has no effect. " 1692 "Please report your usecase to linux-mm@kvack.org if you " 1693 "depend on this functionality.\n"); 1694 ret = 0; 1695 break; 1696 case _TCP: 1697 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " 1698 "Please report your usecase to linux-mm@kvack.org if you " 1699 "depend on this functionality.\n"); 1700 ret = memcg_update_tcp_max(memcg, nr_pages); 1701 break; 1702 } 1703 break; 1704 case RES_SOFT_LIMIT: 1705 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1706 ret = -EOPNOTSUPP; 1707 } else { 1708 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. 
" 1709 "Please report your usecase to linux-mm@kvack.org if you " 1710 "depend on this functionality.\n"); 1711 WRITE_ONCE(memcg->soft_limit, nr_pages); 1712 ret = 0; 1713 } 1714 break; 1715 } 1716 return ret ?: nbytes; 1717 } 1718 1719 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 1720 size_t nbytes, loff_t off) 1721 { 1722 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1723 struct page_counter *counter; 1724 1725 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1726 case _MEM: 1727 counter = &memcg->memory; 1728 break; 1729 case _MEMSWAP: 1730 counter = &memcg->memsw; 1731 break; 1732 case _KMEM: 1733 counter = &memcg->kmem; 1734 break; 1735 case _TCP: 1736 counter = &memcg->tcpmem; 1737 break; 1738 default: 1739 BUG(); 1740 } 1741 1742 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1743 case RES_MAX_USAGE: 1744 page_counter_reset_watermark(counter); 1745 break; 1746 case RES_FAILCNT: 1747 counter->failcnt = 0; 1748 break; 1749 default: 1750 BUG(); 1751 } 1752 1753 return nbytes; 1754 } 1755 1756 #ifdef CONFIG_NUMA 1757 1758 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 1759 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 1760 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 1761 1762 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 1763 int nid, unsigned int lru_mask, bool tree) 1764 { 1765 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 1766 unsigned long nr = 0; 1767 enum lru_list lru; 1768 1769 VM_BUG_ON((unsigned int)nid >= nr_node_ids); 1770 1771 for_each_lru(lru) { 1772 if (!(BIT(lru) & lru_mask)) 1773 continue; 1774 if (tree) 1775 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 1776 else 1777 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 1778 } 1779 return nr; 1780 } 1781 1782 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 1783 unsigned int lru_mask, 1784 bool tree) 1785 { 1786 unsigned long nr = 0; 
1787 enum lru_list lru; 1788 1789 for_each_lru(lru) { 1790 if (!(BIT(lru) & lru_mask)) 1791 continue; 1792 if (tree) 1793 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 1794 else 1795 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 1796 } 1797 return nr; 1798 } 1799 1800 static int memcg_numa_stat_show(struct seq_file *m, void *v) 1801 { 1802 struct numa_stat { 1803 const char *name; 1804 unsigned int lru_mask; 1805 }; 1806 1807 static const struct numa_stat stats[] = { 1808 { "total", LRU_ALL }, 1809 { "file", LRU_ALL_FILE }, 1810 { "anon", LRU_ALL_ANON }, 1811 { "unevictable", BIT(LRU_UNEVICTABLE) }, 1812 }; 1813 const struct numa_stat *stat; 1814 int nid; 1815 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1816 1817 mem_cgroup_flush_stats(memcg); 1818 1819 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1820 seq_printf(m, "%s=%lu", stat->name, 1821 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1822 false)); 1823 for_each_node_state(nid, N_MEMORY) 1824 seq_printf(m, " N%d=%lu", nid, 1825 mem_cgroup_node_nr_lru_pages(memcg, nid, 1826 stat->lru_mask, false)); 1827 seq_putc(m, '\n'); 1828 } 1829 1830 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1831 1832 seq_printf(m, "hierarchical_%s=%lu", stat->name, 1833 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1834 true)); 1835 for_each_node_state(nid, N_MEMORY) 1836 seq_printf(m, " N%d=%lu", nid, 1837 mem_cgroup_node_nr_lru_pages(memcg, nid, 1838 stat->lru_mask, true)); 1839 seq_putc(m, '\n'); 1840 } 1841 1842 return 0; 1843 } 1844 #endif /* CONFIG_NUMA */ 1845 1846 static const unsigned int memcg1_stats[] = { 1847 NR_FILE_PAGES, 1848 NR_ANON_MAPPED, 1849 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1850 NR_ANON_THPS, 1851 #endif 1852 NR_SHMEM, 1853 NR_FILE_MAPPED, 1854 NR_FILE_DIRTY, 1855 NR_WRITEBACK, 1856 WORKINGSET_REFAULT_ANON, 1857 WORKINGSET_REFAULT_FILE, 1858 #ifdef CONFIG_SWAP 1859 MEMCG_SWAP, 1860 NR_SWAPCACHE, 1861 #endif 1862 }; 1863 1864 static const char *const memcg1_stat_names[] = { 
1865 "cache", 1866 "rss", 1867 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1868 "rss_huge", 1869 #endif 1870 "shmem", 1871 "mapped_file", 1872 "dirty", 1873 "writeback", 1874 "workingset_refault_anon", 1875 "workingset_refault_file", 1876 #ifdef CONFIG_SWAP 1877 "swap", 1878 "swapcached", 1879 #endif 1880 }; 1881 1882 /* Universal VM events cgroup1 shows, original sort order */ 1883 static const unsigned int memcg1_events[] = { 1884 PGPGIN, 1885 PGPGOUT, 1886 PGFAULT, 1887 PGMAJFAULT, 1888 }; 1889 1890 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1891 { 1892 unsigned long memory, memsw; 1893 struct mem_cgroup *mi; 1894 unsigned int i; 1895 1896 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 1897 1898 mem_cgroup_flush_stats(memcg); 1899 1900 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1901 unsigned long nr; 1902 1903 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 1904 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 1905 } 1906 1907 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1908 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 1909 memcg_events_local(memcg, memcg1_events[i])); 1910 1911 for (i = 0; i < NR_LRU_LISTS; i++) 1912 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 1913 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 1914 PAGE_SIZE); 1915 1916 /* Hierarchical information */ 1917 memory = memsw = PAGE_COUNTER_MAX; 1918 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 1919 memory = min(memory, READ_ONCE(mi->memory.max)); 1920 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 1921 } 1922 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 1923 (u64)memory * PAGE_SIZE); 1924 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 1925 (u64)memsw * PAGE_SIZE); 1926 1927 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1928 unsigned long nr; 1929 1930 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 1931 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 1932 
(u64)nr); 1933 } 1934 1935 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1936 seq_buf_printf(s, "total_%s %llu\n", 1937 vm_event_name(memcg1_events[i]), 1938 (u64)memcg_events(memcg, memcg1_events[i])); 1939 1940 for (i = 0; i < NR_LRU_LISTS; i++) 1941 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 1942 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 1943 PAGE_SIZE); 1944 1945 #ifdef CONFIG_DEBUG_VM 1946 { 1947 pg_data_t *pgdat; 1948 struct mem_cgroup_per_node *mz; 1949 unsigned long anon_cost = 0; 1950 unsigned long file_cost = 0; 1951 1952 for_each_online_pgdat(pgdat) { 1953 mz = memcg->nodeinfo[pgdat->node_id]; 1954 1955 anon_cost += mz->lruvec.anon_cost; 1956 file_cost += mz->lruvec.file_cost; 1957 } 1958 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 1959 seq_buf_printf(s, "file_cost %lu\n", file_cost); 1960 } 1961 #endif 1962 } 1963 1964 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 1965 struct cftype *cft) 1966 { 1967 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1968 1969 return mem_cgroup_swappiness(memcg); 1970 } 1971 1972 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 1973 struct cftype *cft, u64 val) 1974 { 1975 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1976 1977 if (val > MAX_SWAPPINESS) 1978 return -EINVAL; 1979 1980 if (!mem_cgroup_is_root(memcg)) { 1981 pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" 1982 "See memory.reclaim or memory.swap.max there\n "); 1983 WRITE_ONCE(memcg->swappiness, val); 1984 } else 1985 WRITE_ONCE(vm_swappiness, val); 1986 1987 return 0; 1988 } 1989 1990 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 1991 { 1992 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 1993 1994 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 1995 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 1996 seq_printf(sf, "oom_kill %lu\n", 1997 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 1998 return 0; 1999 } 2000 2001 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 2002 struct cftype *cft, u64 val) 2003 { 2004 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2005 2006 pr_warn_once("oom_control is deprecated and will be removed. " 2007 "Please report your usecase to linux-mm-@kvack.org if you " 2008 "depend on this functionality.\n"); 2009 2010 /* cannot set to root cgroup and only 0 and 1 are allowed */ 2011 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 2012 return -EINVAL; 2013 2014 WRITE_ONCE(memcg->oom_kill_disable, val); 2015 if (!val) 2016 memcg1_oom_recover(memcg); 2017 2018 return 0; 2019 } 2020 2021 #ifdef CONFIG_SLUB_DEBUG 2022 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 2023 { 2024 /* 2025 * Deprecated. 2026 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 
2027 */ 2028 return 0; 2029 } 2030 #endif 2031 2032 struct cftype mem_cgroup_legacy_files[] = { 2033 { 2034 .name = "usage_in_bytes", 2035 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2036 .read_u64 = mem_cgroup_read_u64, 2037 }, 2038 { 2039 .name = "max_usage_in_bytes", 2040 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2041 .write = mem_cgroup_reset, 2042 .read_u64 = mem_cgroup_read_u64, 2043 }, 2044 { 2045 .name = "limit_in_bytes", 2046 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2047 .write = mem_cgroup_write, 2048 .read_u64 = mem_cgroup_read_u64, 2049 }, 2050 { 2051 .name = "soft_limit_in_bytes", 2052 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2053 .write = mem_cgroup_write, 2054 .read_u64 = mem_cgroup_read_u64, 2055 }, 2056 { 2057 .name = "failcnt", 2058 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2059 .write = mem_cgroup_reset, 2060 .read_u64 = mem_cgroup_read_u64, 2061 }, 2062 { 2063 .name = "stat", 2064 .seq_show = memory_stat_show, 2065 }, 2066 { 2067 .name = "force_empty", 2068 .write = mem_cgroup_force_empty_write, 2069 }, 2070 { 2071 .name = "use_hierarchy", 2072 .write_u64 = mem_cgroup_hierarchy_write, 2073 .read_u64 = mem_cgroup_hierarchy_read, 2074 }, 2075 { 2076 .name = "cgroup.event_control", /* XXX: for compat */ 2077 .write = memcg_write_event_control, 2078 .flags = CFTYPE_NO_PREFIX, 2079 }, 2080 { 2081 .name = "swappiness", 2082 .read_u64 = mem_cgroup_swappiness_read, 2083 .write_u64 = mem_cgroup_swappiness_write, 2084 }, 2085 { 2086 .name = "move_charge_at_immigrate", 2087 .read_u64 = mem_cgroup_move_charge_read, 2088 .write_u64 = mem_cgroup_move_charge_write, 2089 }, 2090 { 2091 .name = "oom_control", 2092 .seq_show = mem_cgroup_oom_control_read, 2093 .write_u64 = mem_cgroup_oom_control_write, 2094 }, 2095 { 2096 .name = "pressure_level", 2097 .seq_show = mem_cgroup_dummy_seq_show, 2098 }, 2099 #ifdef CONFIG_NUMA 2100 { 2101 .name = "numa_stat", 2102 .seq_show = memcg_numa_stat_show, 2103 }, 2104 #endif 2105 { 2106 .name = 
"kmem.limit_in_bytes", 2107 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 2108 .write = mem_cgroup_write, 2109 .read_u64 = mem_cgroup_read_u64, 2110 }, 2111 { 2112 .name = "kmem.usage_in_bytes", 2113 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 2114 .read_u64 = mem_cgroup_read_u64, 2115 }, 2116 { 2117 .name = "kmem.failcnt", 2118 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 2119 .write = mem_cgroup_reset, 2120 .read_u64 = mem_cgroup_read_u64, 2121 }, 2122 { 2123 .name = "kmem.max_usage_in_bytes", 2124 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 2125 .write = mem_cgroup_reset, 2126 .read_u64 = mem_cgroup_read_u64, 2127 }, 2128 #ifdef CONFIG_SLUB_DEBUG 2129 { 2130 .name = "kmem.slabinfo", 2131 .seq_show = mem_cgroup_slab_show, 2132 }, 2133 #endif 2134 { 2135 .name = "kmem.tcp.limit_in_bytes", 2136 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 2137 .write = mem_cgroup_write, 2138 .read_u64 = mem_cgroup_read_u64, 2139 }, 2140 { 2141 .name = "kmem.tcp.usage_in_bytes", 2142 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 2143 .read_u64 = mem_cgroup_read_u64, 2144 }, 2145 { 2146 .name = "kmem.tcp.failcnt", 2147 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 2148 .write = mem_cgroup_reset, 2149 .read_u64 = mem_cgroup_read_u64, 2150 }, 2151 { 2152 .name = "kmem.tcp.max_usage_in_bytes", 2153 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 2154 .write = mem_cgroup_reset, 2155 .read_u64 = mem_cgroup_read_u64, 2156 }, 2157 { }, /* terminate */ 2158 }; 2159 2160 struct cftype memsw_files[] = { 2161 { 2162 .name = "memsw.usage_in_bytes", 2163 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2164 .read_u64 = mem_cgroup_read_u64, 2165 }, 2166 { 2167 .name = "memsw.max_usage_in_bytes", 2168 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2169 .write = mem_cgroup_reset, 2170 .read_u64 = mem_cgroup_read_u64, 2171 }, 2172 { 2173 .name = "memsw.limit_in_bytes", 2174 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2175 .write = mem_cgroup_write, 2176 .read_u64 = 
mem_cgroup_read_u64, 2177 }, 2178 { 2179 .name = "memsw.failcnt", 2180 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2181 .write = mem_cgroup_reset, 2182 .read_u64 = mem_cgroup_read_u64, 2183 }, 2184 { }, /* terminate */ 2185 }; 2186 2187 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2188 { 2189 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 2190 if (nr_pages > 0) 2191 page_counter_charge(&memcg->kmem, nr_pages); 2192 else 2193 page_counter_uncharge(&memcg->kmem, -nr_pages); 2194 } 2195 } 2196 2197 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 2198 gfp_t gfp_mask) 2199 { 2200 struct page_counter *fail; 2201 2202 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 2203 memcg->tcpmem_pressure = 0; 2204 return true; 2205 } 2206 memcg->tcpmem_pressure = 1; 2207 if (gfp_mask & __GFP_NOFAIL) { 2208 page_counter_charge(&memcg->tcpmem, nr_pages); 2209 return true; 2210 } 2211 return false; 2212 } 2213 2214 bool memcg1_alloc_events(struct mem_cgroup *memcg) 2215 { 2216 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, 2217 GFP_KERNEL_ACCOUNT); 2218 return !!memcg->events_percpu; 2219 } 2220 2221 void memcg1_free_events(struct mem_cgroup *memcg) 2222 { 2223 free_percpu(memcg->events_percpu); 2224 } 2225 2226 static int __init memcg1_init(void) 2227 { 2228 int node; 2229 2230 for_each_node(node) { 2231 struct mem_cgroup_tree_per_node *rtpn; 2232 2233 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 2234 2235 rtpn->rb_root = RB_ROOT; 2236 rtpn->rb_rightmost = NULL; 2237 spin_lock_init(&rtpn->lock); 2238 soft_limit_tree.rb_tree_per_node[node] = rtpn; 2239 } 2240 2241 return 0; 2242 } 2243 subsys_initcall(memcg1_init); 2244