1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include <linux/memcontrol.h> 4 #include <linux/swap.h> 5 #include <linux/mm_inline.h> 6 #include <linux/pagewalk.h> 7 #include <linux/backing-dev.h> 8 #include <linux/swap_cgroup.h> 9 #include <linux/eventfd.h> 10 #include <linux/poll.h> 11 #include <linux/sort.h> 12 #include <linux/file.h> 13 #include <linux/seq_buf.h> 14 15 #include "internal.h" 16 #include "swap.h" 17 #include "memcontrol-v1.h" 18 19 /* 20 * Cgroups above their limits are maintained in a RB-Tree, independent of 21 * their hierarchy representation 22 */ 23 24 struct mem_cgroup_tree_per_node { 25 struct rb_root rb_root; 26 struct rb_node *rb_rightmost; 27 spinlock_t lock; 28 }; 29 30 struct mem_cgroup_tree { 31 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 32 }; 33 34 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 35 36 /* 37 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft 38 * limit reclaim to prevent infinite loops, if they ever occur. 39 */ 40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 42 43 /* for OOM */ 44 struct mem_cgroup_eventfd_list { 45 struct list_head list; 46 struct eventfd_ctx *eventfd; 47 }; 48 49 /* 50 * cgroup_event represents events which userspace want to receive. 51 */ 52 struct mem_cgroup_event { 53 /* 54 * memcg which the event belongs to. 55 */ 56 struct mem_cgroup *memcg; 57 /* 58 * eventfd to signal userspace about the event. 59 */ 60 struct eventfd_ctx *eventfd; 61 /* 62 * Each of these stored in a list by the cgroup. 63 */ 64 struct list_head list; 65 /* 66 * register_event() callback will be used to add new userspace 67 * waiter for changes related to this event. Use eventfd_signal() 68 * on eventfd to send notification to userspace. 
69 */ 70 int (*register_event)(struct mem_cgroup *memcg, 71 struct eventfd_ctx *eventfd, const char *args); 72 /* 73 * unregister_event() callback will be called when userspace closes 74 * the eventfd or on cgroup removing. This callback must be set, 75 * if you want provide notification functionality. 76 */ 77 void (*unregister_event)(struct mem_cgroup *memcg, 78 struct eventfd_ctx *eventfd); 79 /* 80 * All fields below needed to unregister event when 81 * userspace closes eventfd. 82 */ 83 poll_table pt; 84 wait_queue_head_t *wqh; 85 wait_queue_entry_t wait; 86 struct work_struct remove; 87 }; 88 89 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 90 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 91 #define MEMFILE_ATTR(val) ((val) & 0xffff) 92 93 enum { 94 RES_USAGE, 95 RES_LIMIT, 96 RES_MAX_USAGE, 97 RES_FAILCNT, 98 RES_SOFT_LIMIT, 99 }; 100 101 #ifdef CONFIG_LOCKDEP 102 static struct lockdep_map memcg_oom_lock_dep_map = { 103 .name = "memcg_oom_lock", 104 }; 105 #endif 106 107 DEFINE_SPINLOCK(memcg_oom_lock); 108 109 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 110 struct mem_cgroup_tree_per_node *mctz, 111 unsigned long new_usage_in_excess) 112 { 113 struct rb_node **p = &mctz->rb_root.rb_node; 114 struct rb_node *parent = NULL; 115 struct mem_cgroup_per_node *mz_node; 116 bool rightmost = true; 117 118 if (mz->on_tree) 119 return; 120 121 mz->usage_in_excess = new_usage_in_excess; 122 if (!mz->usage_in_excess) 123 return; 124 while (*p) { 125 parent = *p; 126 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 127 tree_node); 128 if (mz->usage_in_excess < mz_node->usage_in_excess) { 129 p = &(*p)->rb_left; 130 rightmost = false; 131 } else { 132 p = &(*p)->rb_right; 133 } 134 } 135 136 if (rightmost) 137 mctz->rb_rightmost = &mz->tree_node; 138 139 rb_link_node(&mz->tree_node, parent, p); 140 rb_insert_color(&mz->tree_node, &mctz->rb_root); 141 mz->on_tree = true; 142 } 143 144 static void 
__mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 145 struct mem_cgroup_tree_per_node *mctz) 146 { 147 if (!mz->on_tree) 148 return; 149 150 if (&mz->tree_node == mctz->rb_rightmost) 151 mctz->rb_rightmost = rb_prev(&mz->tree_node); 152 153 rb_erase(&mz->tree_node, &mctz->rb_root); 154 mz->on_tree = false; 155 } 156 157 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 158 struct mem_cgroup_tree_per_node *mctz) 159 { 160 unsigned long flags; 161 162 spin_lock_irqsave(&mctz->lock, flags); 163 __mem_cgroup_remove_exceeded(mz, mctz); 164 spin_unlock_irqrestore(&mctz->lock, flags); 165 } 166 167 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 168 { 169 unsigned long nr_pages = page_counter_read(&memcg->memory); 170 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 171 unsigned long excess = 0; 172 173 if (nr_pages > soft_limit) 174 excess = nr_pages - soft_limit; 175 176 return excess; 177 } 178 179 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid) 180 { 181 unsigned long excess; 182 struct mem_cgroup_per_node *mz; 183 struct mem_cgroup_tree_per_node *mctz; 184 185 if (lru_gen_enabled()) { 186 if (soft_limit_excess(memcg)) 187 lru_gen_soft_reclaim(memcg, nid); 188 return; 189 } 190 191 mctz = soft_limit_tree.rb_tree_per_node[nid]; 192 if (!mctz) 193 return; 194 /* 195 * Necessary to update all ancestors when hierarchy is used. 196 * because their event counter is not touched. 197 */ 198 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 199 mz = memcg->nodeinfo[nid]; 200 excess = soft_limit_excess(memcg); 201 /* 202 * We have to update the tree if mz is on RB-tree or 203 * mem is over its softlimit. 204 */ 205 if (excess || mz->on_tree) { 206 unsigned long flags; 207 208 spin_lock_irqsave(&mctz->lock, flags); 209 /* if on-tree, remove it */ 210 if (mz->on_tree) 211 __mem_cgroup_remove_exceeded(mz, mctz); 212 /* 213 * Insert again. mz->usage_in_excess will be updated. 
214 * If excess is 0, no tree ops. 215 */ 216 __mem_cgroup_insert_exceeded(mz, mctz, excess); 217 spin_unlock_irqrestore(&mctz->lock, flags); 218 } 219 } 220 } 221 222 void memcg1_remove_from_trees(struct mem_cgroup *memcg) 223 { 224 struct mem_cgroup_tree_per_node *mctz; 225 struct mem_cgroup_per_node *mz; 226 int nid; 227 228 for_each_node(nid) { 229 mz = memcg->nodeinfo[nid]; 230 mctz = soft_limit_tree.rb_tree_per_node[nid]; 231 if (mctz) 232 mem_cgroup_remove_exceeded(mz, mctz); 233 } 234 } 235 236 static struct mem_cgroup_per_node * 237 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 238 { 239 struct mem_cgroup_per_node *mz; 240 241 retry: 242 mz = NULL; 243 if (!mctz->rb_rightmost) 244 goto done; /* Nothing to reclaim from */ 245 246 mz = rb_entry(mctz->rb_rightmost, 247 struct mem_cgroup_per_node, tree_node); 248 /* 249 * Remove the node now but someone else can add it back, 250 * we will to add it back at the end of reclaim to its correct 251 * position in the tree. 
252 */ 253 __mem_cgroup_remove_exceeded(mz, mctz); 254 if (!soft_limit_excess(mz->memcg) || 255 !css_tryget(&mz->memcg->css)) 256 goto retry; 257 done: 258 return mz; 259 } 260 261 static struct mem_cgroup_per_node * 262 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 263 { 264 struct mem_cgroup_per_node *mz; 265 266 spin_lock_irq(&mctz->lock); 267 mz = __mem_cgroup_largest_soft_limit_node(mctz); 268 spin_unlock_irq(&mctz->lock); 269 return mz; 270 } 271 272 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 273 pg_data_t *pgdat, 274 gfp_t gfp_mask, 275 unsigned long *total_scanned) 276 { 277 struct mem_cgroup *victim = NULL; 278 int total = 0; 279 int loop = 0; 280 unsigned long excess; 281 unsigned long nr_scanned; 282 struct mem_cgroup_reclaim_cookie reclaim = { 283 .pgdat = pgdat, 284 }; 285 286 excess = soft_limit_excess(root_memcg); 287 288 while (1) { 289 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 290 if (!victim) { 291 loop++; 292 if (loop >= 2) { 293 /* 294 * If we have not been able to reclaim 295 * anything, it might because there are 296 * no reclaimable pages under this hierarchy 297 */ 298 if (!total) 299 break; 300 /* 301 * We want to do more targeted reclaim. 
302 * excess >> 2 is not to excessive so as to 303 * reclaim too much, nor too less that we keep 304 * coming back to reclaim from this cgroup 305 */ 306 if (total >= (excess >> 2) || 307 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 308 break; 309 } 310 continue; 311 } 312 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 313 pgdat, &nr_scanned); 314 *total_scanned += nr_scanned; 315 if (!soft_limit_excess(root_memcg)) 316 break; 317 } 318 mem_cgroup_iter_break(root_memcg, victim); 319 return total; 320 } 321 322 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, 323 gfp_t gfp_mask, 324 unsigned long *total_scanned) 325 { 326 unsigned long nr_reclaimed = 0; 327 struct mem_cgroup_per_node *mz, *next_mz = NULL; 328 unsigned long reclaimed; 329 int loop = 0; 330 struct mem_cgroup_tree_per_node *mctz; 331 unsigned long excess; 332 333 if (lru_gen_enabled()) 334 return 0; 335 336 if (order > 0) 337 return 0; 338 339 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 340 341 /* 342 * Do not even bother to check the largest node if the root 343 * is empty. Do it lockless to prevent lock bouncing. Races 344 * are acceptable as soft limit is best effort anyway. 
345 */ 346 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 347 return 0; 348 349 /* 350 * This loop can run a while, specially if mem_cgroup's continuously 351 * keep exceeding their soft limit and putting the system under 352 * pressure 353 */ 354 do { 355 if (next_mz) 356 mz = next_mz; 357 else 358 mz = mem_cgroup_largest_soft_limit_node(mctz); 359 if (!mz) 360 break; 361 362 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 363 gfp_mask, total_scanned); 364 nr_reclaimed += reclaimed; 365 spin_lock_irq(&mctz->lock); 366 367 /* 368 * If we failed to reclaim anything from this memory cgroup 369 * it is time to move on to the next cgroup 370 */ 371 next_mz = NULL; 372 if (!reclaimed) 373 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 374 375 excess = soft_limit_excess(mz->memcg); 376 /* 377 * One school of thought says that we should not add 378 * back the node to the tree if reclaim returns 0. 379 * But our reclaim could return 0, simply because due 380 * to priority we are exposing a smaller subset of 381 * memory to reclaim from. Consider this as a longer 382 * term TODO. 383 */ 384 /* If excess == 0, no tree ops */ 385 __mem_cgroup_insert_exceeded(mz, mctz, excess); 386 spin_unlock_irq(&mctz->lock); 387 css_put(&mz->memcg->css); 388 loop++; 389 /* 390 * Could not reclaim anything and there are no more 391 * mem cgroups to try or we seem to be looping without 392 * reclaiming anything. 
393 */ 394 if (!nr_reclaimed && 395 (next_mz == NULL || 396 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 397 break; 398 } while (!nr_reclaimed); 399 if (next_mz) 400 css_put(&next_mz->memcg->css); 401 return nr_reclaimed; 402 } 403 404 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 405 struct cftype *cft) 406 { 407 return 0; 408 } 409 410 #ifdef CONFIG_MMU 411 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 412 struct cftype *cft, u64 val) 413 { 414 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 415 "Please report your usecase to linux-mm@kvack.org if you " 416 "depend on this functionality.\n"); 417 418 if (val != 0) 419 return -EINVAL; 420 return 0; 421 } 422 #else 423 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 424 struct cftype *cft, u64 val) 425 { 426 return -ENOSYS; 427 } 428 #endif 429 430 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 431 { 432 unsigned long val; 433 434 if (mem_cgroup_is_root(memcg)) { 435 /* 436 * Approximate root's usage from global state. This isn't 437 * perfect, but the root usage was always an approximation. 
438 */ 439 val = global_node_page_state(NR_FILE_PAGES) + 440 global_node_page_state(NR_ANON_MAPPED); 441 if (swap) 442 val += total_swap_pages - get_nr_swap_pages(); 443 } else { 444 if (!swap) 445 val = page_counter_read(&memcg->memory); 446 else 447 val = page_counter_read(&memcg->memsw); 448 } 449 return val; 450 } 451 452 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 453 { 454 struct mem_cgroup_threshold_ary *t; 455 unsigned long usage; 456 int i; 457 458 rcu_read_lock(); 459 if (!swap) 460 t = rcu_dereference(memcg->thresholds.primary); 461 else 462 t = rcu_dereference(memcg->memsw_thresholds.primary); 463 464 if (!t) 465 goto unlock; 466 467 usage = mem_cgroup_usage(memcg, swap); 468 469 /* 470 * current_threshold points to threshold just below or equal to usage. 471 * If it's not true, a threshold was crossed after last 472 * call of __mem_cgroup_threshold(). 473 */ 474 i = t->current_threshold; 475 476 /* 477 * Iterate backward over array of thresholds starting from 478 * current_threshold and check if a threshold is crossed. 479 * If none of thresholds below usage is crossed, we read 480 * only one element of the array here. 481 */ 482 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 483 eventfd_signal(t->entries[i].eventfd); 484 485 /* i = current_threshold + 1 */ 486 i++; 487 488 /* 489 * Iterate forward over array of thresholds starting from 490 * current_threshold+1 and check if a threshold is crossed. 491 * If none of thresholds above usage is crossed, we read 492 * only one element of the array here. 
493 */ 494 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 495 eventfd_signal(t->entries[i].eventfd); 496 497 /* Update current_threshold */ 498 t->current_threshold = i - 1; 499 unlock: 500 rcu_read_unlock(); 501 } 502 503 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 504 { 505 while (memcg) { 506 __mem_cgroup_threshold(memcg, false); 507 if (do_memsw_account()) 508 __mem_cgroup_threshold(memcg, true); 509 510 memcg = parent_mem_cgroup(memcg); 511 } 512 } 513 514 /* Cgroup1: threshold notifications & softlimit tree updates */ 515 516 /* 517 * Per memcg event counter is incremented at every pagein/pageout. With THP, 518 * it will be incremented by the number of pages. This counter is used 519 * to trigger some periodic events. This is straightforward and better 520 * than using jiffies etc. to handle periodic memcg event. 521 */ 522 enum mem_cgroup_events_target { 523 MEM_CGROUP_TARGET_THRESH, 524 MEM_CGROUP_TARGET_SOFTLIMIT, 525 MEM_CGROUP_NTARGETS, 526 }; 527 528 struct memcg1_events_percpu { 529 unsigned long nr_page_events; 530 unsigned long targets[MEM_CGROUP_NTARGETS]; 531 }; 532 533 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) 534 { 535 /* pagein of a big page is an event. 
So, ignore page size */ 536 if (nr_pages > 0) 537 count_memcg_events(memcg, PGPGIN, 1); 538 else { 539 count_memcg_events(memcg, PGPGOUT, 1); 540 nr_pages = -nr_pages; /* for event */ 541 } 542 543 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages); 544 } 545 546 #define THRESHOLDS_EVENTS_TARGET 128 547 #define SOFTLIMIT_EVENTS_TARGET 1024 548 549 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg, 550 enum mem_cgroup_events_target target) 551 { 552 unsigned long val, next; 553 554 val = __this_cpu_read(memcg->events_percpu->nr_page_events); 555 next = __this_cpu_read(memcg->events_percpu->targets[target]); 556 /* from time_after() in jiffies.h */ 557 if ((long)(next - val) < 0) { 558 switch (target) { 559 case MEM_CGROUP_TARGET_THRESH: 560 next = val + THRESHOLDS_EVENTS_TARGET; 561 break; 562 case MEM_CGROUP_TARGET_SOFTLIMIT: 563 next = val + SOFTLIMIT_EVENTS_TARGET; 564 break; 565 default: 566 break; 567 } 568 __this_cpu_write(memcg->events_percpu->targets[target], next); 569 return true; 570 } 571 return false; 572 } 573 574 /* 575 * Check events in order. 
576 * 577 */ 578 static void memcg1_check_events(struct mem_cgroup *memcg, int nid) 579 { 580 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 581 return; 582 583 /* threshold event is triggered in finer grain than soft limit */ 584 if (unlikely(memcg1_event_ratelimit(memcg, 585 MEM_CGROUP_TARGET_THRESH))) { 586 bool do_softlimit; 587 588 do_softlimit = memcg1_event_ratelimit(memcg, 589 MEM_CGROUP_TARGET_SOFTLIMIT); 590 mem_cgroup_threshold(memcg); 591 if (unlikely(do_softlimit)) 592 memcg1_update_tree(memcg, nid); 593 } 594 } 595 596 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 597 { 598 unsigned long flags; 599 600 local_irq_save(flags); 601 memcg1_charge_statistics(memcg, folio_nr_pages(folio)); 602 memcg1_check_events(memcg, folio_nid(folio)); 603 local_irq_restore(flags); 604 } 605 606 /** 607 * memcg1_swapout - transfer a memsw charge to swap 608 * @folio: folio whose memsw charge to transfer 609 * @entry: swap entry to move the charge to 610 * 611 * Transfer the memsw charge of @folio to @entry. 612 */ 613 void memcg1_swapout(struct folio *folio, swp_entry_t entry) 614 { 615 struct mem_cgroup *memcg, *swap_memcg; 616 unsigned int nr_entries; 617 618 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 619 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 620 621 if (mem_cgroup_disabled()) 622 return; 623 624 if (!do_memsw_account()) 625 return; 626 627 memcg = folio_memcg(folio); 628 629 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 630 if (!memcg) 631 return; 632 633 /* 634 * In case the memcg owning these pages has been offlined and doesn't 635 * have an ID allocated to it anymore, charge the closest online 636 * ancestor for the swap instead and transfer the memory+swap charge. 
637 */ 638 nr_entries = folio_nr_pages(folio); 639 swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); 640 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 641 642 swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); 643 644 folio_unqueue_deferred_split(folio); 645 folio->memcg_data = 0; 646 647 if (!mem_cgroup_is_root(memcg)) 648 page_counter_uncharge(&memcg->memory, nr_entries); 649 650 if (memcg != swap_memcg) { 651 if (!mem_cgroup_is_root(swap_memcg)) 652 page_counter_charge(&swap_memcg->memsw, nr_entries); 653 page_counter_uncharge(&memcg->memsw, nr_entries); 654 } 655 656 /* 657 * Interrupts should be disabled here because the caller holds the 658 * i_pages lock which is taken with interrupts-off. It is 659 * important here to have the interrupts disabled because it is the 660 * only synchronisation we have for updating the per-CPU variables. 661 */ 662 preempt_disable_nested(); 663 VM_WARN_ON_IRQS_ENABLED(); 664 memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); 665 preempt_enable_nested(); 666 memcg1_check_events(memcg, folio_nid(folio)); 667 668 css_put(&memcg->css); 669 } 670 671 /* 672 * memcg1_swapin - uncharge swap slot 673 * @entry: the first swap entry for which the pages are charged 674 * @nr_pages: number of pages which will be uncharged 675 * 676 * Call this function after successfully adding the charged page to swapcache. 677 * 678 * Note: This function assumes the page for which swap slot is being uncharged 679 * is order 0 page. 680 */ 681 void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) 682 { 683 /* 684 * Cgroup1's unified memory+swap counter has been charged with the 685 * new swapcache page, finish the transfer by uncharging the swap 686 * slot. The swap slot would also get uncharged when it dies, but 687 * it can stick around indefinitely and we'd count the page twice 688 * the entire time. 
689 * 690 * Cgroup2 has separate resource counters for memory and swap, 691 * so this is a non-issue here. Memory and swap charge lifetimes 692 * correspond 1:1 to page and swap slot lifetimes: we charge the 693 * page to memory here, and uncharge swap when the slot is freed. 694 */ 695 if (do_memsw_account()) { 696 /* 697 * The swap entry might not get freed for a long time, 698 * let's not wait for it. The page already received a 699 * memory+swap charge, drop the swap entry duplicate. 700 */ 701 mem_cgroup_uncharge_swap(entry, nr_pages); 702 } 703 } 704 705 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 706 unsigned long nr_memory, int nid) 707 { 708 unsigned long flags; 709 710 local_irq_save(flags); 711 count_memcg_events(memcg, PGPGOUT, pgpgout); 712 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); 713 memcg1_check_events(memcg, nid); 714 local_irq_restore(flags); 715 } 716 717 static int compare_thresholds(const void *a, const void *b) 718 { 719 const struct mem_cgroup_threshold *_a = a; 720 const struct mem_cgroup_threshold *_b = b; 721 722 if (_a->threshold > _b->threshold) 723 return 1; 724 725 if (_a->threshold < _b->threshold) 726 return -1; 727 728 return 0; 729 } 730 731 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 732 { 733 struct mem_cgroup_eventfd_list *ev; 734 735 spin_lock(&memcg_oom_lock); 736 737 list_for_each_entry(ev, &memcg->oom_notify, list) 738 eventfd_signal(ev->eventfd); 739 740 spin_unlock(&memcg_oom_lock); 741 return 0; 742 } 743 744 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 745 { 746 struct mem_cgroup *iter; 747 748 for_each_mem_cgroup_tree(iter, memcg) 749 mem_cgroup_oom_notify_cb(iter); 750 } 751 752 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 753 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 754 { 755 struct mem_cgroup_thresholds *thresholds; 756 struct mem_cgroup_threshold_ary *new; 757 unsigned long 
threshold; 758 unsigned long usage; 759 int i, size, ret; 760 761 ret = page_counter_memparse(args, "-1", &threshold); 762 if (ret) 763 return ret; 764 765 mutex_lock(&memcg->thresholds_lock); 766 767 if (type == _MEM) { 768 thresholds = &memcg->thresholds; 769 usage = mem_cgroup_usage(memcg, false); 770 } else if (type == _MEMSWAP) { 771 thresholds = &memcg->memsw_thresholds; 772 usage = mem_cgroup_usage(memcg, true); 773 } else 774 BUG(); 775 776 /* Check if a threshold crossed before adding a new one */ 777 if (thresholds->primary) 778 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 779 780 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 781 782 /* Allocate memory for new array of thresholds */ 783 new = kmalloc_flex(*new, entries, size, GFP_KERNEL_ACCOUNT); 784 if (!new) { 785 ret = -ENOMEM; 786 goto unlock; 787 } 788 new->size = size; 789 790 /* Copy thresholds (if any) to new array */ 791 if (thresholds->primary) 792 memcpy(new->entries, thresholds->primary->entries, 793 flex_array_size(new, entries, size - 1)); 794 795 /* Add new threshold */ 796 new->entries[size - 1].eventfd = eventfd; 797 new->entries[size - 1].threshold = threshold; 798 799 /* Sort thresholds. Registering of new threshold isn't time-critical */ 800 sort(new->entries, size, sizeof(*new->entries), 801 compare_thresholds, NULL); 802 803 /* Find current threshold */ 804 new->current_threshold = -1; 805 for (i = 0; i < size; i++) { 806 if (new->entries[i].threshold <= usage) { 807 /* 808 * new->current_threshold will not be used until 809 * rcu_assign_pointer(), so it's safe to increment 810 * it here. 
811 */ 812 ++new->current_threshold; 813 } else 814 break; 815 } 816 817 /* Free old spare buffer and save old primary buffer as spare */ 818 kfree(thresholds->spare); 819 thresholds->spare = thresholds->primary; 820 821 rcu_assign_pointer(thresholds->primary, new); 822 823 /* To be sure that nobody uses thresholds */ 824 synchronize_rcu(); 825 826 unlock: 827 mutex_unlock(&memcg->thresholds_lock); 828 829 return ret; 830 } 831 832 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 833 struct eventfd_ctx *eventfd, const char *args) 834 { 835 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 836 } 837 838 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 839 struct eventfd_ctx *eventfd, const char *args) 840 { 841 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 842 } 843 844 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 845 struct eventfd_ctx *eventfd, enum res_type type) 846 { 847 struct mem_cgroup_thresholds *thresholds; 848 struct mem_cgroup_threshold_ary *new; 849 unsigned long usage; 850 int i, j, size, entries; 851 852 mutex_lock(&memcg->thresholds_lock); 853 854 if (type == _MEM) { 855 thresholds = &memcg->thresholds; 856 usage = mem_cgroup_usage(memcg, false); 857 } else if (type == _MEMSWAP) { 858 thresholds = &memcg->memsw_thresholds; 859 usage = mem_cgroup_usage(memcg, true); 860 } else 861 BUG(); 862 863 if (!thresholds->primary) 864 goto unlock; 865 866 /* Check if a threshold crossed before removing */ 867 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 868 869 /* Calculate new number of threshold */ 870 size = entries = 0; 871 for (i = 0; i < thresholds->primary->size; i++) { 872 if (thresholds->primary->entries[i].eventfd != eventfd) 873 size++; 874 else 875 entries++; 876 } 877 878 new = thresholds->spare; 879 880 /* If no items related to eventfd have been cleared, nothing to do */ 881 if (!entries) 882 goto unlock; 883 884 /* Set 
thresholds array to NULL if we don't have thresholds */ 885 if (!size) { 886 kfree(new); 887 new = NULL; 888 goto swap_buffers; 889 } 890 891 new->size = size; 892 893 /* Copy thresholds and find current threshold */ 894 new->current_threshold = -1; 895 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 896 if (thresholds->primary->entries[i].eventfd == eventfd) 897 continue; 898 899 new->entries[j] = thresholds->primary->entries[i]; 900 if (new->entries[j].threshold <= usage) { 901 /* 902 * new->current_threshold will not be used 903 * until rcu_assign_pointer(), so it's safe to increment 904 * it here. 905 */ 906 ++new->current_threshold; 907 } 908 j++; 909 } 910 911 swap_buffers: 912 /* Swap primary and spare array */ 913 thresholds->spare = thresholds->primary; 914 915 rcu_assign_pointer(thresholds->primary, new); 916 917 /* To be sure that nobody uses thresholds */ 918 synchronize_rcu(); 919 920 /* If all events are unregistered, free the spare array */ 921 if (!new) { 922 kfree(thresholds->spare); 923 thresholds->spare = NULL; 924 } 925 unlock: 926 mutex_unlock(&memcg->thresholds_lock); 927 } 928 929 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 930 struct eventfd_ctx *eventfd) 931 { 932 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 933 } 934 935 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 936 struct eventfd_ctx *eventfd) 937 { 938 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 939 } 940 941 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 942 struct eventfd_ctx *eventfd, const char *args) 943 { 944 struct mem_cgroup_eventfd_list *event; 945 946 event = kmalloc_obj(*event, GFP_KERNEL_ACCOUNT); 947 if (!event) 948 return -ENOMEM; 949 950 spin_lock(&memcg_oom_lock); 951 952 event->eventfd = eventfd; 953 list_add(&event->list, &memcg->oom_notify); 954 955 /* already in OOM ? 
*/ 956 if (memcg->under_oom) 957 eventfd_signal(eventfd); 958 spin_unlock(&memcg_oom_lock); 959 960 return 0; 961 } 962 963 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 964 struct eventfd_ctx *eventfd) 965 { 966 struct mem_cgroup_eventfd_list *ev, *tmp; 967 968 spin_lock(&memcg_oom_lock); 969 970 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 971 if (ev->eventfd == eventfd) { 972 list_del(&ev->list); 973 kfree(ev); 974 } 975 } 976 977 spin_unlock(&memcg_oom_lock); 978 } 979 980 /* 981 * DO NOT USE IN NEW FILES. 982 * 983 * "cgroup.event_control" implementation. 984 * 985 * This is way over-engineered. It tries to support fully configurable 986 * events for each user. Such level of flexibility is completely 987 * unnecessary especially in the light of the planned unified hierarchy. 988 * 989 * Please deprecate this and replace with something simpler if at all 990 * possible. 991 */ 992 993 /* 994 * Unregister event and free resources. 995 * 996 * Gets called from workqueue. 997 */ 998 static void memcg_event_remove(struct work_struct *work) 999 { 1000 struct mem_cgroup_event *event = 1001 container_of(work, struct mem_cgroup_event, remove); 1002 struct mem_cgroup *memcg = event->memcg; 1003 1004 remove_wait_queue(event->wqh, &event->wait); 1005 1006 event->unregister_event(memcg, event->eventfd); 1007 1008 /* Notify userspace the event is going away. */ 1009 eventfd_signal(event->eventfd); 1010 1011 eventfd_ctx_put(event->eventfd); 1012 kfree(event); 1013 css_put(&memcg->css); 1014 } 1015 1016 /* 1017 * Gets called on EPOLLHUP on eventfd when user closes it. 1018 * 1019 * Called with wqh->lock held and interrupts disabled. 
1020 */ 1021 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, 1022 int sync, void *key) 1023 { 1024 struct mem_cgroup_event *event = 1025 container_of(wait, struct mem_cgroup_event, wait); 1026 struct mem_cgroup *memcg = event->memcg; 1027 __poll_t flags = key_to_poll(key); 1028 1029 if (flags & EPOLLHUP) { 1030 /* 1031 * If the event has been detached at cgroup removal, we 1032 * can simply return knowing the other side will cleanup 1033 * for us. 1034 * 1035 * We can't race against event freeing since the other 1036 * side will require wqh->lock via remove_wait_queue(), 1037 * which we hold. 1038 */ 1039 spin_lock(&memcg->event_list_lock); 1040 if (!list_empty(&event->list)) { 1041 list_del_init(&event->list); 1042 /* 1043 * We are in atomic context, but cgroup_event_remove() 1044 * may sleep, so we have to call it in workqueue. 1045 */ 1046 schedule_work(&event->remove); 1047 } 1048 spin_unlock(&memcg->event_list_lock); 1049 } 1050 1051 return 0; 1052 } 1053 1054 static void memcg_event_ptable_queue_proc(struct file *file, 1055 wait_queue_head_t *wqh, poll_table *pt) 1056 { 1057 struct mem_cgroup_event *event = 1058 container_of(pt, struct mem_cgroup_event, pt); 1059 1060 event->wqh = wqh; 1061 add_wait_queue(wqh, &event->wait); 1062 } 1063 1064 /* 1065 * DO NOT USE IN NEW FILES. 1066 * 1067 * Parse input and register new cgroup event handler. 1068 * 1069 * Input must be in format '<event_fd> <control_fd> <args>'. 1070 * Interpretation of args is defined by control file implementation. 
1071 */ 1072 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 1073 char *buf, size_t nbytes, loff_t off) 1074 { 1075 struct cgroup_subsys_state *css = of_css(of); 1076 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1077 struct mem_cgroup_event *event; 1078 struct cgroup_subsys_state *cfile_css; 1079 unsigned int efd, cfd; 1080 struct dentry *cdentry; 1081 const char *name; 1082 char *endp; 1083 int ret; 1084 1085 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 1086 return -EOPNOTSUPP; 1087 1088 buf = strstrip(buf); 1089 1090 efd = simple_strtoul(buf, &endp, 10); 1091 if (*endp != ' ') 1092 return -EINVAL; 1093 buf = endp + 1; 1094 1095 cfd = simple_strtoul(buf, &endp, 10); 1096 if (*endp == '\0') 1097 buf = endp; 1098 else if (*endp == ' ') 1099 buf = endp + 1; 1100 else 1101 return -EINVAL; 1102 1103 CLASS(fd, efile)(efd); 1104 if (fd_empty(efile)) 1105 return -EBADF; 1106 1107 CLASS(fd, cfile)(cfd); 1108 1109 event = kzalloc_obj(*event, GFP_KERNEL_ACCOUNT); 1110 if (!event) 1111 return -ENOMEM; 1112 1113 event->memcg = memcg; 1114 INIT_LIST_HEAD(&event->list); 1115 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 1116 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 1117 INIT_WORK(&event->remove, memcg_event_remove); 1118 1119 event->eventfd = eventfd_ctx_fileget(fd_file(efile)); 1120 if (IS_ERR(event->eventfd)) { 1121 ret = PTR_ERR(event->eventfd); 1122 goto out_kfree; 1123 } 1124 1125 if (fd_empty(cfile)) { 1126 ret = -EBADF; 1127 goto out_put_eventfd; 1128 } 1129 1130 /* the process need read permission on control file */ 1131 /* AV: shouldn't we check that it's been opened for read instead? */ 1132 ret = file_permission(fd_file(cfile), MAY_READ); 1133 if (ret < 0) 1134 goto out_put_eventfd; 1135 1136 /* 1137 * The control file must be a regular cgroup1 file. As a regular cgroup 1138 * file can't be renamed, it's safe to access its name afterwards. 
1139 */ 1140 cdentry = fd_file(cfile)->f_path.dentry; 1141 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 1142 ret = -EINVAL; 1143 goto out_put_eventfd; 1144 } 1145 1146 /* 1147 * Determine the event callbacks and set them in @event. This used 1148 * to be done via struct cftype but cgroup core no longer knows 1149 * about these events. The following is crude but the whole thing 1150 * is for compatibility anyway. 1151 * 1152 * DO NOT ADD NEW FILES. 1153 */ 1154 name = cdentry->d_name.name; 1155 1156 if (!strcmp(name, "memory.usage_in_bytes")) { 1157 event->register_event = mem_cgroup_usage_register_event; 1158 event->unregister_event = mem_cgroup_usage_unregister_event; 1159 } else if (!strcmp(name, "memory.oom_control")) { 1160 pr_warn_once("oom_control is deprecated and will be removed. " 1161 "Please report your usecase to linux-mm-@kvack.org" 1162 " if you depend on this functionality.\n"); 1163 event->register_event = mem_cgroup_oom_register_event; 1164 event->unregister_event = mem_cgroup_oom_unregister_event; 1165 } else if (!strcmp(name, "memory.pressure_level")) { 1166 pr_warn_once("pressure_level is deprecated and will be removed. " 1167 "Please report your usecase to linux-mm-@kvack.org " 1168 "if you depend on this functionality.\n"); 1169 event->register_event = vmpressure_register_event; 1170 event->unregister_event = vmpressure_unregister_event; 1171 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 1172 event->register_event = memsw_cgroup_usage_register_event; 1173 event->unregister_event = memsw_cgroup_usage_unregister_event; 1174 } else { 1175 ret = -EINVAL; 1176 goto out_put_eventfd; 1177 } 1178 1179 /* 1180 * Verify @cfile should belong to @css. Also, remaining events are 1181 * automatically removed on cgroup destruction but the removal is 1182 * asynchronous, so take an extra ref on @css. 
1183 */ 1184 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 1185 &memory_cgrp_subsys); 1186 ret = -EINVAL; 1187 if (IS_ERR(cfile_css)) 1188 goto out_put_eventfd; 1189 if (cfile_css != css) 1190 goto out_put_css; 1191 1192 ret = event->register_event(memcg, event->eventfd, buf); 1193 if (ret) 1194 goto out_put_css; 1195 1196 vfs_poll(fd_file(efile), &event->pt); 1197 1198 spin_lock_irq(&memcg->event_list_lock); 1199 list_add(&event->list, &memcg->event_list); 1200 spin_unlock_irq(&memcg->event_list_lock); 1201 return nbytes; 1202 1203 out_put_css: 1204 css_put(cfile_css); 1205 out_put_eventfd: 1206 eventfd_ctx_put(event->eventfd); 1207 out_kfree: 1208 kfree(event); 1209 return ret; 1210 } 1211 1212 void memcg1_memcg_init(struct mem_cgroup *memcg) 1213 { 1214 INIT_LIST_HEAD(&memcg->oom_notify); 1215 mutex_init(&memcg->thresholds_lock); 1216 INIT_LIST_HEAD(&memcg->event_list); 1217 spin_lock_init(&memcg->event_list_lock); 1218 } 1219 1220 void memcg1_css_offline(struct mem_cgroup *memcg) 1221 { 1222 struct mem_cgroup_event *event, *tmp; 1223 1224 /* 1225 * Unregister events and notify userspace. 1226 * Notify userspace about cgroup removing only after rmdir of cgroup 1227 * directory to avoid race between userspace and kernelspace. 1228 */ 1229 spin_lock_irq(&memcg->event_list_lock); 1230 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 1231 list_del_init(&event->list); 1232 schedule_work(&event->remove); 1233 } 1234 spin_unlock_irq(&memcg->event_list_lock); 1235 } 1236 1237 /* 1238 * Check OOM-Killer is already running under our hierarchy. 1239 * If someone is running, return false. 1240 */ 1241 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1242 { 1243 struct mem_cgroup *iter, *failed = NULL; 1244 1245 spin_lock(&memcg_oom_lock); 1246 1247 for_each_mem_cgroup_tree(iter, memcg) { 1248 if (iter->oom_lock) { 1249 /* 1250 * this subtree of our hierarchy is already locked 1251 * so we cannot give a lock. 
1252 */ 1253 failed = iter; 1254 mem_cgroup_iter_break(memcg, iter); 1255 break; 1256 } 1257 iter->oom_lock = true; 1258 } 1259 1260 if (failed) { 1261 /* 1262 * OK, we failed to lock the whole subtree so we have 1263 * to clean up what we set up to the failing subtree 1264 */ 1265 for_each_mem_cgroup_tree(iter, memcg) { 1266 if (iter == failed) { 1267 mem_cgroup_iter_break(memcg, iter); 1268 break; 1269 } 1270 iter->oom_lock = false; 1271 } 1272 } else 1273 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1274 1275 spin_unlock(&memcg_oom_lock); 1276 1277 return !failed; 1278 } 1279 1280 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1281 { 1282 struct mem_cgroup *iter; 1283 1284 spin_lock(&memcg_oom_lock); 1285 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1286 for_each_mem_cgroup_tree(iter, memcg) 1287 iter->oom_lock = false; 1288 spin_unlock(&memcg_oom_lock); 1289 } 1290 1291 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1292 { 1293 struct mem_cgroup *iter; 1294 1295 spin_lock(&memcg_oom_lock); 1296 for_each_mem_cgroup_tree(iter, memcg) 1297 iter->under_oom++; 1298 spin_unlock(&memcg_oom_lock); 1299 } 1300 1301 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1302 { 1303 struct mem_cgroup *iter; 1304 1305 /* 1306 * Be careful about under_oom underflows because a child memcg 1307 * could have been added after mem_cgroup_mark_under_oom. 
1308 */ 1309 spin_lock(&memcg_oom_lock); 1310 for_each_mem_cgroup_tree(iter, memcg) 1311 if (iter->under_oom > 0) 1312 iter->under_oom--; 1313 spin_unlock(&memcg_oom_lock); 1314 } 1315 1316 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1317 1318 struct oom_wait_info { 1319 struct mem_cgroup *memcg; 1320 wait_queue_entry_t wait; 1321 }; 1322 1323 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1324 unsigned int mode, int sync, void *arg) 1325 { 1326 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1327 struct mem_cgroup *oom_wait_memcg; 1328 struct oom_wait_info *oom_wait_info; 1329 1330 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1331 oom_wait_memcg = oom_wait_info->memcg; 1332 1333 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1334 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1335 return 0; 1336 return autoremove_wake_function(wait, mode, sync, arg); 1337 } 1338 1339 void memcg1_oom_recover(struct mem_cgroup *memcg) 1340 { 1341 /* 1342 * For the following lockless ->under_oom test, the only required 1343 * guarantee is that it must see the state asserted by an OOM when 1344 * this function is called as a result of userland actions 1345 * triggered by the notification of the OOM. This is trivially 1346 * achieved by invoking mem_cgroup_mark_under_oom() before 1347 * triggering notification. 1348 */ 1349 if (memcg && memcg->under_oom) 1350 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1351 } 1352 1353 /** 1354 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1355 * @handle: actually kill/wait or just clean up the OOM state 1356 * 1357 * This has to be called at the end of a page fault if the memcg OOM 1358 * handler was enabled. 1359 * 1360 * Memcg supports userspace OOM handling where failed allocations must 1361 * sleep on a waitqueue until the userspace task resolves the 1362 * situation. 
* Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	/* Queue ourselves before marking/notifying, then sleep below. */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	/* Only the task that won the OOM lock sends the notification. */
	if (locked)
		mem_cgroup_oom_notify(memcg);

	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	/* Drop the OOM state and the css reference recorded in the task. */
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}


/*
 * Charge-path OOM entry point for cgroup1.
 *
 * Returns false when the OOM killer is disabled for @memcg; in that case,
 * if the task is in a user fault, @memcg is recorded in
 * current->memcg_in_oom with a css reference held. Otherwise marks the
 * hierarchy under OOM, tries to take the OOM lock (result stored in
 * *@locked), notifies if the lock was taken, unmarks, and returns true.
 */
bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, in-kernel OOM killer allows for an async victim
	 * memory reclaim (oom_reaper) and that means that we are not solely
	 * relying on the oom victim to make a forward progress and we can
	 * invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	*locked = mem_cgroup_oom_trylock(memcg);

	if (*locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	return true;
}

/* Release the OOM lock on @memcg's hierarchy if @locked says it was taken. */
void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
	if (locked)
		mem_cgroup_oom_unlock(memcg);
}

static DEFINE_MUTEX(memcg_max_mutex);

/*
 * Set the memory (@memsw == false) or memory+swap (@memsw == true) limit
 * of @memcg to @max pages.
 *
 * Retries until page_counter_set_max() succeeds, draining the charge
 * stock once and then reclaiming between attempts. Returns 0 on success,
 * -EINTR if a signal is pending, -EINVAL if the new value would break
 * memory.max <= memsw.max, or -EBUSY if reclaim frees nothing.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		/* Remember that the limit grew so OOM waiters get woken below. */
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		/* First failure: drain the stock once and retry immediately. */
		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		/* Later failures: reclaim; give up when nothing is freed. */
		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	if (!ret && enlarge)
		memcg1_oom_recover(memcg);

	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 *
 * Returns 0, or -EINTR if a signal is pending while pages remain.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		if (signal_pending(current))
			return -EINTR;

		/* Only a reclaim pass that frees nothing costs a retry. */
		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
						  MEMCG_RECLAIM_MAY_SWAP, NULL))
			nr_retries--;
	}

	return 0;
}

/*
 * Write handler for "force_empty": rejects the root memcg with -EINVAL,
 * otherwise returns the error from mem_cgroup_force_empty() or @nbytes.
 */
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

/* Read handler for "use_hierarchy": always reports 1. */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}

/*
 * Write handler for "use_hierarchy": writing 1 is accepted as a no-op,
 * any other value warns once and fails with -EINVAL.
 */
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	if (val == 1)
		return 0;

	pr_warn_once("Non-hierarchical mode is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	return -EINVAL;
}

/*
 * Shared u64 read handler for the counter files.
 *
 * cft->private encodes the counter (MEMFILE_TYPE) and the attribute
 * (MEMFILE_ATTR). Page-based values are multiplied by PAGE_SIZE;
 * RES_FAILCNT is returned as is.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		/* memory and memsw usage go through mem_cgroup_usage(). */
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}

/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
*/
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}

/*
 * Set the tcpmem limit of @memcg to @max pages under memcg_max_mutex.
 *
 * On the first successful update for a memcg that is not yet
 * tcpmem_active, the sockets static key is incremented and the memcg is
 * marked active. Returns 0 or the error from page_counter_set_max().
 */
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int ret;

	mutex_lock(&memcg_max_mutex);

	ret = page_counter_set_max(&memcg->tcpmem, max);
	if (ret)
		goto out;

	if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value change, the code to process it is not
		 * patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}
out:
	mutex_unlock(&memcg_max_mutex);
	return ret;
}

/*
 * The user of this function is...
 * RES_LIMIT.
1660 */ 1661 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 1662 char *buf, size_t nbytes, loff_t off) 1663 { 1664 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1665 unsigned long nr_pages; 1666 int ret; 1667 1668 buf = strstrip(buf); 1669 ret = page_counter_memparse(buf, "-1", &nr_pages); 1670 if (ret) 1671 return ret; 1672 1673 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1674 case RES_LIMIT: 1675 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 1676 ret = -EINVAL; 1677 break; 1678 } 1679 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1680 case _MEM: 1681 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 1682 break; 1683 case _MEMSWAP: 1684 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 1685 break; 1686 case _KMEM: 1687 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 1688 "Writing any value to this file has no effect. " 1689 "Please report your usecase to linux-mm@kvack.org if you " 1690 "depend on this functionality.\n"); 1691 ret = 0; 1692 break; 1693 case _TCP: 1694 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " 1695 "Please report your usecase to linux-mm@kvack.org if you " 1696 "depend on this functionality.\n"); 1697 ret = memcg_update_tcp_max(memcg, nr_pages); 1698 break; 1699 } 1700 break; 1701 case RES_SOFT_LIMIT: 1702 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1703 ret = -EOPNOTSUPP; 1704 } else { 1705 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. 
" 1706 "Please report your usecase to linux-mm@kvack.org if you " 1707 "depend on this functionality.\n"); 1708 WRITE_ONCE(memcg->soft_limit, nr_pages); 1709 ret = 0; 1710 } 1711 break; 1712 } 1713 return ret ?: nbytes; 1714 } 1715 1716 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 1717 size_t nbytes, loff_t off) 1718 { 1719 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1720 struct page_counter *counter; 1721 1722 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1723 case _MEM: 1724 counter = &memcg->memory; 1725 break; 1726 case _MEMSWAP: 1727 counter = &memcg->memsw; 1728 break; 1729 case _KMEM: 1730 counter = &memcg->kmem; 1731 break; 1732 case _TCP: 1733 counter = &memcg->tcpmem; 1734 break; 1735 default: 1736 BUG(); 1737 } 1738 1739 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1740 case RES_MAX_USAGE: 1741 page_counter_reset_watermark(counter); 1742 break; 1743 case RES_FAILCNT: 1744 counter->failcnt = 0; 1745 break; 1746 default: 1747 BUG(); 1748 } 1749 1750 return nbytes; 1751 } 1752 1753 #ifdef CONFIG_NUMA 1754 1755 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 1756 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 1757 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 1758 1759 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 1760 int nid, unsigned int lru_mask, bool tree) 1761 { 1762 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 1763 unsigned long nr = 0; 1764 enum lru_list lru; 1765 1766 VM_BUG_ON((unsigned int)nid >= nr_node_ids); 1767 1768 for_each_lru(lru) { 1769 if (!(BIT(lru) & lru_mask)) 1770 continue; 1771 if (tree) 1772 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 1773 else 1774 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 1775 } 1776 return nr; 1777 } 1778 1779 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 1780 unsigned int lru_mask, 1781 bool tree) 1782 { 1783 unsigned long nr = 0; 
1784 enum lru_list lru; 1785 1786 for_each_lru(lru) { 1787 if (!(BIT(lru) & lru_mask)) 1788 continue; 1789 if (tree) 1790 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 1791 else 1792 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 1793 } 1794 return nr; 1795 } 1796 1797 static int memcg_numa_stat_show(struct seq_file *m, void *v) 1798 { 1799 struct numa_stat { 1800 const char *name; 1801 unsigned int lru_mask; 1802 }; 1803 1804 static const struct numa_stat stats[] = { 1805 { "total", LRU_ALL }, 1806 { "file", LRU_ALL_FILE }, 1807 { "anon", LRU_ALL_ANON }, 1808 { "unevictable", BIT(LRU_UNEVICTABLE) }, 1809 }; 1810 const struct numa_stat *stat; 1811 int nid; 1812 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1813 1814 mem_cgroup_flush_stats(memcg); 1815 1816 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1817 seq_printf(m, "%s=%lu", stat->name, 1818 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1819 false)); 1820 for_each_node_state(nid, N_MEMORY) 1821 seq_printf(m, " N%d=%lu", nid, 1822 mem_cgroup_node_nr_lru_pages(memcg, nid, 1823 stat->lru_mask, false)); 1824 seq_putc(m, '\n'); 1825 } 1826 1827 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1828 1829 seq_printf(m, "hierarchical_%s=%lu", stat->name, 1830 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1831 true)); 1832 for_each_node_state(nid, N_MEMORY) 1833 seq_printf(m, " N%d=%lu", nid, 1834 mem_cgroup_node_nr_lru_pages(memcg, nid, 1835 stat->lru_mask, true)); 1836 seq_putc(m, '\n'); 1837 } 1838 1839 return 0; 1840 } 1841 #endif /* CONFIG_NUMA */ 1842 1843 static const unsigned int memcg1_stats[] = { 1844 NR_FILE_PAGES, 1845 NR_ANON_MAPPED, 1846 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1847 NR_ANON_THPS, 1848 #endif 1849 NR_SHMEM, 1850 NR_FILE_MAPPED, 1851 NR_FILE_DIRTY, 1852 NR_WRITEBACK, 1853 WORKINGSET_REFAULT_ANON, 1854 WORKINGSET_REFAULT_FILE, 1855 #ifdef CONFIG_SWAP 1856 MEMCG_SWAP, 1857 NR_SWAPCACHE, 1858 #endif 1859 }; 1860 1861 static const char *const memcg1_stat_names[] = { 
1862 "cache", 1863 "rss", 1864 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1865 "rss_huge", 1866 #endif 1867 "shmem", 1868 "mapped_file", 1869 "dirty", 1870 "writeback", 1871 "workingset_refault_anon", 1872 "workingset_refault_file", 1873 #ifdef CONFIG_SWAP 1874 "swap", 1875 "swapcached", 1876 #endif 1877 }; 1878 1879 /* Universal VM events cgroup1 shows, original sort order */ 1880 static const unsigned int memcg1_events[] = { 1881 PGPGIN, 1882 PGPGOUT, 1883 PGFAULT, 1884 PGMAJFAULT, 1885 }; 1886 1887 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1888 { 1889 unsigned long memory, memsw; 1890 struct mem_cgroup *mi; 1891 unsigned int i; 1892 1893 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 1894 1895 mem_cgroup_flush_stats(memcg); 1896 1897 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1898 unsigned long nr; 1899 1900 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 1901 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 1902 } 1903 1904 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1905 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 1906 memcg_events_local(memcg, memcg1_events[i])); 1907 1908 for (i = 0; i < NR_LRU_LISTS; i++) 1909 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 1910 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 1911 PAGE_SIZE); 1912 1913 /* Hierarchical information */ 1914 memory = memsw = PAGE_COUNTER_MAX; 1915 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 1916 memory = min(memory, READ_ONCE(mi->memory.max)); 1917 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 1918 } 1919 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 1920 (u64)memory * PAGE_SIZE); 1921 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 1922 (u64)memsw * PAGE_SIZE); 1923 1924 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1925 unsigned long nr; 1926 1927 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 1928 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 1929 
(u64)nr); 1930 } 1931 1932 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1933 seq_buf_printf(s, "total_%s %llu\n", 1934 vm_event_name(memcg1_events[i]), 1935 (u64)memcg_events(memcg, memcg1_events[i])); 1936 1937 for (i = 0; i < NR_LRU_LISTS; i++) 1938 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 1939 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 1940 PAGE_SIZE); 1941 1942 #ifdef CONFIG_DEBUG_VM 1943 { 1944 pg_data_t *pgdat; 1945 struct mem_cgroup_per_node *mz; 1946 unsigned long anon_cost = 0; 1947 unsigned long file_cost = 0; 1948 1949 for_each_online_pgdat(pgdat) { 1950 mz = memcg->nodeinfo[pgdat->node_id]; 1951 1952 anon_cost += mz->lruvec.anon_cost; 1953 file_cost += mz->lruvec.file_cost; 1954 } 1955 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 1956 seq_buf_printf(s, "file_cost %lu\n", file_cost); 1957 } 1958 #endif 1959 } 1960 1961 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 1962 struct cftype *cft) 1963 { 1964 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1965 1966 return mem_cgroup_swappiness(memcg); 1967 } 1968 1969 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 1970 struct cftype *cft, u64 val) 1971 { 1972 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1973 1974 if (val > MAX_SWAPPINESS) 1975 return -EINVAL; 1976 1977 if (!mem_cgroup_is_root(memcg)) { 1978 pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" 1979 "See memory.reclaim or memory.swap.max there\n "); 1980 WRITE_ONCE(memcg->swappiness, val); 1981 } else 1982 WRITE_ONCE(vm_swappiness, val); 1983 1984 return 0; 1985 } 1986 1987 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 1988 { 1989 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 1990 1991 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 1992 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 1993 seq_printf(sf, "oom_kill %lu\n", 1994 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 1995 return 0; 1996 } 1997 1998 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 1999 struct cftype *cft, u64 val) 2000 { 2001 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2002 2003 pr_warn_once("oom_control is deprecated and will be removed. " 2004 "Please report your usecase to linux-mm-@kvack.org if you " 2005 "depend on this functionality.\n"); 2006 2007 /* cannot set to root cgroup and only 0 and 1 are allowed */ 2008 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 2009 return -EINVAL; 2010 2011 WRITE_ONCE(memcg->oom_kill_disable, val); 2012 if (!val) 2013 memcg1_oom_recover(memcg); 2014 2015 return 0; 2016 } 2017 2018 #ifdef CONFIG_SLUB_DEBUG 2019 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 2020 { 2021 /* 2022 * Deprecated. 2023 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 
2024 */ 2025 return 0; 2026 } 2027 #endif 2028 2029 struct cftype mem_cgroup_legacy_files[] = { 2030 { 2031 .name = "usage_in_bytes", 2032 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2033 .read_u64 = mem_cgroup_read_u64, 2034 }, 2035 { 2036 .name = "max_usage_in_bytes", 2037 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2038 .write = mem_cgroup_reset, 2039 .read_u64 = mem_cgroup_read_u64, 2040 }, 2041 { 2042 .name = "limit_in_bytes", 2043 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2044 .write = mem_cgroup_write, 2045 .read_u64 = mem_cgroup_read_u64, 2046 }, 2047 { 2048 .name = "soft_limit_in_bytes", 2049 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2050 .write = mem_cgroup_write, 2051 .read_u64 = mem_cgroup_read_u64, 2052 }, 2053 { 2054 .name = "failcnt", 2055 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2056 .write = mem_cgroup_reset, 2057 .read_u64 = mem_cgroup_read_u64, 2058 }, 2059 { 2060 .name = "stat", 2061 .seq_show = memory_stat_show, 2062 }, 2063 { 2064 .name = "force_empty", 2065 .write = mem_cgroup_force_empty_write, 2066 }, 2067 { 2068 .name = "use_hierarchy", 2069 .write_u64 = mem_cgroup_hierarchy_write, 2070 .read_u64 = mem_cgroup_hierarchy_read, 2071 }, 2072 { 2073 .name = "cgroup.event_control", /* XXX: for compat */ 2074 .write = memcg_write_event_control, 2075 .flags = CFTYPE_NO_PREFIX, 2076 }, 2077 { 2078 .name = "swappiness", 2079 .read_u64 = mem_cgroup_swappiness_read, 2080 .write_u64 = mem_cgroup_swappiness_write, 2081 }, 2082 { 2083 .name = "move_charge_at_immigrate", 2084 .read_u64 = mem_cgroup_move_charge_read, 2085 .write_u64 = mem_cgroup_move_charge_write, 2086 }, 2087 { 2088 .name = "oom_control", 2089 .seq_show = mem_cgroup_oom_control_read, 2090 .write_u64 = mem_cgroup_oom_control_write, 2091 }, 2092 { 2093 .name = "pressure_level", 2094 .seq_show = mem_cgroup_dummy_seq_show, 2095 }, 2096 #ifdef CONFIG_NUMA 2097 { 2098 .name = "numa_stat", 2099 .seq_show = memcg_numa_stat_show, 2100 }, 2101 #endif 2102 { 2103 .name = 
"kmem.limit_in_bytes", 2104 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 2105 .write = mem_cgroup_write, 2106 .read_u64 = mem_cgroup_read_u64, 2107 }, 2108 { 2109 .name = "kmem.usage_in_bytes", 2110 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 2111 .read_u64 = mem_cgroup_read_u64, 2112 }, 2113 { 2114 .name = "kmem.failcnt", 2115 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 2116 .write = mem_cgroup_reset, 2117 .read_u64 = mem_cgroup_read_u64, 2118 }, 2119 { 2120 .name = "kmem.max_usage_in_bytes", 2121 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 2122 .write = mem_cgroup_reset, 2123 .read_u64 = mem_cgroup_read_u64, 2124 }, 2125 #ifdef CONFIG_SLUB_DEBUG 2126 { 2127 .name = "kmem.slabinfo", 2128 .seq_show = mem_cgroup_slab_show, 2129 }, 2130 #endif 2131 { 2132 .name = "kmem.tcp.limit_in_bytes", 2133 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 2134 .write = mem_cgroup_write, 2135 .read_u64 = mem_cgroup_read_u64, 2136 }, 2137 { 2138 .name = "kmem.tcp.usage_in_bytes", 2139 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 2140 .read_u64 = mem_cgroup_read_u64, 2141 }, 2142 { 2143 .name = "kmem.tcp.failcnt", 2144 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 2145 .write = mem_cgroup_reset, 2146 .read_u64 = mem_cgroup_read_u64, 2147 }, 2148 { 2149 .name = "kmem.tcp.max_usage_in_bytes", 2150 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 2151 .write = mem_cgroup_reset, 2152 .read_u64 = mem_cgroup_read_u64, 2153 }, 2154 { }, /* terminate */ 2155 }; 2156 2157 struct cftype memsw_files[] = { 2158 { 2159 .name = "memsw.usage_in_bytes", 2160 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2161 .read_u64 = mem_cgroup_read_u64, 2162 }, 2163 { 2164 .name = "memsw.max_usage_in_bytes", 2165 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2166 .write = mem_cgroup_reset, 2167 .read_u64 = mem_cgroup_read_u64, 2168 }, 2169 { 2170 .name = "memsw.limit_in_bytes", 2171 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2172 .write = mem_cgroup_write, 2173 .read_u64 = 
mem_cgroup_read_u64, 2174 }, 2175 { 2176 .name = "memsw.failcnt", 2177 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2178 .write = mem_cgroup_reset, 2179 .read_u64 = mem_cgroup_read_u64, 2180 }, 2181 { }, /* terminate */ 2182 }; 2183 2184 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2185 { 2186 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 2187 if (nr_pages > 0) 2188 page_counter_charge(&memcg->kmem, nr_pages); 2189 else 2190 page_counter_uncharge(&memcg->kmem, -nr_pages); 2191 } 2192 } 2193 2194 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 2195 gfp_t gfp_mask) 2196 { 2197 struct page_counter *fail; 2198 2199 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 2200 memcg->tcpmem_pressure = 0; 2201 return true; 2202 } 2203 memcg->tcpmem_pressure = 1; 2204 if (gfp_mask & __GFP_NOFAIL) { 2205 page_counter_charge(&memcg->tcpmem, nr_pages); 2206 return true; 2207 } 2208 return false; 2209 } 2210 2211 bool memcg1_alloc_events(struct mem_cgroup *memcg) 2212 { 2213 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, 2214 GFP_KERNEL_ACCOUNT); 2215 return !!memcg->events_percpu; 2216 } 2217 2218 void memcg1_free_events(struct mem_cgroup *memcg) 2219 { 2220 free_percpu(memcg->events_percpu); 2221 } 2222 2223 static int __init memcg1_init(void) 2224 { 2225 int node; 2226 2227 for_each_node(node) { 2228 struct mem_cgroup_tree_per_node *rtpn; 2229 2230 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 2231 2232 rtpn->rb_root = RB_ROOT; 2233 rtpn->rb_rightmost = NULL; 2234 spin_lock_init(&rtpn->lock); 2235 soft_limit_tree.rb_tree_per_node[node] = rtpn; 2236 } 2237 2238 return 0; 2239 } 2240 subsys_initcall(memcg1_init); 2241