// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/seq_buf.h>

#include "internal.h"
#include "swap.h"
#include "swap_table.h"
#include "memcontrol-v1.h"

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

/*
 * One soft-limit tree per node: entries are ordered by usage_in_excess and
 * rb_rightmost caches the entry with the largest excess. @lock protects the
 * tree and the cached pointer.
 */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace want to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

/*
 * Pack/unpack a (type, attribute) pair into one integer: the type lives in
 * bits 16..31 and the attribute in bits 0..15.
 */
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Link @mz into @mctz's RB-tree, keyed by @new_usage_in_excess.
 *
 * Does nothing if @mz is already on the tree. Otherwise the new excess is
 * recorded in @mz, and @mz is only inserted when that excess is non-zero.
 * Entries with an equal key go to the right, so the rightmost node is always
 * a largest-excess entry; rb_rightmost is updated when @mz becomes it.
 *
 * The callers in this file invoke this with mctz->lock held.
 */
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			/* Went left at least once: cannot be the rightmost */
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

/*
 * Unlink @mz from @mctz's RB-tree if it is on it, moving the cached
 * rightmost pointer to the predecessor when @mz was the rightmost node.
 *
 * The callers in this file invoke this with mctz->lock held.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

/* Locked variant of __mem_cgroup_remove_exceeded(); takes mctz->lock. */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

/*
 * Return by how many pages @memcg's memory counter currently exceeds its
 * soft limit, or 0 if it does not exceed it.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

/*
 * Re-sort @memcg and all of its ancestors in node @nid's soft-limit tree
 * according to their current excess.
 *
 * When lru_gen is enabled the tree is not used: lru_gen_soft_reclaim() is
 * called instead if @memcg is over its soft limit.
 */
static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

/*
 * Remove @memcg's per-node entries from the soft-limit tree of every node
 * that has one.
 */
void memcg1_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

/*
 * Detach and return the entry with the largest excess from @mctz.
 *
 * Entries whose memcg is no longer over its soft limit, or whose css
 * reference cannot be taken, are dropped from the tree and the next
 * rightmost entry is tried. Returns NULL when the tree runs empty;
 * otherwise the returned entry's memcg css holds a reference obtained by
 * css_tryget() which the caller must put.
 *
 * The callers in this file invoke this with mctz->lock held.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

/* Locked variant of __mem_cgroup_largest_soft_limit_node(). */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Walk the hierarchy below @root_memcg with mem_cgroup_iter() and shrink
 * each victim on @pgdat until @root_memcg is no longer over its soft limit
 * or one of the loop limits below is hit.
 *
 * The per-victim scan counts are added to *@total_scanned. Returns the sum
 * of the values returned by mem_cgroup_shrink_node().
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			/* A NULL victim marks the end of one full walk */
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is neither so large that we
				 * reclaim too much, nor so small that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

/*
 * Soft-limit reclaim entry point for node @pgdat.
 *
 * Repeatedly picks the memcg with the largest soft-limit excess on this
 * node, reclaims from its hierarchy and re-inserts it into the tree at its
 * new position. Stops as soon as something was reclaimed, when no candidate
 * is left, or after MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless
 * rounds.
 *
 * Returns 0 without doing anything when lru_gen is enabled, when @order is
 * greater than 0, or when the node has no (or an empty) soft-limit tree.
 * Otherwise returns the accumulated result of mem_cgroup_soft_reclaim();
 * scan counts are added to *@total_scanned.
 */
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, especially if mem_cgroups continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		/* Drop the reference taken when mz was picked from the tree */
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A candidate picked for the next round but never used still holds a ref */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/* Read handler for the charge-moving control: always reports 0. */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
				struct cftype *cft)
{
	return 0;
}

#ifdef CONFIG_MMU
/*
 * Write handler for the charge-moving control: warns once that the feature
 * is deprecated and accepts only the value 0 (-EINVAL otherwise).
 */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
				 struct cftype *cft, u64 val)
{
	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val != 0)
		return -EINVAL;
	return 0;
}
#else
/* Without CONFIG_MMU every write is rejected with -ENOSYS. */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
				 struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

/*
 * Return @memcg's usage in pages: the memory counter, or the memsw counter
 * when @swap is true. The root memcg is approximated from global state.
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val;

	if (mem_cgroup_is_root(memcg)) {
		/*
		 * Approximate root's usage from global state. This isn't
		 * perfect, but the root usage was always an approximation.
		 */
		val = global_node_page_state(NR_FILE_PAGES) +
			global_node_page_state(NR_ANON_MAPPED);
		if (swap)
			val += total_swap_pages - get_nr_swap_pages();
	} else {
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}

/*
 * Signal the eventfd of every threshold of @memcg that usage has crossed,
 * in either direction, since the last call, then update current_threshold.
 * @swap selects the memsw thresholds instead of the memory thresholds.
 *
 * The primary threshold array is read under rcu_read_lock().
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

/*
 * Run the threshold check for @memcg and each of its ancestors; the memsw
 * thresholds are checked as well when do_memsw_account() is true.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

/* Cgroup1: threshold notifications & softlimit tree updates */

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_NTARGETS,
};

struct memcg1_events_percpu {
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/*
 * Account one charge (@nr_pages > 0) or uncharge (@nr_pages <= 0): count a
 * single PGPGIN or PGPGOUT event and add the absolute page count to this
 * CPU's nr_page_events counter.
 */
static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		count_memcg_events(memcg, PGPGIN, 1);
	else {
		count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Return true when this CPU's page-event counter has passed the stored
 * target for @target, and in that case advance the target by the matching
 * *_EVENTS_TARGET interval. Returns false otherwise.
 */
static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
576 * 577 */ 578 static void memcg1_check_events(struct mem_cgroup *memcg, int nid) 579 { 580 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 581 return; 582 583 /* threshold event is triggered in finer grain than soft limit */ 584 if (unlikely(memcg1_event_ratelimit(memcg, 585 MEM_CGROUP_TARGET_THRESH))) { 586 bool do_softlimit; 587 588 do_softlimit = memcg1_event_ratelimit(memcg, 589 MEM_CGROUP_TARGET_SOFTLIMIT); 590 mem_cgroup_threshold(memcg); 591 if (unlikely(do_softlimit)) 592 memcg1_update_tree(memcg, nid); 593 } 594 } 595 596 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 597 { 598 unsigned long flags; 599 600 local_irq_save(flags); 601 memcg1_charge_statistics(memcg, folio_nr_pages(folio)); 602 memcg1_check_events(memcg, folio_nid(folio)); 603 local_irq_restore(flags); 604 } 605 606 #ifdef CONFIG_SWAP 607 /** 608 * __memcg1_swapout - transfer a memsw charge to swap 609 * @folio: folio whose memsw charge to transfer 610 * @ci: the locked swap cluster holding the swap entries 611 * 612 * Transfer the memsw charge of @folio to the swap entry stored in 613 * folio->swap. 614 * 615 * Context: folio must be isolated, unmapped, locked and is just about to 616 * be freed, and caller must disable IRQs and hold the swap cluster lock. 
617 */ 618 void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci) 619 { 620 struct mem_cgroup *memcg, *swap_memcg; 621 struct obj_cgroup *objcg; 622 unsigned int nr_entries; 623 624 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); 625 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 626 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 627 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 628 629 if (mem_cgroup_disabled()) 630 return; 631 632 if (!do_memsw_account()) 633 return; 634 635 objcg = folio_objcg(folio); 636 VM_WARN_ON_ONCE_FOLIO(!objcg, folio); 637 if (!objcg) 638 return; 639 640 rcu_read_lock(); 641 memcg = obj_cgroup_memcg(objcg); 642 /* 643 * In case the memcg owning these pages has been offlined and doesn't 644 * have an ID allocated to it anymore, charge the closest online 645 * ancestor for the swap instead and transfer the memory+swap charge. 646 */ 647 nr_entries = folio_nr_pages(folio); 648 swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries); 649 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 650 651 __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries, 652 mem_cgroup_private_id(swap_memcg)); 653 654 folio_unqueue_deferred_split(folio); 655 folio->memcg_data = 0; 656 657 if (!obj_cgroup_is_root(objcg)) 658 page_counter_uncharge(&memcg->memory, nr_entries); 659 660 if (memcg != swap_memcg) { 661 if (!mem_cgroup_is_root(swap_memcg)) 662 page_counter_charge(&swap_memcg->memsw, nr_entries); 663 page_counter_uncharge(&memcg->memsw, nr_entries); 664 } 665 666 /* 667 * The caller must hold the swap cluster lock with IRQ off. It is 668 * important here to have the interrupts disabled because it is the 669 * only synchronisation we have for updating the per-CPU variables. 
670 */ 671 preempt_disable_nested(); 672 VM_WARN_ON_IRQS_ENABLED(); 673 memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); 674 preempt_enable_nested(); 675 memcg1_check_events(memcg, folio_nid(folio)); 676 677 rcu_read_unlock(); 678 obj_cgroup_put(objcg); 679 } 680 681 /** 682 * memcg1_swapin - uncharge swap slot on swapin 683 * @folio: folio being swapped in 684 * 685 * Call this function after successfully adding the charged 686 * folio to swapcache. 687 * 688 * Context: The folio has to be in swap cache and locked. 689 */ 690 void memcg1_swapin(struct folio *folio) 691 { 692 struct swap_cluster_info *ci; 693 unsigned long nr_pages; 694 unsigned short id; 695 696 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); 697 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 698 699 /* 700 * Cgroup1's unified memory+swap counter has been charged with the 701 * new swapcache page, finish the transfer by uncharging the swap 702 * slot. The swap slot would also get uncharged when it dies, but 703 * it can stick around indefinitely and we'd count the page twice 704 * the entire time. 705 * 706 * Cgroup2 has separate resource counters for memory and swap, 707 * so this is a non-issue here. Memory and swap charge lifetimes 708 * correspond 1:1 to page and swap slot lifetimes: we charge the 709 * page to memory here, and uncharge swap when the slot is freed. 710 */ 711 if (!do_memsw_account()) 712 return; 713 714 /* 715 * The swap entry might not get freed for a long time, 716 * let's not wait for it. The page already received a 717 * memory+swap charge, drop the swap entry duplicate. 
718 */ 719 nr_pages = folio_nr_pages(folio); 720 ci = swap_cluster_get_and_lock(folio); 721 id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap), 722 nr_pages); 723 swap_cluster_unlock(ci); 724 mem_cgroup_uncharge_swap(id, nr_pages); 725 } 726 #endif 727 728 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 729 unsigned long nr_memory, int nid) 730 { 731 unsigned long flags; 732 733 local_irq_save(flags); 734 count_memcg_events(memcg, PGPGOUT, pgpgout); 735 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); 736 memcg1_check_events(memcg, nid); 737 local_irq_restore(flags); 738 } 739 740 static int compare_thresholds(const void *a, const void *b) 741 { 742 const struct mem_cgroup_threshold *_a = a; 743 const struct mem_cgroup_threshold *_b = b; 744 745 if (_a->threshold > _b->threshold) 746 return 1; 747 748 if (_a->threshold < _b->threshold) 749 return -1; 750 751 return 0; 752 } 753 754 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 755 { 756 struct mem_cgroup_eventfd_list *ev; 757 758 spin_lock(&memcg_oom_lock); 759 760 list_for_each_entry(ev, &memcg->oom_notify, list) 761 eventfd_signal(ev->eventfd); 762 763 spin_unlock(&memcg_oom_lock); 764 return 0; 765 } 766 767 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 768 { 769 struct mem_cgroup *iter; 770 771 for_each_mem_cgroup_tree(iter, memcg) 772 mem_cgroup_oom_notify_cb(iter); 773 } 774 775 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 776 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 777 { 778 struct mem_cgroup_thresholds *thresholds; 779 struct mem_cgroup_threshold_ary *new; 780 unsigned long threshold; 781 unsigned long usage; 782 int i, size, ret; 783 784 ret = page_counter_memparse(args, "-1", &threshold); 785 if (ret) 786 return ret; 787 788 mutex_lock(&memcg->thresholds_lock); 789 790 if (type == _MEM) { 791 thresholds = &memcg->thresholds; 792 usage = mem_cgroup_usage(memcg, false); 
793 } else if (type == _MEMSWAP) { 794 thresholds = &memcg->memsw_thresholds; 795 usage = mem_cgroup_usage(memcg, true); 796 } else 797 BUG(); 798 799 /* Check if a threshold crossed before adding a new one */ 800 if (thresholds->primary) 801 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 802 803 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 804 805 /* Allocate memory for new array of thresholds */ 806 new = kmalloc_flex(*new, entries, size, GFP_KERNEL_ACCOUNT); 807 if (!new) { 808 ret = -ENOMEM; 809 goto unlock; 810 } 811 new->size = size; 812 813 /* Copy thresholds (if any) to new array */ 814 if (thresholds->primary) 815 memcpy(new->entries, thresholds->primary->entries, 816 flex_array_size(new, entries, size - 1)); 817 818 /* Add new threshold */ 819 new->entries[size - 1].eventfd = eventfd; 820 new->entries[size - 1].threshold = threshold; 821 822 /* Sort thresholds. Registering of new threshold isn't time-critical */ 823 sort(new->entries, size, sizeof(*new->entries), 824 compare_thresholds, NULL); 825 826 /* Find current threshold */ 827 new->current_threshold = -1; 828 for (i = 0; i < size; i++) { 829 if (new->entries[i].threshold <= usage) { 830 /* 831 * new->current_threshold will not be used until 832 * rcu_assign_pointer(), so it's safe to increment 833 * it here. 
834 */ 835 ++new->current_threshold; 836 } else 837 break; 838 } 839 840 /* Free old spare buffer and save old primary buffer as spare */ 841 kfree(thresholds->spare); 842 thresholds->spare = thresholds->primary; 843 844 rcu_assign_pointer(thresholds->primary, new); 845 846 /* To be sure that nobody uses thresholds */ 847 synchronize_rcu(); 848 849 unlock: 850 mutex_unlock(&memcg->thresholds_lock); 851 852 return ret; 853 } 854 855 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 856 struct eventfd_ctx *eventfd, const char *args) 857 { 858 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 859 } 860 861 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 862 struct eventfd_ctx *eventfd, const char *args) 863 { 864 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 865 } 866 867 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 868 struct eventfd_ctx *eventfd, enum res_type type) 869 { 870 struct mem_cgroup_thresholds *thresholds; 871 struct mem_cgroup_threshold_ary *new; 872 unsigned long usage; 873 int i, j, size, entries; 874 875 mutex_lock(&memcg->thresholds_lock); 876 877 if (type == _MEM) { 878 thresholds = &memcg->thresholds; 879 usage = mem_cgroup_usage(memcg, false); 880 } else if (type == _MEMSWAP) { 881 thresholds = &memcg->memsw_thresholds; 882 usage = mem_cgroup_usage(memcg, true); 883 } else 884 BUG(); 885 886 if (!thresholds->primary) 887 goto unlock; 888 889 /* Check if a threshold crossed before removing */ 890 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 891 892 /* Calculate new number of threshold */ 893 size = entries = 0; 894 for (i = 0; i < thresholds->primary->size; i++) { 895 if (thresholds->primary->entries[i].eventfd != eventfd) 896 size++; 897 else 898 entries++; 899 } 900 901 new = thresholds->spare; 902 903 /* If no items related to eventfd have been cleared, nothing to do */ 904 if (!entries) 905 goto unlock; 906 907 /* Set 
thresholds array to NULL if we don't have thresholds */ 908 if (!size) { 909 kfree(new); 910 new = NULL; 911 goto swap_buffers; 912 } 913 914 new->size = size; 915 916 /* Copy thresholds and find current threshold */ 917 new->current_threshold = -1; 918 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 919 if (thresholds->primary->entries[i].eventfd == eventfd) 920 continue; 921 922 new->entries[j] = thresholds->primary->entries[i]; 923 if (new->entries[j].threshold <= usage) { 924 /* 925 * new->current_threshold will not be used 926 * until rcu_assign_pointer(), so it's safe to increment 927 * it here. 928 */ 929 ++new->current_threshold; 930 } 931 j++; 932 } 933 934 swap_buffers: 935 /* Swap primary and spare array */ 936 thresholds->spare = thresholds->primary; 937 938 rcu_assign_pointer(thresholds->primary, new); 939 940 /* To be sure that nobody uses thresholds */ 941 synchronize_rcu(); 942 943 /* If all events are unregistered, free the spare array */ 944 if (!new) { 945 kfree(thresholds->spare); 946 thresholds->spare = NULL; 947 } 948 unlock: 949 mutex_unlock(&memcg->thresholds_lock); 950 } 951 952 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 953 struct eventfd_ctx *eventfd) 954 { 955 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 956 } 957 958 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 959 struct eventfd_ctx *eventfd) 960 { 961 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 962 } 963 964 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 965 struct eventfd_ctx *eventfd, const char *args) 966 { 967 struct mem_cgroup_eventfd_list *event; 968 969 event = kmalloc_obj(*event, GFP_KERNEL_ACCOUNT); 970 if (!event) 971 return -ENOMEM; 972 973 spin_lock(&memcg_oom_lock); 974 975 event->eventfd = eventfd; 976 list_add(&event->list, &memcg->oom_notify); 977 978 /* already in OOM ? 
*/ 979 if (memcg->under_oom) 980 eventfd_signal(eventfd); 981 spin_unlock(&memcg_oom_lock); 982 983 return 0; 984 } 985 986 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 987 struct eventfd_ctx *eventfd) 988 { 989 struct mem_cgroup_eventfd_list *ev, *tmp; 990 991 spin_lock(&memcg_oom_lock); 992 993 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 994 if (ev->eventfd == eventfd) { 995 list_del(&ev->list); 996 kfree(ev); 997 } 998 } 999 1000 spin_unlock(&memcg_oom_lock); 1001 } 1002 1003 /* 1004 * DO NOT USE IN NEW FILES. 1005 * 1006 * "cgroup.event_control" implementation. 1007 * 1008 * This is way over-engineered. It tries to support fully configurable 1009 * events for each user. Such level of flexibility is completely 1010 * unnecessary especially in the light of the planned unified hierarchy. 1011 * 1012 * Please deprecate this and replace with something simpler if at all 1013 * possible. 1014 */ 1015 1016 /* 1017 * Unregister event and free resources. 1018 * 1019 * Gets called from workqueue. 1020 */ 1021 static void memcg_event_remove(struct work_struct *work) 1022 { 1023 struct mem_cgroup_event *event = 1024 container_of(work, struct mem_cgroup_event, remove); 1025 struct mem_cgroup *memcg = event->memcg; 1026 1027 remove_wait_queue(event->wqh, &event->wait); 1028 1029 event->unregister_event(memcg, event->eventfd); 1030 1031 /* Notify userspace the event is going away. */ 1032 eventfd_signal(event->eventfd); 1033 1034 eventfd_ctx_put(event->eventfd); 1035 kfree(event); 1036 css_put(&memcg->css); 1037 } 1038 1039 /* 1040 * Gets called on EPOLLHUP on eventfd when user closes it. 1041 * 1042 * Called with wqh->lock held and interrupts disabled. 
1043 */ 1044 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, 1045 int sync, void *key) 1046 { 1047 struct mem_cgroup_event *event = 1048 container_of(wait, struct mem_cgroup_event, wait); 1049 struct mem_cgroup *memcg = event->memcg; 1050 __poll_t flags = key_to_poll(key); 1051 1052 if (flags & EPOLLHUP) { 1053 /* 1054 * If the event has been detached at cgroup removal, we 1055 * can simply return knowing the other side will cleanup 1056 * for us. 1057 * 1058 * We can't race against event freeing since the other 1059 * side will require wqh->lock via remove_wait_queue(), 1060 * which we hold. 1061 */ 1062 spin_lock(&memcg->event_list_lock); 1063 if (!list_empty(&event->list)) { 1064 list_del_init(&event->list); 1065 /* 1066 * We are in atomic context, but cgroup_event_remove() 1067 * may sleep, so we have to call it in workqueue. 1068 */ 1069 schedule_work(&event->remove); 1070 } 1071 spin_unlock(&memcg->event_list_lock); 1072 } 1073 1074 return 0; 1075 } 1076 1077 static void memcg_event_ptable_queue_proc(struct file *file, 1078 wait_queue_head_t *wqh, poll_table *pt) 1079 { 1080 struct mem_cgroup_event *event = 1081 container_of(pt, struct mem_cgroup_event, pt); 1082 1083 event->wqh = wqh; 1084 add_wait_queue(wqh, &event->wait); 1085 } 1086 1087 /* 1088 * DO NOT USE IN NEW FILES. 1089 * 1090 * Parse input and register new cgroup event handler. 1091 * 1092 * Input must be in format '<event_fd> <control_fd> <args>'. 1093 * Interpretation of args is defined by control file implementation. 
1094 */ 1095 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 1096 char *buf, size_t nbytes, loff_t off) 1097 { 1098 struct cgroup_subsys_state *css = of_css(of); 1099 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1100 struct mem_cgroup_event *event; 1101 struct cgroup_subsys_state *cfile_css; 1102 unsigned int efd, cfd; 1103 struct dentry *cdentry; 1104 const char *name; 1105 char *endp; 1106 int ret; 1107 1108 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 1109 return -EOPNOTSUPP; 1110 1111 buf = strstrip(buf); 1112 1113 efd = simple_strtoul(buf, &endp, 10); 1114 if (*endp != ' ') 1115 return -EINVAL; 1116 buf = endp + 1; 1117 1118 cfd = simple_strtoul(buf, &endp, 10); 1119 if (*endp == '\0') 1120 buf = endp; 1121 else if (*endp == ' ') 1122 buf = endp + 1; 1123 else 1124 return -EINVAL; 1125 1126 CLASS(fd, efile)(efd); 1127 if (fd_empty(efile)) 1128 return -EBADF; 1129 1130 CLASS(fd, cfile)(cfd); 1131 1132 event = kzalloc_obj(*event, GFP_KERNEL_ACCOUNT); 1133 if (!event) 1134 return -ENOMEM; 1135 1136 event->memcg = memcg; 1137 INIT_LIST_HEAD(&event->list); 1138 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 1139 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 1140 INIT_WORK(&event->remove, memcg_event_remove); 1141 1142 event->eventfd = eventfd_ctx_fileget(fd_file(efile)); 1143 if (IS_ERR(event->eventfd)) { 1144 ret = PTR_ERR(event->eventfd); 1145 goto out_kfree; 1146 } 1147 1148 if (fd_empty(cfile)) { 1149 ret = -EBADF; 1150 goto out_put_eventfd; 1151 } 1152 1153 /* the process need read permission on control file */ 1154 /* AV: shouldn't we check that it's been opened for read instead? */ 1155 ret = file_permission(fd_file(cfile), MAY_READ); 1156 if (ret < 0) 1157 goto out_put_eventfd; 1158 1159 /* 1160 * The control file must be a regular cgroup1 file. As a regular cgroup 1161 * file can't be renamed, it's safe to access its name afterwards. 
1162 */ 1163 cdentry = fd_file(cfile)->f_path.dentry; 1164 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 1165 ret = -EINVAL; 1166 goto out_put_eventfd; 1167 } 1168 1169 /* 1170 * Determine the event callbacks and set them in @event. This used 1171 * to be done via struct cftype but cgroup core no longer knows 1172 * about these events. The following is crude but the whole thing 1173 * is for compatibility anyway. 1174 * 1175 * DO NOT ADD NEW FILES. 1176 */ 1177 name = cdentry->d_name.name; 1178 1179 if (!strcmp(name, "memory.usage_in_bytes")) { 1180 event->register_event = mem_cgroup_usage_register_event; 1181 event->unregister_event = mem_cgroup_usage_unregister_event; 1182 } else if (!strcmp(name, "memory.oom_control")) { 1183 pr_warn_once("oom_control is deprecated and will be removed. " 1184 "Please report your usecase to linux-mm-@kvack.org" 1185 " if you depend on this functionality.\n"); 1186 event->register_event = mem_cgroup_oom_register_event; 1187 event->unregister_event = mem_cgroup_oom_unregister_event; 1188 } else if (!strcmp(name, "memory.pressure_level")) { 1189 pr_warn_once("pressure_level is deprecated and will be removed. " 1190 "Please report your usecase to linux-mm-@kvack.org " 1191 "if you depend on this functionality.\n"); 1192 event->register_event = vmpressure_register_event; 1193 event->unregister_event = vmpressure_unregister_event; 1194 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 1195 event->register_event = memsw_cgroup_usage_register_event; 1196 event->unregister_event = memsw_cgroup_usage_unregister_event; 1197 } else { 1198 ret = -EINVAL; 1199 goto out_put_eventfd; 1200 } 1201 1202 /* 1203 * Verify @cfile should belong to @css. Also, remaining events are 1204 * automatically removed on cgroup destruction but the removal is 1205 * asynchronous, so take an extra ref on @css. 
1206 */ 1207 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 1208 &memory_cgrp_subsys); 1209 ret = -EINVAL; 1210 if (IS_ERR(cfile_css)) 1211 goto out_put_eventfd; 1212 if (cfile_css != css) 1213 goto out_put_css; 1214 1215 ret = event->register_event(memcg, event->eventfd, buf); 1216 if (ret) 1217 goto out_put_css; 1218 1219 vfs_poll(fd_file(efile), &event->pt); 1220 1221 spin_lock_irq(&memcg->event_list_lock); 1222 list_add(&event->list, &memcg->event_list); 1223 spin_unlock_irq(&memcg->event_list_lock); 1224 return nbytes; 1225 1226 out_put_css: 1227 css_put(cfile_css); 1228 out_put_eventfd: 1229 eventfd_ctx_put(event->eventfd); 1230 out_kfree: 1231 kfree(event); 1232 return ret; 1233 } 1234 1235 void memcg1_memcg_init(struct mem_cgroup *memcg) 1236 { 1237 INIT_LIST_HEAD(&memcg->oom_notify); 1238 mutex_init(&memcg->thresholds_lock); 1239 INIT_LIST_HEAD(&memcg->event_list); 1240 spin_lock_init(&memcg->event_list_lock); 1241 } 1242 1243 void memcg1_css_offline(struct mem_cgroup *memcg) 1244 { 1245 struct mem_cgroup_event *event, *tmp; 1246 1247 /* 1248 * Unregister events and notify userspace. 1249 * Notify userspace about cgroup removing only after rmdir of cgroup 1250 * directory to avoid race between userspace and kernelspace. 1251 */ 1252 spin_lock_irq(&memcg->event_list_lock); 1253 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 1254 list_del_init(&event->list); 1255 schedule_work(&event->remove); 1256 } 1257 spin_unlock_irq(&memcg->event_list_lock); 1258 } 1259 1260 /* 1261 * Check OOM-Killer is already running under our hierarchy. 1262 * If someone is running, return false. 1263 */ 1264 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1265 { 1266 struct mem_cgroup *iter, *failed = NULL; 1267 1268 spin_lock(&memcg_oom_lock); 1269 1270 for_each_mem_cgroup_tree(iter, memcg) { 1271 if (iter->oom_lock) { 1272 /* 1273 * this subtree of our hierarchy is already locked 1274 * so we cannot give a lock. 
1275 */ 1276 failed = iter; 1277 mem_cgroup_iter_break(memcg, iter); 1278 break; 1279 } 1280 iter->oom_lock = true; 1281 } 1282 1283 if (failed) { 1284 /* 1285 * OK, we failed to lock the whole subtree so we have 1286 * to clean up what we set up to the failing subtree 1287 */ 1288 for_each_mem_cgroup_tree(iter, memcg) { 1289 if (iter == failed) { 1290 mem_cgroup_iter_break(memcg, iter); 1291 break; 1292 } 1293 iter->oom_lock = false; 1294 } 1295 } else 1296 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1297 1298 spin_unlock(&memcg_oom_lock); 1299 1300 return !failed; 1301 } 1302 1303 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1304 { 1305 struct mem_cgroup *iter; 1306 1307 spin_lock(&memcg_oom_lock); 1308 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1309 for_each_mem_cgroup_tree(iter, memcg) 1310 iter->oom_lock = false; 1311 spin_unlock(&memcg_oom_lock); 1312 } 1313 1314 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1315 { 1316 struct mem_cgroup *iter; 1317 1318 spin_lock(&memcg_oom_lock); 1319 for_each_mem_cgroup_tree(iter, memcg) 1320 iter->under_oom++; 1321 spin_unlock(&memcg_oom_lock); 1322 } 1323 1324 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1325 { 1326 struct mem_cgroup *iter; 1327 1328 /* 1329 * Be careful about under_oom underflows because a child memcg 1330 * could have been added after mem_cgroup_mark_under_oom. 
1331 */ 1332 spin_lock(&memcg_oom_lock); 1333 for_each_mem_cgroup_tree(iter, memcg) 1334 if (iter->under_oom > 0) 1335 iter->under_oom--; 1336 spin_unlock(&memcg_oom_lock); 1337 } 1338 1339 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1340 1341 struct oom_wait_info { 1342 struct mem_cgroup *memcg; 1343 wait_queue_entry_t wait; 1344 }; 1345 1346 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1347 unsigned int mode, int sync, void *arg) 1348 { 1349 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1350 struct mem_cgroup *oom_wait_memcg; 1351 struct oom_wait_info *oom_wait_info; 1352 1353 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1354 oom_wait_memcg = oom_wait_info->memcg; 1355 1356 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1357 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1358 return 0; 1359 return autoremove_wake_function(wait, mode, sync, arg); 1360 } 1361 1362 void memcg1_oom_recover(struct mem_cgroup *memcg) 1363 { 1364 /* 1365 * For the following lockless ->under_oom test, the only required 1366 * guarantee is that it must see the state asserted by an OOM when 1367 * this function is called as a result of userland actions 1368 * triggered by the notification of the OOM. This is trivially 1369 * achieved by invoking mem_cgroup_mark_under_oom() before 1370 * triggering notification. 1371 */ 1372 if (memcg && memcg->under_oom) 1373 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1374 } 1375 1376 /** 1377 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1378 * @handle: actually kill/wait or just clean up the OOM state 1379 * 1380 * This has to be called at the end of a page fault if the memcg OOM 1381 * handler was enabled. 1382 * 1383 * Memcg supports userspace OOM handling where failed allocations must 1384 * sleep on a waitqueue until the userspace task resolves the 1385 * situation. 
Sleeping directly in the charge context with all kinds 1386 * of locks held is not a good idea, instead we remember an OOM state 1387 * in the task and mem_cgroup_oom_synchronize() has to be called at 1388 * the end of the page fault to complete the OOM handling. 1389 * 1390 * Returns %true if an ongoing memcg OOM situation was detected and 1391 * completed, %false otherwise. 1392 */ 1393 bool mem_cgroup_oom_synchronize(bool handle) 1394 { 1395 struct mem_cgroup *memcg = current->memcg_in_oom; 1396 struct oom_wait_info owait; 1397 bool locked; 1398 1399 /* OOM is global, do not handle */ 1400 if (!memcg) 1401 return false; 1402 1403 if (!handle) 1404 goto cleanup; 1405 1406 owait.memcg = memcg; 1407 owait.wait.flags = 0; 1408 owait.wait.func = memcg_oom_wake_function; 1409 owait.wait.private = current; 1410 INIT_LIST_HEAD(&owait.wait.entry); 1411 1412 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1413 mem_cgroup_mark_under_oom(memcg); 1414 1415 locked = mem_cgroup_oom_trylock(memcg); 1416 1417 if (locked) 1418 mem_cgroup_oom_notify(memcg); 1419 1420 schedule(); 1421 mem_cgroup_unmark_under_oom(memcg); 1422 finish_wait(&memcg_oom_waitq, &owait.wait); 1423 1424 if (locked) 1425 mem_cgroup_oom_unlock(memcg); 1426 cleanup: 1427 current->memcg_in_oom = NULL; 1428 css_put(&memcg->css); 1429 return true; 1430 } 1431 1432 1433 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) 1434 { 1435 /* 1436 * We are in the middle of the charge context here, so we 1437 * don't want to block when potentially sitting on a callstack 1438 * that holds all kinds of filesystem and mm locks. 1439 * 1440 * cgroup1 allows disabling the OOM killer and waiting for outside 1441 * handling until the charge can succeed; remember the context and put 1442 * the task to sleep at the end of the page fault when all locks are 1443 * released. 
1444 * 1445 * On the other hand, in-kernel OOM killer allows for an async victim 1446 * memory reclaim (oom_reaper) and that means that we are not solely 1447 * relying on the oom victim to make a forward progress and we can 1448 * invoke the oom killer here. 1449 * 1450 * Please note that mem_cgroup_out_of_memory might fail to find a 1451 * victim and then we have to bail out from the charge path. 1452 */ 1453 if (READ_ONCE(memcg->oom_kill_disable)) { 1454 if (current->in_user_fault) { 1455 css_get(&memcg->css); 1456 current->memcg_in_oom = memcg; 1457 } 1458 return false; 1459 } 1460 1461 mem_cgroup_mark_under_oom(memcg); 1462 1463 *locked = mem_cgroup_oom_trylock(memcg); 1464 1465 if (*locked) 1466 mem_cgroup_oom_notify(memcg); 1467 1468 mem_cgroup_unmark_under_oom(memcg); 1469 1470 return true; 1471 } 1472 1473 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) 1474 { 1475 if (locked) 1476 mem_cgroup_oom_unlock(memcg); 1477 } 1478 1479 static DEFINE_MUTEX(memcg_max_mutex); 1480 1481 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 1482 unsigned long max, bool memsw) 1483 { 1484 bool enlarge = false; 1485 bool drained = false; 1486 int ret; 1487 bool limits_invariant; 1488 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 1489 1490 do { 1491 if (signal_pending(current)) { 1492 ret = -EINTR; 1493 break; 1494 } 1495 1496 mutex_lock(&memcg_max_mutex); 1497 /* 1498 * Make sure that the new limit (memsw or memory limit) doesn't 1499 * break our basic invariant rule memory.max <= memsw.max. 1500 */ 1501 limits_invariant = memsw ? 
max >= READ_ONCE(memcg->memory.max) : 1502 max <= memcg->memsw.max; 1503 if (!limits_invariant) { 1504 mutex_unlock(&memcg_max_mutex); 1505 ret = -EINVAL; 1506 break; 1507 } 1508 if (max > counter->max) 1509 enlarge = true; 1510 ret = page_counter_set_max(counter, max); 1511 mutex_unlock(&memcg_max_mutex); 1512 1513 if (!ret) 1514 break; 1515 1516 if (!drained) { 1517 drain_all_stock(memcg); 1518 drained = true; 1519 continue; 1520 } 1521 1522 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1523 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { 1524 ret = -EBUSY; 1525 break; 1526 } 1527 } while (true); 1528 1529 if (!ret && enlarge) 1530 memcg1_oom_recover(memcg); 1531 1532 return ret; 1533 } 1534 1535 /* 1536 * Reclaims as many pages from the given memcg as possible. 1537 * 1538 * Caller is responsible for holding css reference for memcg. 1539 */ 1540 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 1541 { 1542 int nr_retries = MAX_RECLAIM_RETRIES; 1543 1544 /* we call try-to-free pages for make this cgroup empty */ 1545 lru_add_drain_all(); 1546 1547 drain_all_stock(memcg); 1548 1549 /* try to free all pages in this cgroup */ 1550 while (nr_retries && page_counter_read(&memcg->memory)) { 1551 if (signal_pending(current)) 1552 return -EINTR; 1553 1554 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1555 MEMCG_RECLAIM_MAY_SWAP, NULL)) 1556 nr_retries--; 1557 } 1558 1559 return 0; 1560 } 1561 1562 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 1563 char *buf, size_t nbytes, 1564 loff_t off) 1565 { 1566 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1567 1568 if (mem_cgroup_is_root(memcg)) 1569 return -EINVAL; 1570 return mem_cgroup_force_empty(memcg) ?: nbytes; 1571 } 1572 1573 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 1574 struct cftype *cft) 1575 { 1576 return 1; 1577 } 1578 1579 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 1580 struct cftype *cft, 
u64 val) 1581 { 1582 if (val == 1) 1583 return 0; 1584 1585 pr_warn_once("Non-hierarchical mode is deprecated. " 1586 "Please report your usecase to linux-mm@kvack.org if you " 1587 "depend on this functionality.\n"); 1588 1589 return -EINVAL; 1590 } 1591 1592 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 1593 struct cftype *cft) 1594 { 1595 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1596 struct page_counter *counter; 1597 1598 switch (MEMFILE_TYPE(cft->private)) { 1599 case _MEM: 1600 counter = &memcg->memory; 1601 break; 1602 case _MEMSWAP: 1603 counter = &memcg->memsw; 1604 break; 1605 case _KMEM: 1606 counter = &memcg->kmem; 1607 break; 1608 case _TCP: 1609 counter = &memcg->tcpmem; 1610 break; 1611 default: 1612 BUG(); 1613 } 1614 1615 switch (MEMFILE_ATTR(cft->private)) { 1616 case RES_USAGE: 1617 if (counter == &memcg->memory) 1618 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 1619 if (counter == &memcg->memsw) 1620 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 1621 return (u64)page_counter_read(counter) * PAGE_SIZE; 1622 case RES_LIMIT: 1623 return (u64)counter->max * PAGE_SIZE; 1624 case RES_MAX_USAGE: 1625 return (u64)counter->watermark * PAGE_SIZE; 1626 case RES_FAILCNT: 1627 return counter->failcnt; 1628 case RES_SOFT_LIMIT: 1629 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; 1630 default: 1631 BUG(); 1632 } 1633 } 1634 1635 /* 1636 * This function doesn't do anything useful. Its only job is to provide a read 1637 * handler for a file so that cgroup_file_mode() will add read permissions. 
1638 */ 1639 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 1640 __always_unused void *v) 1641 { 1642 return -EINVAL; 1643 } 1644 1645 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 1646 { 1647 int ret; 1648 1649 mutex_lock(&memcg_max_mutex); 1650 1651 ret = page_counter_set_max(&memcg->tcpmem, max); 1652 if (ret) 1653 goto out; 1654 1655 if (!memcg->tcpmem_active) { 1656 /* 1657 * The active flag needs to be written after the static_key 1658 * update. This is what guarantees that the socket activation 1659 * function is the last one to run. See mem_cgroup_sk_alloc() 1660 * for details, and note that we don't mark any socket as 1661 * belonging to this memcg until that flag is up. 1662 * 1663 * We need to do this, because static_keys will span multiple 1664 * sites, but we can't control their order. If we mark a socket 1665 * as accounted, but the accounting functions are not patched in 1666 * yet, we'll lose accounting. 1667 * 1668 * We never race with the readers in mem_cgroup_sk_alloc(), 1669 * because when this value change, the code to process it is not 1670 * patched in yet. 1671 */ 1672 static_branch_inc(&memcg_sockets_enabled_key); 1673 memcg->tcpmem_active = true; 1674 } 1675 out: 1676 mutex_unlock(&memcg_max_mutex); 1677 return ret; 1678 } 1679 1680 /* 1681 * The user of this function is... 1682 * RES_LIMIT. 
1683 */ 1684 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 1685 char *buf, size_t nbytes, loff_t off) 1686 { 1687 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1688 unsigned long nr_pages; 1689 int ret; 1690 1691 buf = strstrip(buf); 1692 ret = page_counter_memparse(buf, "-1", &nr_pages); 1693 if (ret) 1694 return ret; 1695 1696 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1697 case RES_LIMIT: 1698 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 1699 ret = -EINVAL; 1700 break; 1701 } 1702 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1703 case _MEM: 1704 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 1705 break; 1706 case _MEMSWAP: 1707 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 1708 break; 1709 case _KMEM: 1710 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 1711 "Writing any value to this file has no effect. " 1712 "Please report your usecase to linux-mm@kvack.org if you " 1713 "depend on this functionality.\n"); 1714 ret = 0; 1715 break; 1716 case _TCP: 1717 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " 1718 "Please report your usecase to linux-mm@kvack.org if you " 1719 "depend on this functionality.\n"); 1720 ret = memcg_update_tcp_max(memcg, nr_pages); 1721 break; 1722 } 1723 break; 1724 case RES_SOFT_LIMIT: 1725 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1726 ret = -EOPNOTSUPP; 1727 } else { 1728 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. 
" 1729 "Please report your usecase to linux-mm@kvack.org if you " 1730 "depend on this functionality.\n"); 1731 WRITE_ONCE(memcg->soft_limit, nr_pages); 1732 ret = 0; 1733 } 1734 break; 1735 } 1736 return ret ?: nbytes; 1737 } 1738 1739 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 1740 size_t nbytes, loff_t off) 1741 { 1742 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1743 struct page_counter *counter; 1744 1745 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1746 case _MEM: 1747 counter = &memcg->memory; 1748 break; 1749 case _MEMSWAP: 1750 counter = &memcg->memsw; 1751 break; 1752 case _KMEM: 1753 counter = &memcg->kmem; 1754 break; 1755 case _TCP: 1756 counter = &memcg->tcpmem; 1757 break; 1758 default: 1759 BUG(); 1760 } 1761 1762 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1763 case RES_MAX_USAGE: 1764 page_counter_reset_watermark(counter); 1765 break; 1766 case RES_FAILCNT: 1767 counter->failcnt = 0; 1768 break; 1769 default: 1770 BUG(); 1771 } 1772 1773 return nbytes; 1774 } 1775 1776 #ifdef CONFIG_NUMA 1777 1778 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 1779 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 1780 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 1781 1782 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 1783 int nid, unsigned int lru_mask, bool tree) 1784 { 1785 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 1786 unsigned long nr = 0; 1787 enum lru_list lru; 1788 1789 VM_BUG_ON((unsigned int)nid >= nr_node_ids); 1790 1791 for_each_lru(lru) { 1792 if (!(BIT(lru) & lru_mask)) 1793 continue; 1794 if (tree) 1795 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 1796 else 1797 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 1798 } 1799 return nr; 1800 } 1801 1802 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 1803 unsigned int lru_mask, 1804 bool tree) 1805 { 1806 unsigned long nr = 0; 
1807 enum lru_list lru; 1808 1809 for_each_lru(lru) { 1810 if (!(BIT(lru) & lru_mask)) 1811 continue; 1812 if (tree) 1813 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 1814 else 1815 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 1816 } 1817 return nr; 1818 } 1819 1820 static int memcg_numa_stat_show(struct seq_file *m, void *v) 1821 { 1822 struct numa_stat { 1823 const char *name; 1824 unsigned int lru_mask; 1825 }; 1826 1827 static const struct numa_stat stats[] = { 1828 { "total", LRU_ALL }, 1829 { "file", LRU_ALL_FILE }, 1830 { "anon", LRU_ALL_ANON }, 1831 { "unevictable", BIT(LRU_UNEVICTABLE) }, 1832 }; 1833 const struct numa_stat *stat; 1834 int nid; 1835 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1836 1837 mem_cgroup_flush_stats(memcg); 1838 1839 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1840 seq_printf(m, "%s=%lu", stat->name, 1841 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1842 false)); 1843 for_each_node_state(nid, N_MEMORY) 1844 seq_printf(m, " N%d=%lu", nid, 1845 mem_cgroup_node_nr_lru_pages(memcg, nid, 1846 stat->lru_mask, false)); 1847 seq_putc(m, '\n'); 1848 } 1849 1850 for (stat = stats; stat < ARRAY_END(stats); stat++) { 1851 1852 seq_printf(m, "hierarchical_%s=%lu", stat->name, 1853 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1854 true)); 1855 for_each_node_state(nid, N_MEMORY) 1856 seq_printf(m, " N%d=%lu", nid, 1857 mem_cgroup_node_nr_lru_pages(memcg, nid, 1858 stat->lru_mask, true)); 1859 seq_putc(m, '\n'); 1860 } 1861 1862 return 0; 1863 } 1864 #endif /* CONFIG_NUMA */ 1865 1866 static const unsigned int memcg1_stats[] = { 1867 NR_FILE_PAGES, 1868 NR_ANON_MAPPED, 1869 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1870 NR_ANON_THPS, 1871 #endif 1872 NR_SHMEM, 1873 NR_FILE_MAPPED, 1874 NR_FILE_DIRTY, 1875 NR_WRITEBACK, 1876 WORKINGSET_REFAULT_ANON, 1877 WORKINGSET_REFAULT_FILE, 1878 #ifdef CONFIG_SWAP 1879 MEMCG_SWAP, 1880 NR_SWAPCACHE, 1881 #endif 1882 }; 1883 1884 static const char *const memcg1_stat_names[] = { 
1885 "cache", 1886 "rss", 1887 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1888 "rss_huge", 1889 #endif 1890 "shmem", 1891 "mapped_file", 1892 "dirty", 1893 "writeback", 1894 "workingset_refault_anon", 1895 "workingset_refault_file", 1896 #ifdef CONFIG_SWAP 1897 "swap", 1898 "swapcached", 1899 #endif 1900 }; 1901 1902 /* Universal VM events cgroup1 shows, original sort order */ 1903 static const unsigned int memcg1_events[] = { 1904 PGPGIN, 1905 PGPGOUT, 1906 PGFAULT, 1907 PGMAJFAULT, 1908 }; 1909 1910 void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) 1911 { 1912 int i; 1913 1914 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) 1915 reparent_memcg_state_local(memcg, parent, memcg1_stats[i]); 1916 } 1917 1918 void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent) 1919 { 1920 int i; 1921 1922 for (i = 0; i < NR_LRU_LISTS; i++) 1923 reparent_memcg_lruvec_state_local(memcg, parent, i); 1924 } 1925 1926 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1927 { 1928 unsigned long memory, memsw; 1929 struct mem_cgroup *mi; 1930 unsigned int i; 1931 1932 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 1933 1934 mem_cgroup_flush_stats(memcg); 1935 1936 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1937 unsigned long nr; 1938 1939 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 1940 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 1941 } 1942 1943 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1944 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 1945 memcg_events_local(memcg, memcg1_events[i])); 1946 1947 for (i = 0; i < NR_LRU_LISTS; i++) 1948 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 1949 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 1950 PAGE_SIZE); 1951 1952 /* Hierarchical information */ 1953 memory = memsw = PAGE_COUNTER_MAX; 1954 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 1955 memory = min(memory, 
READ_ONCE(mi->memory.max)); 1956 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 1957 } 1958 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 1959 (u64)memory * PAGE_SIZE); 1960 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 1961 (u64)memsw * PAGE_SIZE); 1962 1963 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1964 unsigned long nr; 1965 1966 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 1967 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 1968 (u64)nr); 1969 } 1970 1971 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1972 seq_buf_printf(s, "total_%s %llu\n", 1973 vm_event_name(memcg1_events[i]), 1974 (u64)memcg_events(memcg, memcg1_events[i])); 1975 1976 for (i = 0; i < NR_LRU_LISTS; i++) 1977 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 1978 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 1979 PAGE_SIZE); 1980 1981 #ifdef CONFIG_DEBUG_VM 1982 { 1983 pg_data_t *pgdat; 1984 struct mem_cgroup_per_node *mz; 1985 unsigned long anon_cost = 0; 1986 unsigned long file_cost = 0; 1987 1988 for_each_online_pgdat(pgdat) { 1989 mz = memcg->nodeinfo[pgdat->node_id]; 1990 1991 anon_cost += mz->lruvec.anon_cost; 1992 file_cost += mz->lruvec.file_cost; 1993 } 1994 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 1995 seq_buf_printf(s, "file_cost %lu\n", file_cost); 1996 } 1997 #endif 1998 } 1999 2000 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 2001 struct cftype *cft) 2002 { 2003 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2004 2005 return mem_cgroup_swappiness(memcg); 2006 } 2007 2008 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 2009 struct cftype *cft, u64 val) 2010 { 2011 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2012 2013 if (val > MAX_SWAPPINESS) 2014 return -EINVAL; 2015 2016 if (!mem_cgroup_is_root(memcg)) { 2017 pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" 2018 "See memory.reclaim or memory.swap.max there\n "); 2019 WRITE_ONCE(memcg->swappiness, val); 2020 } else 2021 WRITE_ONCE(vm_swappiness, val); 2022 2023 return 0; 2024 } 2025 2026 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 2027 { 2028 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 2029 2030 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 2031 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 2032 seq_printf(sf, "oom_kill %lu\n", 2033 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 2034 return 0; 2035 } 2036 2037 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 2038 struct cftype *cft, u64 val) 2039 { 2040 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 2041 2042 pr_warn_once("oom_control is deprecated and will be removed. " 2043 "Please report your usecase to linux-mm-@kvack.org if you " 2044 "depend on this functionality.\n"); 2045 2046 /* cannot set to root cgroup and only 0 and 1 are allowed */ 2047 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 2048 return -EINVAL; 2049 2050 WRITE_ONCE(memcg->oom_kill_disable, val); 2051 if (!val) 2052 memcg1_oom_recover(memcg); 2053 2054 return 0; 2055 } 2056 2057 #ifdef CONFIG_SLUB_DEBUG 2058 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 2059 { 2060 /* 2061 * Deprecated. 2062 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 
2063 */ 2064 return 0; 2065 } 2066 #endif 2067 2068 struct cftype mem_cgroup_legacy_files[] = { 2069 { 2070 .name = "usage_in_bytes", 2071 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2072 .read_u64 = mem_cgroup_read_u64, 2073 }, 2074 { 2075 .name = "max_usage_in_bytes", 2076 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2077 .write = mem_cgroup_reset, 2078 .read_u64 = mem_cgroup_read_u64, 2079 }, 2080 { 2081 .name = "limit_in_bytes", 2082 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2083 .write = mem_cgroup_write, 2084 .read_u64 = mem_cgroup_read_u64, 2085 }, 2086 { 2087 .name = "soft_limit_in_bytes", 2088 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2089 .write = mem_cgroup_write, 2090 .read_u64 = mem_cgroup_read_u64, 2091 }, 2092 { 2093 .name = "failcnt", 2094 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2095 .write = mem_cgroup_reset, 2096 .read_u64 = mem_cgroup_read_u64, 2097 }, 2098 { 2099 .name = "stat", 2100 .seq_show = memory_stat_show, 2101 }, 2102 { 2103 .name = "force_empty", 2104 .write = mem_cgroup_force_empty_write, 2105 }, 2106 { 2107 .name = "use_hierarchy", 2108 .write_u64 = mem_cgroup_hierarchy_write, 2109 .read_u64 = mem_cgroup_hierarchy_read, 2110 }, 2111 { 2112 .name = "cgroup.event_control", /* XXX: for compat */ 2113 .write = memcg_write_event_control, 2114 .flags = CFTYPE_NO_PREFIX, 2115 }, 2116 { 2117 .name = "swappiness", 2118 .read_u64 = mem_cgroup_swappiness_read, 2119 .write_u64 = mem_cgroup_swappiness_write, 2120 }, 2121 { 2122 .name = "move_charge_at_immigrate", 2123 .read_u64 = mem_cgroup_move_charge_read, 2124 .write_u64 = mem_cgroup_move_charge_write, 2125 }, 2126 { 2127 .name = "oom_control", 2128 .seq_show = mem_cgroup_oom_control_read, 2129 .write_u64 = mem_cgroup_oom_control_write, 2130 }, 2131 { 2132 .name = "pressure_level", 2133 .seq_show = mem_cgroup_dummy_seq_show, 2134 }, 2135 #ifdef CONFIG_NUMA 2136 { 2137 .name = "numa_stat", 2138 .seq_show = memcg_numa_stat_show, 2139 }, 2140 #endif 2141 { 2142 .name = 
"kmem.limit_in_bytes", 2143 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 2144 .write = mem_cgroup_write, 2145 .read_u64 = mem_cgroup_read_u64, 2146 }, 2147 { 2148 .name = "kmem.usage_in_bytes", 2149 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 2150 .read_u64 = mem_cgroup_read_u64, 2151 }, 2152 { 2153 .name = "kmem.failcnt", 2154 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 2155 .write = mem_cgroup_reset, 2156 .read_u64 = mem_cgroup_read_u64, 2157 }, 2158 { 2159 .name = "kmem.max_usage_in_bytes", 2160 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 2161 .write = mem_cgroup_reset, 2162 .read_u64 = mem_cgroup_read_u64, 2163 }, 2164 #ifdef CONFIG_SLUB_DEBUG 2165 { 2166 .name = "kmem.slabinfo", 2167 .seq_show = mem_cgroup_slab_show, 2168 }, 2169 #endif 2170 { 2171 .name = "kmem.tcp.limit_in_bytes", 2172 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 2173 .write = mem_cgroup_write, 2174 .read_u64 = mem_cgroup_read_u64, 2175 }, 2176 { 2177 .name = "kmem.tcp.usage_in_bytes", 2178 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 2179 .read_u64 = mem_cgroup_read_u64, 2180 }, 2181 { 2182 .name = "kmem.tcp.failcnt", 2183 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 2184 .write = mem_cgroup_reset, 2185 .read_u64 = mem_cgroup_read_u64, 2186 }, 2187 { 2188 .name = "kmem.tcp.max_usage_in_bytes", 2189 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 2190 .write = mem_cgroup_reset, 2191 .read_u64 = mem_cgroup_read_u64, 2192 }, 2193 { }, /* terminate */ 2194 }; 2195 2196 struct cftype memsw_files[] = { 2197 { 2198 .name = "memsw.usage_in_bytes", 2199 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2200 .read_u64 = mem_cgroup_read_u64, 2201 }, 2202 { 2203 .name = "memsw.max_usage_in_bytes", 2204 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2205 .write = mem_cgroup_reset, 2206 .read_u64 = mem_cgroup_read_u64, 2207 }, 2208 { 2209 .name = "memsw.limit_in_bytes", 2210 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2211 .write = mem_cgroup_write, 2212 .read_u64 = 
mem_cgroup_read_u64, 2213 }, 2214 { 2215 .name = "memsw.failcnt", 2216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2217 .write = mem_cgroup_reset, 2218 .read_u64 = mem_cgroup_read_u64, 2219 }, 2220 { }, /* terminate */ 2221 }; 2222 2223 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2224 { 2225 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 2226 if (nr_pages > 0) 2227 page_counter_charge(&memcg->kmem, nr_pages); 2228 else 2229 page_counter_uncharge(&memcg->kmem, -nr_pages); 2230 } 2231 } 2232 2233 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 2234 gfp_t gfp_mask) 2235 { 2236 struct page_counter *fail; 2237 2238 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 2239 memcg->tcpmem_pressure = 0; 2240 return true; 2241 } 2242 memcg->tcpmem_pressure = 1; 2243 if (gfp_mask & __GFP_NOFAIL) { 2244 page_counter_charge(&memcg->tcpmem, nr_pages); 2245 return true; 2246 } 2247 return false; 2248 } 2249 2250 bool memcg1_alloc_events(struct mem_cgroup *memcg) 2251 { 2252 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, 2253 GFP_KERNEL_ACCOUNT); 2254 return !!memcg->events_percpu; 2255 } 2256 2257 void memcg1_free_events(struct mem_cgroup *memcg) 2258 { 2259 free_percpu(memcg->events_percpu); 2260 } 2261 2262 static int __init memcg1_init(void) 2263 { 2264 int node; 2265 2266 for_each_node(node) { 2267 struct mem_cgroup_tree_per_node *rtpn; 2268 2269 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 2270 2271 rtpn->rb_root = RB_ROOT; 2272 rtpn->rb_rightmost = NULL; 2273 spin_lock_init(&rtpn->lock); 2274 soft_limit_tree.rb_tree_per_node[node] = rtpn; 2275 } 2276 2277 return 0; 2278 } 2279 subsys_initcall(memcg1_init); 2280