// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/swap_cgroup.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/seq_buf.h>

#include "internal.h"
#include "swap.h"
#include "memcontrol-v1.h"

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * The register_event() callback is used to add a new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on the eventfd to send a notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * The unregister_event() callback is called when userspace closes
	 * the eventfd or when the cgroup is removed. This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

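/*
 * MEMFILE_PRIVATE() packs a resource type (_MEM, _MEMSWAP, _KMEM, _TCP) and a
 * RES_* attribute into cftype->private; MEMFILE_TYPE() and MEMFILE_ATTR()
 * unpack them again.
 */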
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

DEFINE_SPINLOCK(memcg_oom_lock);

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on the RB-tree or
		 * the memcg is over its soft limit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

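/*
 * Remove this memcg's per-node entries from every node's soft limit tree.
 */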
void memcg1_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too large, so we do not
				 * reclaim too much, and not too small, so we
				 * do not keep coming back to reclaim from
				 * this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

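/*
 * Reclaim from the memcgs on this node's soft limit tree, starting with the
 * one that exceeds its soft limit the most. Returns the number of pages
 * reclaimed and adds the number of pages scanned to *total_scanned.
 */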
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					gfp_t gfp_mask,
					unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run for a while, especially if memcgs continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup,
		 * it is time to move on to the next cgroup.
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0 simply because, due
		 * to priority, we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
				       struct cftype *cft)
{
	return 0;
}

#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val != 0)
		return -EINVAL;
	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * usage. If that is not the case, a threshold was crossed after the
	 * last call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over the array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of the thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over the array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

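/*
 * Re-check the usage thresholds of this memcg and all of its ancestors, for
 * both the memory and, if enabled, the memory+swap counters.
 */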
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

/* Cgroup1: threshold notifications & softlimit tree updates */
struct memcg1_events_percpu {
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold event is triggered at a finer grain than soft limit */
	if (unlikely(memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			memcg1_update_tree(memcg, nid);
	}
}

void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long flags;

	local_irq_save(flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(flags);
}

void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
{
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();
	memcg1_check_events(memcg, folio_nid(folio));
}

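/*
 * Account a batch of uncharged pages: count the PGPGOUT events, add the
 * uncharged memory to the per-CPU page event counter and re-check the
 * threshold/soft limit events for @nid.
 */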
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
			   unsigned long nr_memory, int nid)
{
	unsigned long flags;

	local_irq_save(flags);
	__count_memcg_events(memcg, PGPGOUT, pgpgout);
	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
	memcg1_check_events(memcg, nid);
	local_irq_restore(flags);
}

static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}

static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold was crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

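/*
 * ->register_event callbacks used by memcg_write_event_control() for the
 * memory.usage_in_bytes and memory.memsw.usage_in_bytes control files.
 */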
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}

static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold was crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of thresholds */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}

static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}

static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup_eventfd_list *event;

	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (memcg->under_oom)
		eventfd_signal(eventfd);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *ev, *tmp;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}

/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered. It tries to support fully configurable
 * events for each user. Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}

/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will clean up
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}

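/*
 * poll_table callback: remember the wait queue head and add our wait entry so
 * that memcg_event_wake() runs when the eventfd signals EPOLLHUP.
 */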
static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct fd efile;
	struct fd cfile;
	struct dentry *cdentry;
	const char *name;
	char *endp;
	int ret;

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return -EOPNOTSUPP;

	buf = strstrip(buf);

	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	if (*endp == '\0')
		buf = endp;
	else if (*endp == ' ')
		buf = endp + 1;
	else
		return -EINVAL;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	efile = fdget(efd);
	if (!fd_file(efile)) {
		ret = -EBADF;
		goto out_kfree;
	}

	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
	}

	cfile = fdget(cfd);
	if (!fd_file(cfile)) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process needs read permission on the control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = file_permission(fd_file(cfile), MAY_READ);
	if (ret < 0)
		goto out_put_cfile;

	/*
	 * The control file must be a regular cgroup1 file. As a regular cgroup
	 * file can't be renamed, it's safe to access its name afterwards.
	 */
	cdentry = fd_file(cfile)->f_path.dentry;
	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Determine the event callbacks and set them in @event. This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events. The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cdentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		pr_warn_once("oom_control is deprecated and will be removed. "
			     "Please report your usecase to linux-mm@kvack.org"
			     " if you depend on this functionality.\n");
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		pr_warn_once("pressure_level is deprecated and will be removed. "
			     "Please report your usecase to linux-mm@kvack.org "
			     "if you depend on this functionality.\n");
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Verify that @cfile belongs to @css. Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_cfile;
	if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
	}

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	vfs_poll(fd_file(efile), &event->pt);

	spin_lock_irq(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock_irq(&memcg->event_list_lock);

	fdput(cfile);
	fdput(efile);

	return nbytes;

out_put_css:
	css_put(css);
out_put_cfile:
	fdput(cfile);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_put_efile:
	fdput(efile);
out_kfree:
	kfree(event);

	return ret;
}

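/*
 * Initialize the cgroup1-specific parts of a memcg: the OOM notification
 * list, the threshold lock and the userspace event list.
 */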
void memcg1_memcg_init(struct mem_cgroup *memcg)
{
	INIT_LIST_HEAD(&memcg->oom_notify);
	mutex_init(&memcg->thresholds_lock);
	INIT_LIST_HEAD(&memcg->event_list);
	spin_lock_init(&memcg->event_list_lock);
}

void memcg1_css_offline(struct mem_cgroup *memcg)
{
	struct mem_cgroup_event *event, *tmp;

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory to avoid a race between userspace and kernelspace.
	 */
	spin_lock_irq(&memcg->event_list_lock);
	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock_irq(&memcg->event_list_lock);
}

/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running it, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * This subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we already set up, up to the failing
		 * subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

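/*
 * Release the per-memcg OOM locks taken by mem_cgroup_oom_trylock().
 */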
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * Be careful about under_oom underflows because a child memcg
	 * could have been added after mem_cgroup_mark_under_oom.
	 */
	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
			iter->under_oom--;
	spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_entry_t wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

void memcg1_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * For the following lockless ->under_oom test, the only required
	 * guarantee is that it must see the state asserted by an OOM when
	 * this function is called as a result of userland actions
	 * triggered by the notification of the OOM. This is trivially
	 * achieved by invoking mem_cgroup_mark_under_oom() before
	 * triggering notification.
	 */
	if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_in_oom;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.entry);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	schedule();
	mem_cgroup_unmark_under_oom(memcg);
	finish_wait(&memcg_oom_waitq, &owait.wait);

	if (locked)
		mem_cgroup_oom_unlock(memcg);
cleanup:
	current->memcg_in_oom = NULL;
	css_put(&memcg->css);
	return true;
}

bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * cgroup1 allows disabling the OOM killer and waiting for outside
	 * handling until the charge can succeed; remember the context and put
	 * the task to sleep at the end of the page fault when all locks are
	 * released.
	 *
	 * On the other hand, the in-kernel OOM killer allows for an async
	 * victim memory reclaim (oom_reaper) and that means that we are not
	 * solely relying on the oom victim to make forward progress and we
	 * can invoke the oom killer here.
	 *
	 * Please note that mem_cgroup_out_of_memory might fail to find a
	 * victim and then we have to bail out from the charge path.
	 */
	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
			css_get(&memcg->css);
			current->memcg_in_oom = memcg;
		}
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	*locked = mem_cgroup_oom_trylock(memcg);

	if (*locked)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	return true;
}

void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
	if (locked)
		mem_cgroup_oom_unlock(memcg);
}

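/*
 * Serializes hard limit updates (memory, memsw and kmem.tcp).
 * mem_cgroup_resize_max() retries reclaim until the usage fits under the new
 * limit or gives up with -EBUSY.
 */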
static DEFINE_MUTEX(memcg_max_mutex);

static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
				 unsigned long max, bool memsw)
{
	bool enlarge = false;
	bool drained = false;
	int ret;
	bool limits_invariant;
	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_max_mutex);
		/*
		 * Make sure that the new limit (memsw or memory limit) doesn't
		 * break our basic invariant rule memory.max <= memsw.max.
		 */
		limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
					   max <= memcg->memsw.max;
		if (!limits_invariant) {
			mutex_unlock(&memcg_max_mutex);
			ret = -EINVAL;
			break;
		}
		if (max > counter->max)
			enlarge = true;
		ret = page_counter_set_max(counter, max);
		mutex_unlock(&memcg_max_mutex);

		if (!ret)
			break;

		if (!drained) {
			drain_all_stock(memcg);
			drained = true;
			continue;
		}

		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
			ret = -EBUSY;
			break;
		}
	} while (true);

	if (!ret && enlarge)
		memcg1_oom_recover(memcg);

	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		if (signal_pending(current))
			return -EINTR;

		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
						  MEMCG_RECLAIM_MAY_SWAP, NULL))
			nr_retries--;
	}

	return 0;
}

static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return 1;
}

static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	if (val == 1)
		return 0;

	pr_warn_once("Non-hierarchical mode is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	return -EINVAL;
}

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
	default:
		BUG();
	}
}

/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}

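/*
 * Set the kmem.tcp limit and, on first use, enable socket memory accounting
 * for this memcg.
 */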
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
	int ret;

	mutex_lock(&memcg_max_mutex);

	ret = page_counter_set_max(&memcg->tcpmem, max);
	if (ret)
		goto out;

	if (!memcg->tcpmem_active) {
		/*
		 * The active flag needs to be written after the static_key
		 * update. This is what guarantees that the socket activation
		 * function is the last one to run. See mem_cgroup_sk_alloc()
		 * for details, and note that we don't mark any socket as
		 * belonging to this memcg until that flag is up.
		 *
		 * We need to do this, because static_keys will span multiple
		 * sites, but we can't control their order. If we mark a socket
		 * as accounted, but the accounting functions are not patched in
		 * yet, we'll lose accounting.
		 *
		 * We never race with the readers in mem_cgroup_sk_alloc(),
		 * because when this value changes, the code to process it is
		 * not patched in yet.
		 */
		static_branch_inc(&memcg_sockets_enabled_key);
		memcg->tcpmem_active = true;
	}
out:
	mutex_unlock(&memcg_max_mutex);
	return ret;
}

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Writing any value to this file has no effect. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = 0;
			break;
		case _TCP:
			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			ret = -EOPNOTSUPP;
		} else {
			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			WRITE_ONCE(memcg->soft_limit, nr_pages);
			ret = 0;
		}
		break;
	}
	return ret ?: nbytes;
}

static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}

#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL	     ((1 << NR_LRU_LISTS) - 1)

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
				int nid, unsigned int lru_mask, bool tree)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
		else
			nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask,
					     bool tree)
{
	unsigned long nr = 0;
	enum lru_list lru;

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		if (tree)
			nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
		else
			nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
	}
	return nr;
}

static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

	mem_cgroup_flush_stats(memcg);

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		seq_printf(m, "%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   false));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, false));
		seq_putc(m, '\n');
	}

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {

		seq_printf(m, "hierarchical_%s=%lu", stat->name,
			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
						   true));
		for_each_node_state(nid, N_MEMORY)
			seq_printf(m, " N%d=%lu", nid,
				   mem_cgroup_node_nr_lru_pages(memcg, nid,
							stat->lru_mask, true));
		seq_putc(m, '\n');
	}

	return 0;
}
#endif /* CONFIG_NUMA */

static const unsigned int memcg1_stats[] = {
	NR_FILE_PAGES,
	NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	NR_ANON_THPS,
#endif
	NR_SHMEM,
	NR_FILE_MAPPED,
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	WORKINGSET_REFAULT_ANON,
	WORKINGSET_REFAULT_FILE,
#ifdef CONFIG_SWAP
	MEMCG_SWAP,
	NR_SWAPCACHE,
#endif
};

static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
#ifdef CONFIG_SWAP
	"swap",
	"swapcached",
#endif
};

/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
	PGPGIN,
	PGPGOUT,
	PGFAULT,
	PGMAJFAULT,
};

void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

	mem_cgroup_flush_stats(memcg);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
			       memcg_events_local(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
			       memcg_page_state_local(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

	/* Hierarchical information */
	memory = memsw = PAGE_COUNTER_MAX;
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, READ_ONCE(mi->memory.max));
		memsw = min(memsw, READ_ONCE(mi->memsw.max));
	}
	seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
		       (u64)memory * PAGE_SIZE);
	seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
		       (u64)memsw * PAGE_SIZE);

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		unsigned long nr;

		nr = memcg_page_state_output(memcg, memcg1_stats[i]);
		seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
			       (u64)nr);
	}

	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
		seq_buf_printf(s, "total_%s %llu\n",
			       vm_event_name(memcg1_events[i]),
			       (u64)memcg_events(memcg, memcg1_events[i]));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
			       (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
			       PAGE_SIZE);

#ifdef CONFIG_DEBUG_VM
	{
		pg_data_t *pgdat;
		struct mem_cgroup_per_node *mz;
		unsigned long anon_cost = 0;
		unsigned long file_cost = 0;

		for_each_online_pgdat(pgdat) {
			mz = memcg->nodeinfo[pgdat->node_id];

			anon_cost += mz->lruvec.anon_cost;
			file_cost += mz->lruvec.file_cost;
		}
		seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
		seq_buf_printf(s, "file_cost %lu\n", file_cost);
	}
#endif
}

static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return mem_cgroup_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > MAX_SWAPPINESS)
		return -EINVAL;

	if (!mem_cgroup_is_root(memcg))
		WRITE_ONCE(memcg->swappiness, val);
	else
		WRITE_ONCE(vm_swappiness, val);

	return 0;
}

static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);

	seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
	seq_printf(sf, "oom_kill %lu\n",
		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
	return 0;
}

static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	pr_warn_once("oom_control is deprecated and will be removed. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
		return -EINVAL;

	WRITE_ONCE(memcg->oom_kill_disable, val);
	if (!val)
		memcg1_oom_recover(memcg);

	return 0;
}

#ifdef CONFIG_SLUB_DEBUG
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
#endif

struct cftype mem_cgroup_legacy_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "stat",
		.seq_show = memory_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
	},
	{
		.name = "pressure_level",
		.seq_show = mem_cgroup_dummy_seq_show,
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#ifdef CONFIG_SLUB_DEBUG
	{
		.name = "kmem.slabinfo",
		.seq_show = mem_cgroup_slab_show,
	},
#endif
	{
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

struct cftype memsw_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		if (nr_pages > 0)
			page_counter_charge(&memcg->kmem, nr_pages);
		else
			page_counter_uncharge(&memcg->kmem, -nr_pages);
	}
}

bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
			 gfp_t gfp_mask)
{
	struct page_counter *fail;

	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
		memcg->tcpmem_pressure = 0;
		return true;
	}
	memcg->tcpmem_pressure = 1;
	if (gfp_mask & __GFP_NOFAIL) {
		page_counter_charge(&memcg->tcpmem, nr_pages);
		return true;
	}
	return false;
}

bool memcg1_alloc_events(struct mem_cgroup *memcg)
{
	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
						GFP_KERNEL_ACCOUNT);
	return !!memcg->events_percpu;
}

void memcg1_free_events(struct mem_cgroup *memcg)
{
	if (memcg->events_percpu)
		free_percpu(memcg->events_percpu);
}

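/*
 * Allocate and initialize the per-node soft limit trees at boot.
 */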
static int __init memcg1_init(void)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(memcg1_init);