1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include <linux/memcontrol.h> 4 #include <linux/swap.h> 5 #include <linux/mm_inline.h> 6 #include <linux/pagewalk.h> 7 #include <linux/backing-dev.h> 8 #include <linux/swap_cgroup.h> 9 #include <linux/eventfd.h> 10 #include <linux/poll.h> 11 #include <linux/sort.h> 12 #include <linux/file.h> 13 #include <linux/seq_buf.h> 14 15 #include "internal.h" 16 #include "swap.h" 17 #include "memcontrol-v1.h" 18 19 /* 20 * Cgroups above their limits are maintained in a RB-Tree, independent of 21 * their hierarchy representation 22 */ 23 24 struct mem_cgroup_tree_per_node { 25 struct rb_root rb_root; 26 struct rb_node *rb_rightmost; 27 spinlock_t lock; 28 }; 29 30 struct mem_cgroup_tree { 31 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 32 }; 33 34 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 35 36 /* 37 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft 38 * limit reclaim to prevent infinite loops, if they ever occur. 39 */ 40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 42 43 /* for OOM */ 44 struct mem_cgroup_eventfd_list { 45 struct list_head list; 46 struct eventfd_ctx *eventfd; 47 }; 48 49 /* 50 * cgroup_event represents events which userspace want to receive. 51 */ 52 struct mem_cgroup_event { 53 /* 54 * memcg which the event belongs to. 55 */ 56 struct mem_cgroup *memcg; 57 /* 58 * eventfd to signal userspace about the event. 59 */ 60 struct eventfd_ctx *eventfd; 61 /* 62 * Each of these stored in a list by the cgroup. 63 */ 64 struct list_head list; 65 /* 66 * register_event() callback will be used to add new userspace 67 * waiter for changes related to this event. Use eventfd_signal() 68 * on eventfd to send notification to userspace. 69 */ 70 int (*register_event)(struct mem_cgroup *memcg, 71 struct eventfd_ctx *eventfd, const char *args); 72 /* 73 * unregister_event() callback will be called when userspace closes 74 * the eventfd or on cgroup removing. This callback must be set, 75 * if you want provide notification functionality. 76 */ 77 void (*unregister_event)(struct mem_cgroup *memcg, 78 struct eventfd_ctx *eventfd); 79 /* 80 * All fields below needed to unregister event when 81 * userspace closes eventfd. 82 */ 83 poll_table pt; 84 wait_queue_head_t *wqh; 85 wait_queue_entry_t wait; 86 struct work_struct remove; 87 }; 88 89 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 90 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 91 #define MEMFILE_ATTR(val) ((val) & 0xffff) 92 93 enum { 94 RES_USAGE, 95 RES_LIMIT, 96 RES_MAX_USAGE, 97 RES_FAILCNT, 98 RES_SOFT_LIMIT, 99 }; 100 101 #ifdef CONFIG_LOCKDEP 102 static struct lockdep_map memcg_oom_lock_dep_map = { 103 .name = "memcg_oom_lock", 104 }; 105 #endif 106 107 DEFINE_SPINLOCK(memcg_oom_lock); 108 109 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, 110 struct mem_cgroup_tree_per_node *mctz, 111 unsigned long new_usage_in_excess) 112 { 113 struct rb_node **p = &mctz->rb_root.rb_node; 114 struct rb_node *parent = NULL; 115 struct mem_cgroup_per_node *mz_node; 116 bool rightmost = true; 117 118 if (mz->on_tree) 119 return; 120 121 mz->usage_in_excess = new_usage_in_excess; 122 if (!mz->usage_in_excess) 123 return; 124 while (*p) { 125 parent = *p; 126 mz_node = rb_entry(parent, struct mem_cgroup_per_node, 127 tree_node); 128 if (mz->usage_in_excess < mz_node->usage_in_excess) { 129 p = &(*p)->rb_left; 130 rightmost = false; 131 } else { 132 p = &(*p)->rb_right; 133 } 134 } 135 136 if (rightmost) 137 mctz->rb_rightmost = &mz->tree_node; 138 139 rb_link_node(&mz->tree_node, parent, p); 140 rb_insert_color(&mz->tree_node, &mctz->rb_root); 141 mz->on_tree = true; 142 } 143 144 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 145 struct mem_cgroup_tree_per_node *mctz) 146 { 147 if (!mz->on_tree) 148 return; 149 150 if (&mz->tree_node == mctz->rb_rightmost) 151 mctz->rb_rightmost = rb_prev(&mz->tree_node); 152 153 rb_erase(&mz->tree_node, &mctz->rb_root); 154 mz->on_tree = false; 155 } 156 157 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, 158 struct mem_cgroup_tree_per_node *mctz) 159 { 160 unsigned long flags; 161 162 spin_lock_irqsave(&mctz->lock, flags); 163 __mem_cgroup_remove_exceeded(mz, mctz); 164 spin_unlock_irqrestore(&mctz->lock, flags); 165 } 166 167 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 168 { 169 unsigned long nr_pages = page_counter_read(&memcg->memory); 170 unsigned long soft_limit = READ_ONCE(memcg->soft_limit); 171 unsigned long excess = 0; 172 173 if (nr_pages > soft_limit) 174 excess = nr_pages - soft_limit; 175 176 return excess; 177 } 178 179 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid) 180 { 181 unsigned long excess; 182 struct mem_cgroup_per_node *mz; 183 struct mem_cgroup_tree_per_node *mctz; 184 185 if (lru_gen_enabled()) { 186 if (soft_limit_excess(memcg)) 187 lru_gen_soft_reclaim(memcg, nid); 188 return; 189 } 190 191 mctz = soft_limit_tree.rb_tree_per_node[nid]; 192 if (!mctz) 193 return; 194 /* 195 * Necessary to update all ancestors when hierarchy is used. 196 * because their event counter is not touched. 197 */ 198 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 199 mz = memcg->nodeinfo[nid]; 200 excess = soft_limit_excess(memcg); 201 /* 202 * We have to update the tree if mz is on RB-tree or 203 * mem is over its softlimit. 204 */ 205 if (excess || mz->on_tree) { 206 unsigned long flags; 207 208 spin_lock_irqsave(&mctz->lock, flags); 209 /* if on-tree, remove it */ 210 if (mz->on_tree) 211 __mem_cgroup_remove_exceeded(mz, mctz); 212 /* 213 * Insert again. mz->usage_in_excess will be updated. 214 * If excess is 0, no tree ops. 215 */ 216 __mem_cgroup_insert_exceeded(mz, mctz, excess); 217 spin_unlock_irqrestore(&mctz->lock, flags); 218 } 219 } 220 } 221 222 void memcg1_remove_from_trees(struct mem_cgroup *memcg) 223 { 224 struct mem_cgroup_tree_per_node *mctz; 225 struct mem_cgroup_per_node *mz; 226 int nid; 227 228 for_each_node(nid) { 229 mz = memcg->nodeinfo[nid]; 230 mctz = soft_limit_tree.rb_tree_per_node[nid]; 231 if (mctz) 232 mem_cgroup_remove_exceeded(mz, mctz); 233 } 234 } 235 236 static struct mem_cgroup_per_node * 237 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 238 { 239 struct mem_cgroup_per_node *mz; 240 241 retry: 242 mz = NULL; 243 if (!mctz->rb_rightmost) 244 goto done; /* Nothing to reclaim from */ 245 246 mz = rb_entry(mctz->rb_rightmost, 247 struct mem_cgroup_per_node, tree_node); 248 /* 249 * Remove the node now but someone else can add it back, 250 * we will to add it back at the end of reclaim to its correct 251 * position in the tree. 252 */ 253 __mem_cgroup_remove_exceeded(mz, mctz); 254 if (!soft_limit_excess(mz->memcg) || 255 !css_tryget(&mz->memcg->css)) 256 goto retry; 257 done: 258 return mz; 259 } 260 261 static struct mem_cgroup_per_node * 262 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) 263 { 264 struct mem_cgroup_per_node *mz; 265 266 spin_lock_irq(&mctz->lock); 267 mz = __mem_cgroup_largest_soft_limit_node(mctz); 268 spin_unlock_irq(&mctz->lock); 269 return mz; 270 } 271 272 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 273 pg_data_t *pgdat, 274 gfp_t gfp_mask, 275 unsigned long *total_scanned) 276 { 277 struct mem_cgroup *victim = NULL; 278 int total = 0; 279 int loop = 0; 280 unsigned long excess; 281 unsigned long nr_scanned; 282 struct mem_cgroup_reclaim_cookie reclaim = { 283 .pgdat = pgdat, 284 }; 285 286 excess = soft_limit_excess(root_memcg); 287 288 while (1) { 289 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 290 if (!victim) { 291 loop++; 292 if (loop >= 2) { 293 /* 294 * If we have not been able to reclaim 295 * anything, it might because there are 296 * no reclaimable pages under this hierarchy 297 */ 298 if (!total) 299 break; 300 /* 301 * We want to do more targeted reclaim. 302 * excess >> 2 is not to excessive so as to 303 * reclaim too much, nor too less that we keep 304 * coming back to reclaim from this cgroup 305 */ 306 if (total >= (excess >> 2) || 307 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 308 break; 309 } 310 continue; 311 } 312 total += mem_cgroup_shrink_node(victim, gfp_mask, false, 313 pgdat, &nr_scanned); 314 *total_scanned += nr_scanned; 315 if (!soft_limit_excess(root_memcg)) 316 break; 317 } 318 mem_cgroup_iter_break(root_memcg, victim); 319 return total; 320 } 321 322 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, 323 gfp_t gfp_mask, 324 unsigned long *total_scanned) 325 { 326 unsigned long nr_reclaimed = 0; 327 struct mem_cgroup_per_node *mz, *next_mz = NULL; 328 unsigned long reclaimed; 329 int loop = 0; 330 struct mem_cgroup_tree_per_node *mctz; 331 unsigned long excess; 332 333 if (lru_gen_enabled()) 334 return 0; 335 336 if (order > 0) 337 return 0; 338 339 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; 340 341 /* 342 * Do not even bother to check the largest node if the root 343 * is empty. Do it lockless to prevent lock bouncing. Races 344 * are acceptable as soft limit is best effort anyway. 345 */ 346 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) 347 return 0; 348 349 /* 350 * This loop can run a while, specially if mem_cgroup's continuously 351 * keep exceeding their soft limit and putting the system under 352 * pressure 353 */ 354 do { 355 if (next_mz) 356 mz = next_mz; 357 else 358 mz = mem_cgroup_largest_soft_limit_node(mctz); 359 if (!mz) 360 break; 361 362 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, 363 gfp_mask, total_scanned); 364 nr_reclaimed += reclaimed; 365 spin_lock_irq(&mctz->lock); 366 367 /* 368 * If we failed to reclaim anything from this memory cgroup 369 * it is time to move on to the next cgroup 370 */ 371 next_mz = NULL; 372 if (!reclaimed) 373 next_mz = __mem_cgroup_largest_soft_limit_node(mctz); 374 375 excess = soft_limit_excess(mz->memcg); 376 /* 377 * One school of thought says that we should not add 378 * back the node to the tree if reclaim returns 0. 379 * But our reclaim could return 0, simply because due 380 * to priority we are exposing a smaller subset of 381 * memory to reclaim from. Consider this as a longer 382 * term TODO. 383 */ 384 /* If excess == 0, no tree ops */ 385 __mem_cgroup_insert_exceeded(mz, mctz, excess); 386 spin_unlock_irq(&mctz->lock); 387 css_put(&mz->memcg->css); 388 loop++; 389 /* 390 * Could not reclaim anything and there are no more 391 * mem cgroups to try or we seem to be looping without 392 * reclaiming anything. 393 */ 394 if (!nr_reclaimed && 395 (next_mz == NULL || 396 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 397 break; 398 } while (!nr_reclaimed); 399 if (next_mz) 400 css_put(&next_mz->memcg->css); 401 return nr_reclaimed; 402 } 403 404 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 405 struct cftype *cft) 406 { 407 return 0; 408 } 409 410 #ifdef CONFIG_MMU 411 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 412 struct cftype *cft, u64 val) 413 { 414 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " 415 "Please report your usecase to linux-mm@kvack.org if you " 416 "depend on this functionality.\n"); 417 418 if (val != 0) 419 return -EINVAL; 420 return 0; 421 } 422 #else 423 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 424 struct cftype *cft, u64 val) 425 { 426 return -ENOSYS; 427 } 428 #endif 429 430 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 431 { 432 struct mem_cgroup_threshold_ary *t; 433 unsigned long usage; 434 int i; 435 436 rcu_read_lock(); 437 if (!swap) 438 t = rcu_dereference(memcg->thresholds.primary); 439 else 440 t = rcu_dereference(memcg->memsw_thresholds.primary); 441 442 if (!t) 443 goto unlock; 444 445 usage = mem_cgroup_usage(memcg, swap); 446 447 /* 448 * current_threshold points to threshold just below or equal to usage. 449 * If it's not true, a threshold was crossed after last 450 * call of __mem_cgroup_threshold(). 451 */ 452 i = t->current_threshold; 453 454 /* 455 * Iterate backward over array of thresholds starting from 456 * current_threshold and check if a threshold is crossed. 457 * If none of thresholds below usage is crossed, we read 458 * only one element of the array here. 459 */ 460 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 461 eventfd_signal(t->entries[i].eventfd); 462 463 /* i = current_threshold + 1 */ 464 i++; 465 466 /* 467 * Iterate forward over array of thresholds starting from 468 * current_threshold+1 and check if a threshold is crossed. 469 * If none of thresholds above usage is crossed, we read 470 * only one element of the array here. 471 */ 472 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 473 eventfd_signal(t->entries[i].eventfd); 474 475 /* Update current_threshold */ 476 t->current_threshold = i - 1; 477 unlock: 478 rcu_read_unlock(); 479 } 480 481 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 482 { 483 while (memcg) { 484 __mem_cgroup_threshold(memcg, false); 485 if (do_memsw_account()) 486 __mem_cgroup_threshold(memcg, true); 487 488 memcg = parent_mem_cgroup(memcg); 489 } 490 } 491 492 /* Cgroup1: threshold notifications & softlimit tree updates */ 493 struct memcg1_events_percpu { 494 unsigned long nr_page_events; 495 unsigned long targets[MEM_CGROUP_NTARGETS]; 496 }; 497 498 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) 499 { 500 /* pagein of a big page is an event. So, ignore page size */ 501 if (nr_pages > 0) 502 __count_memcg_events(memcg, PGPGIN, 1); 503 else { 504 __count_memcg_events(memcg, PGPGOUT, 1); 505 nr_pages = -nr_pages; /* for event */ 506 } 507 508 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages); 509 } 510 511 #define THRESHOLDS_EVENTS_TARGET 128 512 #define SOFTLIMIT_EVENTS_TARGET 1024 513 514 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg, 515 enum mem_cgroup_events_target target) 516 { 517 unsigned long val, next; 518 519 val = __this_cpu_read(memcg->events_percpu->nr_page_events); 520 next = __this_cpu_read(memcg->events_percpu->targets[target]); 521 /* from time_after() in jiffies.h */ 522 if ((long)(next - val) < 0) { 523 switch (target) { 524 case MEM_CGROUP_TARGET_THRESH: 525 next = val + THRESHOLDS_EVENTS_TARGET; 526 break; 527 case MEM_CGROUP_TARGET_SOFTLIMIT: 528 next = val + SOFTLIMIT_EVENTS_TARGET; 529 break; 530 default: 531 break; 532 } 533 __this_cpu_write(memcg->events_percpu->targets[target], next); 534 return true; 535 } 536 return false; 537 } 538 539 /* 540 * Check events in order. 541 * 542 */ 543 static void memcg1_check_events(struct mem_cgroup *memcg, int nid) 544 { 545 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 546 return; 547 548 /* threshold event is triggered in finer grain than soft limit */ 549 if (unlikely(memcg1_event_ratelimit(memcg, 550 MEM_CGROUP_TARGET_THRESH))) { 551 bool do_softlimit; 552 553 do_softlimit = memcg1_event_ratelimit(memcg, 554 MEM_CGROUP_TARGET_SOFTLIMIT); 555 mem_cgroup_threshold(memcg); 556 if (unlikely(do_softlimit)) 557 memcg1_update_tree(memcg, nid); 558 } 559 } 560 561 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 562 { 563 unsigned long flags; 564 565 local_irq_save(flags); 566 memcg1_charge_statistics(memcg, folio_nr_pages(folio)); 567 memcg1_check_events(memcg, folio_nid(folio)); 568 local_irq_restore(flags); 569 } 570 571 void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) 572 { 573 /* 574 * Interrupts should be disabled here because the caller holds the 575 * i_pages lock which is taken with interrupts-off. It is 576 * important here to have the interrupts disabled because it is the 577 * only synchronisation we have for updating the per-CPU variables. 578 */ 579 preempt_disable_nested(); 580 VM_WARN_ON_IRQS_ENABLED(); 581 memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); 582 preempt_enable_nested(); 583 memcg1_check_events(memcg, folio_nid(folio)); 584 } 585 586 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 587 unsigned long nr_memory, int nid) 588 { 589 unsigned long flags; 590 591 local_irq_save(flags); 592 __count_memcg_events(memcg, PGPGOUT, pgpgout); 593 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); 594 memcg1_check_events(memcg, nid); 595 local_irq_restore(flags); 596 } 597 598 static int compare_thresholds(const void *a, const void *b) 599 { 600 const struct mem_cgroup_threshold *_a = a; 601 const struct mem_cgroup_threshold *_b = b; 602 603 if (_a->threshold > _b->threshold) 604 return 1; 605 606 if (_a->threshold < _b->threshold) 607 return -1; 608 609 return 0; 610 } 611 612 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 613 { 614 struct mem_cgroup_eventfd_list *ev; 615 616 spin_lock(&memcg_oom_lock); 617 618 list_for_each_entry(ev, &memcg->oom_notify, list) 619 eventfd_signal(ev->eventfd); 620 621 spin_unlock(&memcg_oom_lock); 622 return 0; 623 } 624 625 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 626 { 627 struct mem_cgroup *iter; 628 629 for_each_mem_cgroup_tree(iter, memcg) 630 mem_cgroup_oom_notify_cb(iter); 631 } 632 633 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 634 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 635 { 636 struct mem_cgroup_thresholds *thresholds; 637 struct mem_cgroup_threshold_ary *new; 638 unsigned long threshold; 639 unsigned long usage; 640 int i, size, ret; 641 642 ret = page_counter_memparse(args, "-1", &threshold); 643 if (ret) 644 return ret; 645 646 mutex_lock(&memcg->thresholds_lock); 647 648 if (type == _MEM) { 649 thresholds = &memcg->thresholds; 650 usage = mem_cgroup_usage(memcg, false); 651 } else if (type == _MEMSWAP) { 652 thresholds = &memcg->memsw_thresholds; 653 usage = mem_cgroup_usage(memcg, true); 654 } else 655 BUG(); 656 657 /* Check if a threshold crossed before adding a new one */ 658 if (thresholds->primary) 659 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 660 661 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 662 663 /* Allocate memory for new array of thresholds */ 664 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 665 if (!new) { 666 ret = -ENOMEM; 667 goto unlock; 668 } 669 new->size = size; 670 671 /* Copy thresholds (if any) to new array */ 672 if (thresholds->primary) 673 memcpy(new->entries, thresholds->primary->entries, 674 flex_array_size(new, entries, size - 1)); 675 676 /* Add new threshold */ 677 new->entries[size - 1].eventfd = eventfd; 678 new->entries[size - 1].threshold = threshold; 679 680 /* Sort thresholds. Registering of new threshold isn't time-critical */ 681 sort(new->entries, size, sizeof(*new->entries), 682 compare_thresholds, NULL); 683 684 /* Find current threshold */ 685 new->current_threshold = -1; 686 for (i = 0; i < size; i++) { 687 if (new->entries[i].threshold <= usage) { 688 /* 689 * new->current_threshold will not be used until 690 * rcu_assign_pointer(), so it's safe to increment 691 * it here. 692 */ 693 ++new->current_threshold; 694 } else 695 break; 696 } 697 698 /* Free old spare buffer and save old primary buffer as spare */ 699 kfree(thresholds->spare); 700 thresholds->spare = thresholds->primary; 701 702 rcu_assign_pointer(thresholds->primary, new); 703 704 /* To be sure that nobody uses thresholds */ 705 synchronize_rcu(); 706 707 unlock: 708 mutex_unlock(&memcg->thresholds_lock); 709 710 return ret; 711 } 712 713 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 714 struct eventfd_ctx *eventfd, const char *args) 715 { 716 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 717 } 718 719 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 720 struct eventfd_ctx *eventfd, const char *args) 721 { 722 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 723 } 724 725 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 726 struct eventfd_ctx *eventfd, enum res_type type) 727 { 728 struct mem_cgroup_thresholds *thresholds; 729 struct mem_cgroup_threshold_ary *new; 730 unsigned long usage; 731 int i, j, size, entries; 732 733 mutex_lock(&memcg->thresholds_lock); 734 735 if (type == _MEM) { 736 thresholds = &memcg->thresholds; 737 usage = mem_cgroup_usage(memcg, false); 738 } else if (type == _MEMSWAP) { 739 thresholds = &memcg->memsw_thresholds; 740 usage = mem_cgroup_usage(memcg, true); 741 } else 742 BUG(); 743 744 if (!thresholds->primary) 745 goto unlock; 746 747 /* Check if a threshold crossed before removing */ 748 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 749 750 /* Calculate new number of threshold */ 751 size = entries = 0; 752 for (i = 0; i < thresholds->primary->size; i++) { 753 if (thresholds->primary->entries[i].eventfd != eventfd) 754 size++; 755 else 756 entries++; 757 } 758 759 new = thresholds->spare; 760 761 /* If no items related to eventfd have been cleared, nothing to do */ 762 if (!entries) 763 goto unlock; 764 765 /* Set thresholds array to NULL if we don't have thresholds */ 766 if (!size) { 767 kfree(new); 768 new = NULL; 769 goto swap_buffers; 770 } 771 772 new->size = size; 773 774 /* Copy thresholds and find current threshold */ 775 new->current_threshold = -1; 776 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 777 if (thresholds->primary->entries[i].eventfd == eventfd) 778 continue; 779 780 new->entries[j] = thresholds->primary->entries[i]; 781 if (new->entries[j].threshold <= usage) { 782 /* 783 * new->current_threshold will not be used 784 * until rcu_assign_pointer(), so it's safe to increment 785 * it here. 786 */ 787 ++new->current_threshold; 788 } 789 j++; 790 } 791 792 swap_buffers: 793 /* Swap primary and spare array */ 794 thresholds->spare = thresholds->primary; 795 796 rcu_assign_pointer(thresholds->primary, new); 797 798 /* To be sure that nobody uses thresholds */ 799 synchronize_rcu(); 800 801 /* If all events are unregistered, free the spare array */ 802 if (!new) { 803 kfree(thresholds->spare); 804 thresholds->spare = NULL; 805 } 806 unlock: 807 mutex_unlock(&memcg->thresholds_lock); 808 } 809 810 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 811 struct eventfd_ctx *eventfd) 812 { 813 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 814 } 815 816 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 817 struct eventfd_ctx *eventfd) 818 { 819 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 820 } 821 822 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 823 struct eventfd_ctx *eventfd, const char *args) 824 { 825 struct mem_cgroup_eventfd_list *event; 826 827 event = kmalloc(sizeof(*event), GFP_KERNEL); 828 if (!event) 829 return -ENOMEM; 830 831 spin_lock(&memcg_oom_lock); 832 833 event->eventfd = eventfd; 834 list_add(&event->list, &memcg->oom_notify); 835 836 /* already in OOM ? */ 837 if (memcg->under_oom) 838 eventfd_signal(eventfd); 839 spin_unlock(&memcg_oom_lock); 840 841 return 0; 842 } 843 844 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 845 struct eventfd_ctx *eventfd) 846 { 847 struct mem_cgroup_eventfd_list *ev, *tmp; 848 849 spin_lock(&memcg_oom_lock); 850 851 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 852 if (ev->eventfd == eventfd) { 853 list_del(&ev->list); 854 kfree(ev); 855 } 856 } 857 858 spin_unlock(&memcg_oom_lock); 859 } 860 861 /* 862 * DO NOT USE IN NEW FILES. 863 * 864 * "cgroup.event_control" implementation. 865 * 866 * This is way over-engineered. It tries to support fully configurable 867 * events for each user. Such level of flexibility is completely 868 * unnecessary especially in the light of the planned unified hierarchy. 869 * 870 * Please deprecate this and replace with something simpler if at all 871 * possible. 872 */ 873 874 /* 875 * Unregister event and free resources. 876 * 877 * Gets called from workqueue. 878 */ 879 static void memcg_event_remove(struct work_struct *work) 880 { 881 struct mem_cgroup_event *event = 882 container_of(work, struct mem_cgroup_event, remove); 883 struct mem_cgroup *memcg = event->memcg; 884 885 remove_wait_queue(event->wqh, &event->wait); 886 887 event->unregister_event(memcg, event->eventfd); 888 889 /* Notify userspace the event is going away. */ 890 eventfd_signal(event->eventfd); 891 892 eventfd_ctx_put(event->eventfd); 893 kfree(event); 894 css_put(&memcg->css); 895 } 896 897 /* 898 * Gets called on EPOLLHUP on eventfd when user closes it. 899 * 900 * Called with wqh->lock held and interrupts disabled. 901 */ 902 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, 903 int sync, void *key) 904 { 905 struct mem_cgroup_event *event = 906 container_of(wait, struct mem_cgroup_event, wait); 907 struct mem_cgroup *memcg = event->memcg; 908 __poll_t flags = key_to_poll(key); 909 910 if (flags & EPOLLHUP) { 911 /* 912 * If the event has been detached at cgroup removal, we 913 * can simply return knowing the other side will cleanup 914 * for us. 915 * 916 * We can't race against event freeing since the other 917 * side will require wqh->lock via remove_wait_queue(), 918 * which we hold. 919 */ 920 spin_lock(&memcg->event_list_lock); 921 if (!list_empty(&event->list)) { 922 list_del_init(&event->list); 923 /* 924 * We are in atomic context, but cgroup_event_remove() 925 * may sleep, so we have to call it in workqueue. 926 */ 927 schedule_work(&event->remove); 928 } 929 spin_unlock(&memcg->event_list_lock); 930 } 931 932 return 0; 933 } 934 935 static void memcg_event_ptable_queue_proc(struct file *file, 936 wait_queue_head_t *wqh, poll_table *pt) 937 { 938 struct mem_cgroup_event *event = 939 container_of(pt, struct mem_cgroup_event, pt); 940 941 event->wqh = wqh; 942 add_wait_queue(wqh, &event->wait); 943 } 944 945 /* 946 * DO NOT USE IN NEW FILES. 947 * 948 * Parse input and register new cgroup event handler. 949 * 950 * Input must be in format '<event_fd> <control_fd> <args>'. 951 * Interpretation of args is defined by control file implementation. 952 */ 953 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 954 char *buf, size_t nbytes, loff_t off) 955 { 956 struct cgroup_subsys_state *css = of_css(of); 957 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 958 struct mem_cgroup_event *event; 959 struct cgroup_subsys_state *cfile_css; 960 unsigned int efd, cfd; 961 struct dentry *cdentry; 962 const char *name; 963 char *endp; 964 int ret; 965 966 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 967 return -EOPNOTSUPP; 968 969 buf = strstrip(buf); 970 971 efd = simple_strtoul(buf, &endp, 10); 972 if (*endp != ' ') 973 return -EINVAL; 974 buf = endp + 1; 975 976 cfd = simple_strtoul(buf, &endp, 10); 977 if (*endp == '\0') 978 buf = endp; 979 else if (*endp == ' ') 980 buf = endp + 1; 981 else 982 return -EINVAL; 983 984 CLASS(fd, efile)(efd); 985 if (fd_empty(efile)) 986 return -EBADF; 987 988 CLASS(fd, cfile)(cfd); 989 990 event = kzalloc(sizeof(*event), GFP_KERNEL); 991 if (!event) 992 return -ENOMEM; 993 994 event->memcg = memcg; 995 INIT_LIST_HEAD(&event->list); 996 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 997 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 998 INIT_WORK(&event->remove, memcg_event_remove); 999 1000 event->eventfd = eventfd_ctx_fileget(fd_file(efile)); 1001 if (IS_ERR(event->eventfd)) { 1002 ret = PTR_ERR(event->eventfd); 1003 goto out_kfree; 1004 } 1005 1006 if (fd_empty(cfile)) { 1007 ret = -EBADF; 1008 goto out_put_eventfd; 1009 } 1010 1011 /* the process need read permission on control file */ 1012 /* AV: shouldn't we check that it's been opened for read instead? */ 1013 ret = file_permission(fd_file(cfile), MAY_READ); 1014 if (ret < 0) 1015 goto out_put_eventfd; 1016 1017 /* 1018 * The control file must be a regular cgroup1 file. As a regular cgroup 1019 * file can't be renamed, it's safe to access its name afterwards. 1020 */ 1021 cdentry = fd_file(cfile)->f_path.dentry; 1022 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 1023 ret = -EINVAL; 1024 goto out_put_eventfd; 1025 } 1026 1027 /* 1028 * Determine the event callbacks and set them in @event. This used 1029 * to be done via struct cftype but cgroup core no longer knows 1030 * about these events. The following is crude but the whole thing 1031 * is for compatibility anyway. 1032 * 1033 * DO NOT ADD NEW FILES. 1034 */ 1035 name = cdentry->d_name.name; 1036 1037 if (!strcmp(name, "memory.usage_in_bytes")) { 1038 event->register_event = mem_cgroup_usage_register_event; 1039 event->unregister_event = mem_cgroup_usage_unregister_event; 1040 } else if (!strcmp(name, "memory.oom_control")) { 1041 pr_warn_once("oom_control is deprecated and will be removed. " 1042 "Please report your usecase to linux-mm-@kvack.org" 1043 " if you depend on this functionality. \n"); 1044 event->register_event = mem_cgroup_oom_register_event; 1045 event->unregister_event = mem_cgroup_oom_unregister_event; 1046 } else if (!strcmp(name, "memory.pressure_level")) { 1047 pr_warn_once("pressure_level is deprecated and will be removed. " 1048 "Please report your usecase to linux-mm-@kvack.org " 1049 "if you depend on this functionality. \n"); 1050 event->register_event = vmpressure_register_event; 1051 event->unregister_event = vmpressure_unregister_event; 1052 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 1053 event->register_event = memsw_cgroup_usage_register_event; 1054 event->unregister_event = memsw_cgroup_usage_unregister_event; 1055 } else { 1056 ret = -EINVAL; 1057 goto out_put_eventfd; 1058 } 1059 1060 /* 1061 * Verify @cfile should belong to @css. Also, remaining events are 1062 * automatically removed on cgroup destruction but the removal is 1063 * asynchronous, so take an extra ref on @css. 1064 */ 1065 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 1066 &memory_cgrp_subsys); 1067 ret = -EINVAL; 1068 if (IS_ERR(cfile_css)) 1069 goto out_put_eventfd; 1070 if (cfile_css != css) 1071 goto out_put_css; 1072 1073 ret = event->register_event(memcg, event->eventfd, buf); 1074 if (ret) 1075 goto out_put_css; 1076 1077 vfs_poll(fd_file(efile), &event->pt); 1078 1079 spin_lock_irq(&memcg->event_list_lock); 1080 list_add(&event->list, &memcg->event_list); 1081 spin_unlock_irq(&memcg->event_list_lock); 1082 return nbytes; 1083 1084 out_put_css: 1085 css_put(cfile_css); 1086 out_put_eventfd: 1087 eventfd_ctx_put(event->eventfd); 1088 out_kfree: 1089 kfree(event); 1090 return ret; 1091 } 1092 1093 void memcg1_memcg_init(struct mem_cgroup *memcg) 1094 { 1095 INIT_LIST_HEAD(&memcg->oom_notify); 1096 mutex_init(&memcg->thresholds_lock); 1097 INIT_LIST_HEAD(&memcg->event_list); 1098 spin_lock_init(&memcg->event_list_lock); 1099 } 1100 1101 void memcg1_css_offline(struct mem_cgroup *memcg) 1102 { 1103 struct mem_cgroup_event *event, *tmp; 1104 1105 /* 1106 * Unregister events and notify userspace. 1107 * Notify userspace about cgroup removing only after rmdir of cgroup 1108 * directory to avoid race between userspace and kernelspace. 1109 */ 1110 spin_lock_irq(&memcg->event_list_lock); 1111 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 1112 list_del_init(&event->list); 1113 schedule_work(&event->remove); 1114 } 1115 spin_unlock_irq(&memcg->event_list_lock); 1116 } 1117 1118 /* 1119 * Check OOM-Killer is already running under our hierarchy. 1120 * If someone is running, return false. 1121 */ 1122 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1123 { 1124 struct mem_cgroup *iter, *failed = NULL; 1125 1126 spin_lock(&memcg_oom_lock); 1127 1128 for_each_mem_cgroup_tree(iter, memcg) { 1129 if (iter->oom_lock) { 1130 /* 1131 * this subtree of our hierarchy is already locked 1132 * so we cannot give a lock. 1133 */ 1134 failed = iter; 1135 mem_cgroup_iter_break(memcg, iter); 1136 break; 1137 } else 1138 iter->oom_lock = true; 1139 } 1140 1141 if (failed) { 1142 /* 1143 * OK, we failed to lock the whole subtree so we have 1144 * to clean up what we set up to the failing subtree 1145 */ 1146 for_each_mem_cgroup_tree(iter, memcg) { 1147 if (iter == failed) { 1148 mem_cgroup_iter_break(memcg, iter); 1149 break; 1150 } 1151 iter->oom_lock = false; 1152 } 1153 } else 1154 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1155 1156 spin_unlock(&memcg_oom_lock); 1157 1158 return !failed; 1159 } 1160 1161 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1162 { 1163 struct mem_cgroup *iter; 1164 1165 spin_lock(&memcg_oom_lock); 1166 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1167 for_each_mem_cgroup_tree(iter, memcg) 1168 iter->oom_lock = false; 1169 spin_unlock(&memcg_oom_lock); 1170 } 1171 1172 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1173 { 1174 struct mem_cgroup *iter; 1175 1176 spin_lock(&memcg_oom_lock); 1177 for_each_mem_cgroup_tree(iter, memcg) 1178 iter->under_oom++; 1179 spin_unlock(&memcg_oom_lock); 1180 } 1181 1182 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1183 { 1184 struct mem_cgroup *iter; 1185 1186 /* 1187 * Be careful about under_oom underflows because a child memcg 1188 * could have been added after mem_cgroup_mark_under_oom. 1189 */ 1190 spin_lock(&memcg_oom_lock); 1191 for_each_mem_cgroup_tree(iter, memcg) 1192 if (iter->under_oom > 0) 1193 iter->under_oom--; 1194 spin_unlock(&memcg_oom_lock); 1195 } 1196 1197 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1198 1199 struct oom_wait_info { 1200 struct mem_cgroup *memcg; 1201 wait_queue_entry_t wait; 1202 }; 1203 1204 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1205 unsigned mode, int sync, void *arg) 1206 { 1207 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1208 struct mem_cgroup *oom_wait_memcg; 1209 struct oom_wait_info *oom_wait_info; 1210 1211 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1212 oom_wait_memcg = oom_wait_info->memcg; 1213 1214 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1215 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1216 return 0; 1217 return autoremove_wake_function(wait, mode, sync, arg); 1218 } 1219 1220 void memcg1_oom_recover(struct mem_cgroup *memcg) 1221 { 1222 /* 1223 * For the following lockless ->under_oom test, the only required 1224 * guarantee is that it must see the state asserted by an OOM when 1225 * this function is called as a result of userland actions 1226 * triggered by the notification of the OOM. This is trivially 1227 * achieved by invoking mem_cgroup_mark_under_oom() before 1228 * triggering notification. 1229 */ 1230 if (memcg && memcg->under_oom) 1231 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1232 } 1233 1234 /** 1235 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1236 * @handle: actually kill/wait or just clean up the OOM state 1237 * 1238 * This has to be called at the end of a page fault if the memcg OOM 1239 * handler was enabled. 1240 * 1241 * Memcg supports userspace OOM handling where failed allocations must 1242 * sleep on a waitqueue until the userspace task resolves the 1243 * situation. Sleeping directly in the charge context with all kinds 1244 * of locks held is not a good idea, instead we remember an OOM state 1245 * in the task and mem_cgroup_oom_synchronize() has to be called at 1246 * the end of the page fault to complete the OOM handling. 1247 * 1248 * Returns %true if an ongoing memcg OOM situation was detected and 1249 * completed, %false otherwise. 1250 */ 1251 bool mem_cgroup_oom_synchronize(bool handle) 1252 { 1253 struct mem_cgroup *memcg = current->memcg_in_oom; 1254 struct oom_wait_info owait; 1255 bool locked; 1256 1257 /* OOM is global, do not handle */ 1258 if (!memcg) 1259 return false; 1260 1261 if (!handle) 1262 goto cleanup; 1263 1264 owait.memcg = memcg; 1265 owait.wait.flags = 0; 1266 owait.wait.func = memcg_oom_wake_function; 1267 owait.wait.private = current; 1268 INIT_LIST_HEAD(&owait.wait.entry); 1269 1270 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1271 mem_cgroup_mark_under_oom(memcg); 1272 1273 locked = mem_cgroup_oom_trylock(memcg); 1274 1275 if (locked) 1276 mem_cgroup_oom_notify(memcg); 1277 1278 schedule(); 1279 mem_cgroup_unmark_under_oom(memcg); 1280 finish_wait(&memcg_oom_waitq, &owait.wait); 1281 1282 if (locked) 1283 mem_cgroup_oom_unlock(memcg); 1284 cleanup: 1285 current->memcg_in_oom = NULL; 1286 css_put(&memcg->css); 1287 return true; 1288 } 1289 1290 1291 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) 1292 { 1293 /* 1294 * We are in the middle of the charge context here, so we 1295 * don't want to block when potentially sitting on a callstack 1296 * that holds all kinds of filesystem and mm locks. 1297 * 1298 * cgroup1 allows disabling the OOM killer and waiting for outside 1299 * handling until the charge can succeed; remember the context and put 1300 * the task to sleep at the end of the page fault when all locks are 1301 * released. 1302 * 1303 * On the other hand, in-kernel OOM killer allows for an async victim 1304 * memory reclaim (oom_reaper) and that means that we are not solely 1305 * relying on the oom victim to make a forward progress and we can 1306 * invoke the oom killer here. 1307 * 1308 * Please note that mem_cgroup_out_of_memory might fail to find a 1309 * victim and then we have to bail out from the charge path. 1310 */ 1311 if (READ_ONCE(memcg->oom_kill_disable)) { 1312 if (current->in_user_fault) { 1313 css_get(&memcg->css); 1314 current->memcg_in_oom = memcg; 1315 } 1316 return false; 1317 } 1318 1319 mem_cgroup_mark_under_oom(memcg); 1320 1321 *locked = mem_cgroup_oom_trylock(memcg); 1322 1323 if (*locked) 1324 mem_cgroup_oom_notify(memcg); 1325 1326 mem_cgroup_unmark_under_oom(memcg); 1327 1328 return true; 1329 } 1330 1331 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) 1332 { 1333 if (locked) 1334 mem_cgroup_oom_unlock(memcg); 1335 } 1336 1337 static DEFINE_MUTEX(memcg_max_mutex); 1338 1339 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 1340 unsigned long max, bool memsw) 1341 { 1342 bool enlarge = false; 1343 bool drained = false; 1344 int ret; 1345 bool limits_invariant; 1346 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 1347 1348 do { 1349 if (signal_pending(current)) { 1350 ret = -EINTR; 1351 break; 1352 } 1353 1354 mutex_lock(&memcg_max_mutex); 1355 /* 1356 * Make sure that the new limit (memsw or memory limit) doesn't 1357 * break our basic invariant rule memory.max <= memsw.max. 1358 */ 1359 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : 1360 max <= memcg->memsw.max; 1361 if (!limits_invariant) { 1362 mutex_unlock(&memcg_max_mutex); 1363 ret = -EINVAL; 1364 break; 1365 } 1366 if (max > counter->max) 1367 enlarge = true; 1368 ret = page_counter_set_max(counter, max); 1369 mutex_unlock(&memcg_max_mutex); 1370 1371 if (!ret) 1372 break; 1373 1374 if (!drained) { 1375 drain_all_stock(memcg); 1376 drained = true; 1377 continue; 1378 } 1379 1380 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1381 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { 1382 ret = -EBUSY; 1383 break; 1384 } 1385 } while (true); 1386 1387 if (!ret && enlarge) 1388 memcg1_oom_recover(memcg); 1389 1390 return ret; 1391 } 1392 1393 /* 1394 * Reclaims as many pages from the given memcg as possible. 1395 * 1396 * Caller is responsible for holding css reference for memcg. 1397 */ 1398 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 1399 { 1400 int nr_retries = MAX_RECLAIM_RETRIES; 1401 1402 /* we call try-to-free pages for make this cgroup empty */ 1403 lru_add_drain_all(); 1404 1405 drain_all_stock(memcg); 1406 1407 /* try to free all pages in this cgroup */ 1408 while (nr_retries && page_counter_read(&memcg->memory)) { 1409 if (signal_pending(current)) 1410 return -EINTR; 1411 1412 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1413 MEMCG_RECLAIM_MAY_SWAP, NULL)) 1414 nr_retries--; 1415 } 1416 1417 return 0; 1418 } 1419 1420 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 1421 char *buf, size_t nbytes, 1422 loff_t off) 1423 { 1424 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1425 1426 if (mem_cgroup_is_root(memcg)) 1427 return -EINVAL; 1428 return mem_cgroup_force_empty(memcg) ?: nbytes; 1429 } 1430 1431 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 1432 struct cftype *cft) 1433 { 1434 return 1; 1435 } 1436 1437 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 1438 struct cftype *cft, u64 val) 1439 { 1440 if (val == 1) 1441 return 0; 1442 1443 pr_warn_once("Non-hierarchical mode is deprecated. " 1444 "Please report your usecase to linux-mm@kvack.org if you " 1445 "depend on this functionality.\n"); 1446 1447 return -EINVAL; 1448 } 1449 1450 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 1451 struct cftype *cft) 1452 { 1453 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1454 struct page_counter *counter; 1455 1456 switch (MEMFILE_TYPE(cft->private)) { 1457 case _MEM: 1458 counter = &memcg->memory; 1459 break; 1460 case _MEMSWAP: 1461 counter = &memcg->memsw; 1462 break; 1463 case _KMEM: 1464 counter = &memcg->kmem; 1465 break; 1466 case _TCP: 1467 counter = &memcg->tcpmem; 1468 break; 1469 default: 1470 BUG(); 1471 } 1472 1473 switch (MEMFILE_ATTR(cft->private)) { 1474 case RES_USAGE: 1475 if (counter == &memcg->memory) 1476 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 1477 if (counter == &memcg->memsw) 1478 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 1479 return (u64)page_counter_read(counter) * PAGE_SIZE; 1480 case RES_LIMIT: 1481 return (u64)counter->max * PAGE_SIZE; 1482 case RES_MAX_USAGE: 1483 return (u64)counter->watermark * PAGE_SIZE; 1484 case RES_FAILCNT: 1485 return counter->failcnt; 1486 case RES_SOFT_LIMIT: 1487 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; 1488 default: 1489 BUG(); 1490 } 1491 } 1492 1493 /* 1494 * This function doesn't do anything useful. Its only job is to provide a read 1495 * handler for a file so that cgroup_file_mode() will add read permissions. 1496 */ 1497 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 1498 __always_unused void *v) 1499 { 1500 return -EINVAL; 1501 } 1502 1503 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 1504 { 1505 int ret; 1506 1507 mutex_lock(&memcg_max_mutex); 1508 1509 ret = page_counter_set_max(&memcg->tcpmem, max); 1510 if (ret) 1511 goto out; 1512 1513 if (!memcg->tcpmem_active) { 1514 /* 1515 * The active flag needs to be written after the static_key 1516 * update. This is what guarantees that the socket activation 1517 * function is the last one to run. See mem_cgroup_sk_alloc() 1518 * for details, and note that we don't mark any socket as 1519 * belonging to this memcg until that flag is up. 1520 * 1521 * We need to do this, because static_keys will span multiple 1522 * sites, but we can't control their order. If we mark a socket 1523 * as accounted, but the accounting functions are not patched in 1524 * yet, we'll lose accounting. 1525 * 1526 * We never race with the readers in mem_cgroup_sk_alloc(), 1527 * because when this value change, the code to process it is not 1528 * patched in yet. 1529 */ 1530 static_branch_inc(&memcg_sockets_enabled_key); 1531 memcg->tcpmem_active = true; 1532 } 1533 out: 1534 mutex_unlock(&memcg_max_mutex); 1535 return ret; 1536 } 1537 1538 /* 1539 * The user of this function is... 1540 * RES_LIMIT. 1541 */ 1542 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 1543 char *buf, size_t nbytes, loff_t off) 1544 { 1545 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1546 unsigned long nr_pages; 1547 int ret; 1548 1549 buf = strstrip(buf); 1550 ret = page_counter_memparse(buf, "-1", &nr_pages); 1551 if (ret) 1552 return ret; 1553 1554 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1555 case RES_LIMIT: 1556 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 1557 ret = -EINVAL; 1558 break; 1559 } 1560 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1561 case _MEM: 1562 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 1563 break; 1564 case _MEMSWAP: 1565 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 1566 break; 1567 case _KMEM: 1568 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 1569 "Writing any value to this file has no effect. " 1570 "Please report your usecase to linux-mm@kvack.org if you " 1571 "depend on this functionality.\n"); 1572 ret = 0; 1573 break; 1574 case _TCP: 1575 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " 1576 "Please report your usecase to linux-mm@kvack.org if you " 1577 "depend on this functionality.\n"); 1578 ret = memcg_update_tcp_max(memcg, nr_pages); 1579 break; 1580 } 1581 break; 1582 case RES_SOFT_LIMIT: 1583 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1584 ret = -EOPNOTSUPP; 1585 } else { 1586 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. " 1587 "Please report your usecase to linux-mm@kvack.org if you " 1588 "depend on this functionality.\n"); 1589 WRITE_ONCE(memcg->soft_limit, nr_pages); 1590 ret = 0; 1591 } 1592 break; 1593 } 1594 return ret ?: nbytes; 1595 } 1596 1597 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 1598 size_t nbytes, loff_t off) 1599 { 1600 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1601 struct page_counter *counter; 1602 1603 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1604 case _MEM: 1605 counter = &memcg->memory; 1606 break; 1607 case _MEMSWAP: 1608 counter = &memcg->memsw; 1609 break; 1610 case _KMEM: 1611 counter = &memcg->kmem; 1612 break; 1613 case _TCP: 1614 counter = &memcg->tcpmem; 1615 break; 1616 default: 1617 BUG(); 1618 } 1619 1620 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1621 case RES_MAX_USAGE: 1622 page_counter_reset_watermark(counter); 1623 break; 1624 case RES_FAILCNT: 1625 counter->failcnt = 0; 1626 break; 1627 default: 1628 BUG(); 1629 } 1630 1631 return nbytes; 1632 } 1633 1634 #ifdef CONFIG_NUMA 1635 1636 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 1637 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 1638 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 1639 1640 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 1641 int nid, unsigned int lru_mask, bool tree) 1642 { 1643 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 1644 unsigned long nr = 0; 1645 enum lru_list lru; 1646 1647 VM_BUG_ON((unsigned)nid >= nr_node_ids); 1648 1649 for_each_lru(lru) { 1650 if (!(BIT(lru) & lru_mask)) 1651 continue; 1652 if (tree) 1653 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 1654 else 1655 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 1656 } 1657 return nr; 1658 } 1659 1660 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 1661 unsigned int lru_mask, 1662 bool tree) 1663 { 1664 unsigned long nr = 0; 1665 enum lru_list lru; 1666 1667 for_each_lru(lru) { 1668 if (!(BIT(lru) & lru_mask)) 1669 continue; 1670 if (tree) 1671 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 1672 else 1673 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 1674 } 1675 return nr; 1676 } 1677 1678 static int memcg_numa_stat_show(struct seq_file *m, void *v) 1679 { 1680 struct numa_stat { 1681 const char *name; 1682 unsigned int lru_mask; 1683 }; 1684 1685 static const struct numa_stat stats[] = { 1686 { "total", LRU_ALL }, 1687 { "file", LRU_ALL_FILE }, 1688 { "anon", LRU_ALL_ANON }, 1689 { "unevictable", BIT(LRU_UNEVICTABLE) }, 1690 }; 1691 const struct numa_stat *stat; 1692 int nid; 1693 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1694 1695 mem_cgroup_flush_stats(memcg); 1696 1697 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 1698 seq_printf(m, "%s=%lu", stat->name, 1699 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1700 false)); 1701 for_each_node_state(nid, N_MEMORY) 1702 seq_printf(m, " N%d=%lu", nid, 1703 mem_cgroup_node_nr_lru_pages(memcg, nid, 1704 stat->lru_mask, false)); 1705 seq_putc(m, '\n'); 1706 } 1707 1708 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 1709 1710 seq_printf(m, "hierarchical_%s=%lu", stat->name, 1711 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1712 true)); 1713 for_each_node_state(nid, N_MEMORY) 1714 seq_printf(m, " N%d=%lu", nid, 1715 mem_cgroup_node_nr_lru_pages(memcg, nid, 1716 stat->lru_mask, true)); 1717 seq_putc(m, '\n'); 1718 } 1719 1720 return 0; 1721 } 1722 #endif /* CONFIG_NUMA */ 1723 1724 static const unsigned int memcg1_stats[] = { 1725 NR_FILE_PAGES, 1726 NR_ANON_MAPPED, 1727 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1728 NR_ANON_THPS, 1729 #endif 1730 NR_SHMEM, 1731 NR_FILE_MAPPED, 1732 NR_FILE_DIRTY, 1733 NR_WRITEBACK, 1734 WORKINGSET_REFAULT_ANON, 1735 WORKINGSET_REFAULT_FILE, 1736 #ifdef CONFIG_SWAP 1737 MEMCG_SWAP, 1738 NR_SWAPCACHE, 1739 #endif 1740 }; 1741 1742 static const char *const memcg1_stat_names[] = { 1743 "cache", 1744 "rss", 1745 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1746 "rss_huge", 1747 #endif 1748 "shmem", 1749 "mapped_file", 1750 "dirty", 1751 "writeback", 1752 "workingset_refault_anon", 1753 "workingset_refault_file", 1754 #ifdef CONFIG_SWAP 1755 "swap", 1756 "swapcached", 1757 #endif 1758 }; 1759 1760 /* Universal VM events cgroup1 shows, original sort order */ 1761 static const unsigned int memcg1_events[] = { 1762 PGPGIN, 1763 PGPGOUT, 1764 PGFAULT, 1765 PGMAJFAULT, 1766 }; 1767 1768 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1769 { 1770 unsigned long memory, memsw; 1771 struct mem_cgroup *mi; 1772 unsigned int i; 1773 1774 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 1775 1776 mem_cgroup_flush_stats(memcg); 1777 1778 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1779 unsigned long nr; 1780 1781 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 1782 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 1783 } 1784 1785 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1786 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 1787 memcg_events_local(memcg, memcg1_events[i])); 1788 1789 for (i = 0; i < NR_LRU_LISTS; i++) 1790 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 1791 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 1792 PAGE_SIZE); 1793 1794 /* Hierarchical information */ 1795 memory = memsw = PAGE_COUNTER_MAX; 1796 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 1797 memory = min(memory, READ_ONCE(mi->memory.max)); 1798 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 1799 } 1800 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 1801 (u64)memory * PAGE_SIZE); 1802 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 1803 (u64)memsw * PAGE_SIZE); 1804 1805 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1806 unsigned long nr; 1807 1808 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 1809 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], 1810 (u64)nr); 1811 } 1812 1813 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1814 seq_buf_printf(s, "total_%s %llu\n", 1815 vm_event_name(memcg1_events[i]), 1816 (u64)memcg_events(memcg, memcg1_events[i])); 1817 1818 for (i = 0; i < NR_LRU_LISTS; i++) 1819 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 1820 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 1821 PAGE_SIZE); 1822 1823 #ifdef CONFIG_DEBUG_VM 1824 { 1825 pg_data_t *pgdat; 1826 struct mem_cgroup_per_node *mz; 1827 unsigned long anon_cost = 0; 1828 unsigned long file_cost = 0; 1829 1830 for_each_online_pgdat(pgdat) { 1831 mz = memcg->nodeinfo[pgdat->node_id]; 1832 1833 anon_cost += mz->lruvec.anon_cost; 1834 file_cost += mz->lruvec.file_cost; 1835 } 1836 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 1837 seq_buf_printf(s, "file_cost %lu\n", file_cost); 1838 } 1839 #endif 1840 } 1841 1842 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 1843 struct cftype *cft) 1844 { 1845 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1846 1847 return mem_cgroup_swappiness(memcg); 1848 } 1849 1850 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 1851 struct cftype *cft, u64 val) 1852 { 1853 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1854 1855 if (val > MAX_SWAPPINESS) 1856 return -EINVAL; 1857 1858 if (!mem_cgroup_is_root(memcg)) 1859 WRITE_ONCE(memcg->swappiness, val); 1860 else 1861 WRITE_ONCE(vm_swappiness, val); 1862 1863 return 0; 1864 } 1865 1866 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 1867 { 1868 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 1869 1870 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 1871 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 1872 seq_printf(sf, "oom_kill %lu\n", 1873 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 1874 return 0; 1875 } 1876 1877 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 1878 struct cftype *cft, u64 val) 1879 { 1880 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1881 1882 pr_warn_once("oom_control is deprecated and will be removed. " 1883 "Please report your usecase to linux-mm-@kvack.org if you " 1884 "depend on this functionality. \n"); 1885 1886 /* cannot set to root cgroup and only 0 and 1 are allowed */ 1887 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 1888 return -EINVAL; 1889 1890 WRITE_ONCE(memcg->oom_kill_disable, val); 1891 if (!val) 1892 memcg1_oom_recover(memcg); 1893 1894 return 0; 1895 } 1896 1897 #ifdef CONFIG_SLUB_DEBUG 1898 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 1899 { 1900 /* 1901 * Deprecated. 1902 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 1903 */ 1904 return 0; 1905 } 1906 #endif 1907 1908 struct cftype mem_cgroup_legacy_files[] = { 1909 { 1910 .name = "usage_in_bytes", 1911 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 1912 .read_u64 = mem_cgroup_read_u64, 1913 }, 1914 { 1915 .name = "max_usage_in_bytes", 1916 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 1917 .write = mem_cgroup_reset, 1918 .read_u64 = mem_cgroup_read_u64, 1919 }, 1920 { 1921 .name = "limit_in_bytes", 1922 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 1923 .write = mem_cgroup_write, 1924 .read_u64 = mem_cgroup_read_u64, 1925 }, 1926 { 1927 .name = "soft_limit_in_bytes", 1928 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 1929 .write = mem_cgroup_write, 1930 .read_u64 = mem_cgroup_read_u64, 1931 }, 1932 { 1933 .name = "failcnt", 1934 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 1935 .write = mem_cgroup_reset, 1936 .read_u64 = mem_cgroup_read_u64, 1937 }, 1938 { 1939 .name = "stat", 1940 .seq_show = memory_stat_show, 1941 }, 1942 { 1943 .name = "force_empty", 1944 .write = mem_cgroup_force_empty_write, 1945 }, 1946 { 1947 .name = "use_hierarchy", 1948 .write_u64 = mem_cgroup_hierarchy_write, 1949 .read_u64 = mem_cgroup_hierarchy_read, 1950 }, 1951 { 1952 .name = "cgroup.event_control", /* XXX: for compat */ 1953 .write = memcg_write_event_control, 1954 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 1955 }, 1956 { 1957 .name = "swappiness", 1958 .read_u64 = mem_cgroup_swappiness_read, 1959 .write_u64 = mem_cgroup_swappiness_write, 1960 }, 1961 { 1962 .name = "move_charge_at_immigrate", 1963 .read_u64 = mem_cgroup_move_charge_read, 1964 .write_u64 = mem_cgroup_move_charge_write, 1965 }, 1966 { 1967 .name = "oom_control", 1968 .seq_show = mem_cgroup_oom_control_read, 1969 .write_u64 = mem_cgroup_oom_control_write, 1970 }, 1971 { 1972 .name = "pressure_level", 1973 .seq_show = mem_cgroup_dummy_seq_show, 1974 }, 1975 #ifdef CONFIG_NUMA 1976 { 1977 .name = "numa_stat", 1978 .seq_show = memcg_numa_stat_show, 1979 }, 1980 #endif 1981 { 1982 .name = "kmem.limit_in_bytes", 1983 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 1984 .write = mem_cgroup_write, 1985 .read_u64 = mem_cgroup_read_u64, 1986 }, 1987 { 1988 .name = "kmem.usage_in_bytes", 1989 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 1990 .read_u64 = mem_cgroup_read_u64, 1991 }, 1992 { 1993 .name = "kmem.failcnt", 1994 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 1995 .write = mem_cgroup_reset, 1996 .read_u64 = mem_cgroup_read_u64, 1997 }, 1998 { 1999 .name = "kmem.max_usage_in_bytes", 2000 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 2001 .write = mem_cgroup_reset, 2002 .read_u64 = mem_cgroup_read_u64, 2003 }, 2004 #ifdef CONFIG_SLUB_DEBUG 2005 { 2006 .name = "kmem.slabinfo", 2007 .seq_show = mem_cgroup_slab_show, 2008 }, 2009 #endif 2010 { 2011 .name = "kmem.tcp.limit_in_bytes", 2012 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 2013 .write = mem_cgroup_write, 2014 .read_u64 = mem_cgroup_read_u64, 2015 }, 2016 { 2017 .name = "kmem.tcp.usage_in_bytes", 2018 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 2019 .read_u64 = mem_cgroup_read_u64, 2020 }, 2021 { 2022 .name = "kmem.tcp.failcnt", 2023 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 2024 .write = mem_cgroup_reset, 2025 .read_u64 = mem_cgroup_read_u64, 2026 }, 2027 { 2028 .name = "kmem.tcp.max_usage_in_bytes", 2029 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 2030 .write = mem_cgroup_reset, 2031 .read_u64 = mem_cgroup_read_u64, 2032 }, 2033 { }, /* terminate */ 2034 }; 2035 2036 struct cftype memsw_files[] = { 2037 { 2038 .name = "memsw.usage_in_bytes", 2039 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2040 .read_u64 = mem_cgroup_read_u64, 2041 }, 2042 { 2043 .name = "memsw.max_usage_in_bytes", 2044 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2045 .write = mem_cgroup_reset, 2046 .read_u64 = mem_cgroup_read_u64, 2047 }, 2048 { 2049 .name = "memsw.limit_in_bytes", 2050 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2051 .write = mem_cgroup_write, 2052 .read_u64 = mem_cgroup_read_u64, 2053 }, 2054 { 2055 .name = "memsw.failcnt", 2056 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2057 .write = mem_cgroup_reset, 2058 .read_u64 = mem_cgroup_read_u64, 2059 }, 2060 { }, /* terminate */ 2061 }; 2062 2063 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2064 { 2065 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 2066 if (nr_pages > 0) 2067 page_counter_charge(&memcg->kmem, nr_pages); 2068 else 2069 page_counter_uncharge(&memcg->kmem, -nr_pages); 2070 } 2071 } 2072 2073 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 2074 gfp_t gfp_mask) 2075 { 2076 struct page_counter *fail; 2077 2078 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 2079 memcg->tcpmem_pressure = 0; 2080 return true; 2081 } 2082 memcg->tcpmem_pressure = 1; 2083 if (gfp_mask & __GFP_NOFAIL) { 2084 page_counter_charge(&memcg->tcpmem, nr_pages); 2085 return true; 2086 } 2087 return false; 2088 } 2089 2090 bool memcg1_alloc_events(struct mem_cgroup *memcg) 2091 { 2092 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, 2093 GFP_KERNEL_ACCOUNT); 2094 return !!memcg->events_percpu; 2095 } 2096 2097 void memcg1_free_events(struct mem_cgroup *memcg) 2098 { 2099 if (memcg->events_percpu) 2100 free_percpu(memcg->events_percpu); 2101 } 2102 2103 static int __init memcg1_init(void) 2104 { 2105 int node; 2106 2107 for_each_node(node) { 2108 struct mem_cgroup_tree_per_node *rtpn; 2109 2110 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 2111 2112 rtpn->rb_root = RB_ROOT; 2113 rtpn->rb_rightmost = NULL; 2114 spin_lock_init(&rtpn->lock); 2115 soft_limit_tree.rb_tree_per_node[node] = rtpn; 2116 } 2117 2118 return 0; 2119 } 2120 subsys_initcall(memcg1_init); 2121