/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif
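
/*
 * Illustrative note (not in the original source): the net effect of the
 * block above is that do_swap_account is a plain 0 when CONFIG_MEMCG_SWAP
 * is off, so every later "if (do_swap_account)" branch compiles away,
 * while with CONFIG_MEMCG_SWAP=y the runtime default follows
 * CONFIG_MEMCG_SWAP_ENABLED via really_do_swap_account.
 */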

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it is incremented by the number of pages. This counter is used to
 * trigger some periodic events, which is more straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};
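
/*
 * Illustrative note (not in the original source): nr_page_events above is
 * what mem_cgroup_event_ratelimit() later compares against targets[], so
 * with the constants above a threshold check fires roughly every 128
 * charged/uncharged pages, and soft-limit and NUMA-info updates fire
 * roughly every 1024.
 */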

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[0];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	union {
		/*
		 * the counter to account for mem+swap usage.
		 */
		struct res_counter memsw;

		/*
		 * rcu_freeing is used only when freeing struct mem_cgroup,
		 * so put it into a union to avoid wasting more memory.
		 * It must be disjoint from the css field.  It could be
		 * in a union with the res field, but res plays a much
		 * larger part in mem_cgroup life than memsw, and might
		 * be of interest, even at time of free, when debugging.
		 * So share rcu_head with the less interesting memsw.
		 */
		struct rcu_head rcu_freeing;
		/*
		 * We also need some space for a worker in deferred freeing.
		 * By the time we call it, rcu_freeing is no longer in use.
		 */
		struct work_struct work_freeing;
	};

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;

	atomic_t	refcnt;

	int		swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long	move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined and in other cases of synchronization.
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 *
	 * WARNING: This has to be the last element of the struct. Don't
	 * add new fields after this point.
	 */
	struct mem_cgroup_lru_info info;
};

static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node);
}

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}
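
/*
 * Illustrative sketch (not in the original source): because immigrate_flags
 * is a bitmap of the move_type values above, enabling both charge types
 * amounts to
 *
 *	mc.immigrate_flags = (1 << MOVE_CHARGE_TYPE_ANON) |
 *			     (1 << MOVE_CHARGE_TYPE_FILE);
 *
 * after which move_anon() and move_file() both return true.
 */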

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);

static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return container_of(s, struct mem_cgroup, css);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
	return &mem_cgroup_from_css(css)->vmpressure;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
561 */ 562 if (sk->sk_cgrp) { 563 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 564 mem_cgroup_get(sk->sk_cgrp->memcg); 565 return; 566 } 567 568 rcu_read_lock(); 569 memcg = mem_cgroup_from_task(current); 570 cg_proto = sk->sk_prot->proto_cgroup(memcg); 571 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { 572 mem_cgroup_get(memcg); 573 sk->sk_cgrp = cg_proto; 574 } 575 rcu_read_unlock(); 576 } 577 } 578 EXPORT_SYMBOL(sock_update_memcg); 579 580 void sock_release_memcg(struct sock *sk) 581 { 582 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 583 struct mem_cgroup *memcg; 584 WARN_ON(!sk->sk_cgrp->memcg); 585 memcg = sk->sk_cgrp->memcg; 586 mem_cgroup_put(memcg); 587 } 588 } 589 590 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 591 { 592 if (!memcg || mem_cgroup_is_root(memcg)) 593 return NULL; 594 595 return &memcg->tcp_mem.cg_proto; 596 } 597 EXPORT_SYMBOL(tcp_proto_cgroup); 598 599 static void disarm_sock_keys(struct mem_cgroup *memcg) 600 { 601 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 602 return; 603 static_key_slow_dec(&memcg_socket_limit_enabled); 604 } 605 #else 606 static void disarm_sock_keys(struct mem_cgroup *memcg) 607 { 608 } 609 #endif 610 611 #ifdef CONFIG_MEMCG_KMEM 612 /* 613 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 614 * There are two main reasons for not using the css_id for this: 615 * 1) this works better in sparse environments, where we have a lot of memcgs, 616 * but only a few kmem-limited. Or also, if we have, for instance, 200 617 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 618 * 200 entry array for that. 619 * 620 * 2) In order not to violate the cgroup API, we would like to do all memory 621 * allocation in ->create(). At that point, we haven't yet allocated the 622 * css_id. Having a separate index prevents us from messing with the cgroup 623 * core for this 624 * 625 * The current size of the caches array is stored in 626 * memcg_limited_groups_array_size. It will double each time we have to 627 * increase it. 628 */ 629 static DEFINE_IDA(kmem_limited_groups); 630 int memcg_limited_groups_array_size; 631 632 /* 633 * MIN_SIZE is different than 1, because we would like to avoid going through 634 * the alloc/free process all the time. In a small machine, 4 kmem-limited 635 * cgroups is a reasonable guess. In the future, it could be a parameter or 636 * tunable, but that is strictly not necessary. 637 * 638 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get 639 * this constant directly from cgroup, but it is understandable that this is 640 * better kept as an internal representation in cgroup.c. In any case, the 641 * css_id space is not getting any smaller, and we don't have to necessarily 642 * increase ours as well if it increases. 643 */ 644 #define MEMCG_CACHES_MIN_SIZE 4 645 #define MEMCG_CACHES_MAX_SIZE 65535 646 647 /* 648 * A lot of the calls to the cache allocation functions are expected to be 649 * inlined by the compiler. 

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
	if (memcg_kmem_is_active(memcg)) {
		static_key_slow_dec(&memcg_kmem_enabled_key);
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

static void drain_all_stock_async(struct mem_cgroup *memcg);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
				tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}
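
/*
 * Illustrative note (not in the original source): because equal
 * usage_in_excess values are pushed to the right child above, the
 * rightmost node of this tree is always the zone that currently exceeds
 * its soft limit by the largest amount, which is what
 * __mem_cgroup_largest_soft_limit_node() below relies on via rb_last().
 */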

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of value. Then, we may have a chance to implement
 * a periodic synchronization of the counters in memcg's counters.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because they account memory. Even if we provided quick-and-fuzzy reads, we
 * would always have to visit all online cpus and make the sum. So, for now,
 * unnecessary synchronization is not implemented. (It is just implemented
 * for cpu hotplug.)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return total;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
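
/*
 * Worked example (not in the original source): charging a transparent huge
 * page (HPAGE_PMD_NR pages, 512 with 2MB huge pages on x86-64) goes through
 * mem_cgroup_charge_statistics() above with anon == true and nr_pages == 512,
 * so RSS and RSS_HUGE each grow by 512 while PGPGIN is bumped only once;
 * the matching uncharge passes -512 and bumps PGPGOUT instead.
 */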

/*
 * Check events in order.
 *
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	preempt_disable();
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	} else
		preempt_enable();
}

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return mem_cgroup_from_css(
		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be moved to another
	 * cgroup under us. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			break;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/*
 * Returns the next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
 *
 * helper function to be used by mem_cgroup_iter
 */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
		struct mem_cgroup *last_visited)
{
	struct cgroup *prev_cgroup, *next_cgroup;

	/*
	 * Root is not visited by cgroup iterators so it needs an
	 * explicit visit.
	 */
	if (!last_visited)
		return root;

	prev_cgroup = (last_visited == root) ? NULL
		: last_visited->css.cgroup;
skip_node:
	next_cgroup = cgroup_next_descendant_pre(
			prev_cgroup, root->css.cgroup);

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
	 */
	if (next_cgroup) {
		struct mem_cgroup *mem = mem_cgroup_from_cont(
				next_cgroup);
		if (css_tryget(&mem->css))
			return mem;
		else {
			prev_cgroup = next_cgroup;
			goto skip_node;
		}
	}

	return NULL;
}

static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * When a group in the hierarchy below root is destroyed, the
	 * hierarchy iterator can no longer be trusted since it might
	 * have pointed to the destroyed group.  Invalidate it.
	 */
	atomic_inc(&root->dead_count);
}

static struct mem_cgroup *
mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
		     struct mem_cgroup *root,
		     int *sequence)
{
	struct mem_cgroup *position = NULL;
	/*
	 * A cgroup destruction happens in two stages: offlining and
	 * release.  They are separated by a RCU grace period.
	 *
	 * If the iterator is valid, we may still race with an
	 * offlining.  The RCU lock ensures the object won't be
	 * released, tryget will fail if we lost the race.
	 */
	*sequence = atomic_read(&root->dead_count);
	if (iter->last_dead_count == *sequence) {
		smp_rmb();
		position = iter->last_visited;
		if (position && !css_tryget(&position->css))
			position = NULL;
	}
	return position;
}

static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
				   struct mem_cgroup *last_visited,
				   struct mem_cgroup *new_position,
				   int sequence)
{
	if (last_visited)
		css_put(&last_visited->css);
	/*
	 * We store the sequence count from the time @last_visited was
	 * loaded successfully instead of rereading it here so that we
	 * don't lose destruction events in between.  We could have
	 * raced with the destruction of @new_position after all.
	 */
	iter->last_visited = new_position;
	smp_wmb();
	iter->last_dead_count = sequence;
}
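
/*
 * Illustrative note (not in the original source): mem_cgroup_iter_load() and
 * mem_cgroup_iter_update() pair up around dead_count: the smp_wmb() above
 * orders the last_visited store before the last_dead_count store, and the
 * smp_rmb() on the load side only trusts last_visited once the cached
 * sequence still matches the root's dead_count, so a pointer to a destroyed
 * child is never used without css_tryget() succeeding first.
 */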

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		last_visited = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out_css_put;
		return root;
	}

	rcu_read_lock();
	while (!memcg) {
		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
		int uninitialized_var(seq);

		if (reclaim) {
			int nid = zone_to_nid(reclaim->zone);
			int zid = zone_idx(reclaim->zone);
			struct mem_cgroup_per_zone *mz;

			mz = mem_cgroup_zoneinfo(root, nid, zid);
			iter = &mz->reclaim_iter[reclaim->priority];
			if (prev && reclaim->generation != iter->generation) {
				iter->last_visited = NULL;
				goto out_unlock;
			}

			last_visited = mem_cgroup_iter_load(iter, root, &seq);
		}

		memcg = __mem_cgroup_iter_next(root, last_visited);

		if (reclaim) {
			mem_cgroup_iter_update(iter, last_visited, memcg, seq);

			if (!memcg)
				iter->generation++;
			else if (!prev && memcg)
				reclaim->generation = iter->generation;
		}

		if (prev && !memcg)
			goto out_unlock;
	}
out_unlock:
	rcu_read_unlock();
out_css_put:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
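
/*
 * Illustrative usage sketch (not in the original source): a hierarchy walk
 * that bails out early must hand its current position back for reference
 * counting, e.g.
 *
 *	for_each_mem_cgroup_tree(iter, memcg) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(memcg, iter);
 *			break;
 *		}
 *	}
 *
 * where should_stop() stands in for the caller's own condition; this is the
 * pattern mem_cgroup_out_of_memory() uses further down.
 */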

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg. This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/*
 * The following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routines of the global LRU independently from
 * memcg. What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache: it is added to the LRU before being charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU. When moving account, the page is not on the LRU. It's isolated.
 */

/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	pc = lookup_page_cgroup(page);
	memcg = pc->mem_cgroup;

	/*
	 * Surreptitiously switch any uncharged offlist page to root:
	 * an uncharged page off lru does nothing to secure
	 * its former mem_cgroup from sudden removal.
	 *
	 * Our caller holds lru_lock, and PageCgroupUsed is updated
	 * under page_cgroup lock: between them, they make all uses
	 * of pc->mem_cgroup safe.
	 */
	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
		pc->mem_cgroup = memcg = root_mem_cgroup;

	mz = page_cgroup_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

/*
 * Checks whether the given memcg is the same as, or a descendant of,
 * root_memcg in the hierarchy subtree.
 */
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				  struct mem_cgroup *memcg)
{
	if (root_memcg == memcg)
		return true;
	if (!root_memcg->use_hierarchy || !memcg)
		return false;
	return css_is_ancestor(&memcg->css, &root_memcg->css);
}

static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				       struct mem_cgroup *memcg)
{
	bool ret;

	rcu_read_lock();
	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();
	return ret;
}

bool task_in_mem_cgroup(struct task_struct *task,
			const struct mem_cgroup *memcg)
{
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		curr = try_get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
		rcu_read_unlock();
	}
	if (!curr)
		return false;
	/*
	 * We should check use_hierarchy of "memcg", not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * whenever hierarchy is enabled in "curr" and "curr" is a child of
	 * "memcg" in the *cgroup* hierarchy (even if use_hierarchy is
	 * disabled in "memcg").
	 */
	ret = mem_cgroup_same_or_subtree(memcg, curr);
	css_put(&curr->css);
	return ret;
}

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long long margin;

	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
	return margin >> PAGE_SHIFT;
}
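
/*
 * Worked example (not in the original source) for
 * mem_cgroup_inactive_anon_is_low() above: with 4GB of anonymous pages on
 * the LRUs, gb = 4 and inactive_ratio = int_sqrt(40) = 6, so inactive anon
 * is reported as low once active anon outnumbers inactive by more than
 * six to one.
 */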

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	return memcg->swappiness;
}

/*
 * memcg->moving_account is used for checking the possibility that some thread
 * is calling move_account(). When a thread on CPU-A starts moving pages under
 * a memcg, other threads should check memcg->moving_account under
 * rcu_read_lock(), like this:
 *
 *  CPU-A                          CPU-B
 *                                 rcu_read_lock()
 *  memcg->moving_account+1        if (memcg->moving_account)
 *                                         take heavy locks.
 *  synchronize_rcu()              update something.
 *                                 rcu_read_unlock()
 *  start move here.
 */

/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;

static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	/*
	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
	 * We check NULL in callee rather than caller.
	 */
	if (memcg) {
		atomic_dec(&memcg_moving);
		atomic_dec(&memcg->moving_account);
	}
}

/*
 * 2 routines for checking "mem" is under move_account() or not.
 *
 * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
 *			  is used for avoiding races in accounting. If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
 *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
 */

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return atomic_read(&memcg->moving_account) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_same_or_subtree(memcg, from)
		|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

/*
 * Take this lock when
 * - a code tries to modify page's memcg while it's USED.
 * - a code tries to modify page state accounting in a memcg.
 * see mem_cgroup_stolen(), too.
 */
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
				  unsigned long *flags)
{
	spin_lock_irqsave(&memcg->move_lock, *flags);
}

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
				unsigned long *flags)
{
	spin_unlock_irqrestore(&memcg->move_lock, *flags);
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;
	struct mem_cgroup *iter;
	unsigned int i;

	if (!p)
		return;

	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information.
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	pr_info("Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level.
	 */
	pr_cont(" as a result of limit of %s\n", memcg_name);
done:

	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));

	for_each_mem_cgroup_tree(iter, memcg) {
		pr_info("Memory cgroup stats");

		rcu_read_lock();
		ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
		if (!ret)
			pr_cont(" for %s", memcg_name);
		rcu_read_unlock();
		pr_cont(":");

		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
				K(mem_cgroup_read_stat(iter, i)));
		}

		for (i = 0; i < NR_LRU_LISTS; i++)
			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

		pr_cont("\n");
	}
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

	/*
	 * Do not consider swap space if we cannot swap due to swappiness
	 */
	if (mem_cgroup_swappiness(memcg)) {
		u64 memsw;

		limit += total_swap_pages << PAGE_SHIFT;
		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

		/*
		 * If memsw is finite and limits the amount of swap space
		 * available to this memcg, return that limit.
		 */
		limit = min(limit, memsw);
	}

	return limit;
}

static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct mem_cgroup *iter;
	unsigned long chosen_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct cgroup *cgroup = iter->css.cgroup;
		struct cgroup_iter it;
		struct task_struct *task;

		cgroup_iter_start(cgroup, &it);
		while ((task = cgroup_iter_next(cgroup, &it))) {
			switch (oom_scan_process_thread(task, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = ULONG_MAX;
				get_task_struct(chosen);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				cgroup_iter_end(cgroup, &it);
				mem_cgroup_iter_break(memcg, iter);
				if (chosen)
					put_task_struct(chosen);
				return;
			case OOM_SCAN_OK:
				break;
			};
			points = oom_badness(task, memcg, NULL, totalpages);
			if (points > chosen_points) {
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = points;
				get_task_struct(chosen);
			}
		}
		cgroup_iter_end(cgroup, &it);
	}

	if (!chosen)
		return;
	points = chosen_points * 1000 / totalpages;
	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
					gfp_t gfp_mask,
					unsigned long flags)
{
	unsigned long total = 0;
	bool noswap = false;
	int loop;

	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
		noswap = true;
	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
		noswap = true;

	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
		if (loop)
			drain_all_stock_async(memcg);
		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
		/*
		 * Allow limit shrinkers, which are triggered directly
		 * by userspace, to catch signals and stop reclaim
catch signals and stop reclaim 1901 * after minimal progress, regardless of the margin. 1902 */ 1903 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1904 break; 1905 if (mem_cgroup_margin(memcg)) 1906 break; 1907 /* 1908 * If nothing was reclaimed after two attempts, there 1909 * may be no reclaimable pages in this hierarchy. 1910 */ 1911 if (loop && !total) 1912 break; 1913 } 1914 return total; 1915 } 1916 1917 /** 1918 * test_mem_cgroup_node_reclaimable 1919 * @memcg: the target memcg 1920 * @nid: the node ID to be checked. 1921 * @noswap: specify true here if the user wants file-only information. 1922 * 1923 * This function returns whether the specified memcg contains any 1924 * reclaimable pages on a node. Returns true if there are any reclaimable 1925 * pages in the node. 1926 */ 1927 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1928 int nid, bool noswap) 1929 { 1930 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1931 return true; 1932 if (noswap || !total_swap_pages) 1933 return false; 1934 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1935 return true; 1936 return false; 1937 1938 } 1939 #if MAX_NUMNODES > 1 1940 1941 /* 1942 * Always updating the nodemask is not very good - even if we have an empty 1943 * list or the wrong list here, we can start from some node and traverse all 1944 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1945 * 1946 */ 1947 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1948 { 1949 int nid; 1950 /* 1951 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1952 * pagein/pageout changes since the last update. 1953 */ 1954 if (!atomic_read(&memcg->numainfo_events)) 1955 return; 1956 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1957 return; 1958 1959 /* make a nodemask where this memcg uses memory from */ 1960 memcg->scan_nodes = node_states[N_MEMORY]; 1961 1962 for_each_node_mask(nid, node_states[N_MEMORY]) { 1963 1964 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1965 node_clear(nid, memcg->scan_nodes); 1966 } 1967 1968 atomic_set(&memcg->numainfo_events, 0); 1969 atomic_set(&memcg->numainfo_updating, 0); 1970 } 1971 1972 /* 1973 * Selecting a node where we start reclaim from. Because what we need is just 1974 * reducing the usage counter, starting from anywhere is OK. Considering 1975 * memory reclaim from the current node, there are pros and cons. 1976 * 1977 * Freeing memory from the current node means freeing memory from a node which 1978 * we'll use or we've used. So, it may make the LRU bad. And if several threads 1979 * hit limits, they will see contention on a node. But freeing from a remote 1980 * node means more costs for memory reclaim because of memory latency. 1981 * 1982 * Now, we use round-robin. A better algorithm is welcome. 1983 */ 1984 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1985 { 1986 int node; 1987 1988 mem_cgroup_may_update_nodemask(memcg); 1989 node = memcg->last_scanned_node; 1990 1991 node = next_node(node, memcg->scan_nodes); 1992 if (node == MAX_NUMNODES) 1993 node = first_node(memcg->scan_nodes); 1994 /* 1995 * We call this when we hit the limit, not when pages are added to the LRU. 1996 * No LRU may hold pages because all pages are UNEVICTABLE or the 1997 * memcg is too small and all pages are not on the LRU. In that case, 1998 * we use the current node.
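 *
 * Round-robin example (illustrative only): with scan_nodes = {0,2} and
 * last_scanned_node = 0, next_node() returns node 2 on this call and the
 * scan wraps back to node 0 on the call after that.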
1999 */ 2000 if (unlikely(node == MAX_NUMNODES)) 2001 node = numa_node_id(); 2002 2003 memcg->last_scanned_node = node; 2004 return node; 2005 } 2006 2007 /* 2008 * Check all nodes whether they contain reclaimable pages or not. 2009 * For a quick scan, we make use of scan_nodes. This allows us to skip 2010 * unused nodes. But scan_nodes is lazily updated and may not contain 2011 * enough new information. We need to double check. 2012 */ 2013 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2014 { 2015 int nid; 2016 2017 /* 2018 * quick check... making use of scan_nodes. 2019 * We can skip unused nodes. 2020 */ 2021 if (!nodes_empty(memcg->scan_nodes)) { 2022 for (nid = first_node(memcg->scan_nodes); 2023 nid < MAX_NUMNODES; 2024 nid = next_node(nid, memcg->scan_nodes)) { 2025 2026 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2027 return true; 2028 } 2029 } 2030 /* 2031 * Check the rest of the nodes. 2032 */ 2033 for_each_node_state(nid, N_MEMORY) { 2034 if (node_isset(nid, memcg->scan_nodes)) 2035 continue; 2036 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2037 return true; 2038 } 2039 return false; 2040 } 2041 2042 #else 2043 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2044 { 2045 return 0; 2046 } 2047 2048 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2049 { 2050 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2051 } 2052 #endif 2053 2054 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2055 struct zone *zone, 2056 gfp_t gfp_mask, 2057 unsigned long *total_scanned) 2058 { 2059 struct mem_cgroup *victim = NULL; 2060 int total = 0; 2061 int loop = 0; 2062 unsigned long excess; 2063 unsigned long nr_scanned; 2064 struct mem_cgroup_reclaim_cookie reclaim = { 2065 .zone = zone, 2066 .priority = 0, 2067 }; 2068 2069 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2070 2071 while (1) { 2072 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2073 if (!victim) { 2074 loop++; 2075 if (loop >= 2) { 2076 /* 2077 * If we have not been able to reclaim 2078 * anything, it might be because there are 2079 * no reclaimable pages under this hierarchy. 2080 */ 2081 if (!total) 2082 break; 2083 /* 2084 * We want to do more targeted reclaim. 2085 * excess >> 2 is not too aggressive, so we don't 2086 * reclaim too much, nor too small, which would keep 2087 * us coming back to reclaim from this cgroup. 2088 */ 2089 if (total >= (excess >> 2) || 2090 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2091 break; 2092 } 2093 continue; 2094 } 2095 if (!mem_cgroup_reclaimable(victim, false)) 2096 continue; 2097 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2098 zone, &nr_scanned); 2099 *total_scanned += nr_scanned; 2100 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2101 break; 2102 } 2103 mem_cgroup_iter_break(root_memcg, victim); 2104 return total; 2105 } 2106 2107 /* 2108 * Check whether the OOM killer is already running under our hierarchy. 2109 * If someone is running, return false. 2110 * Has to be called with memcg_oom_lock held. 2111 */ 2112 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 2113 { 2114 struct mem_cgroup *iter, *failed = NULL; 2115 2116 for_each_mem_cgroup_tree(iter, memcg) { 2117 if (iter->oom_lock) { 2118 /* 2119 * this subtree of our hierarchy is already locked, 2120 * so we cannot grant the lock.
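 *
 * Example (illustrative only): for a hierarchy A -> B -> C, if B already
 * has oom_lock set, locking A's subtree stops at B and the oom_lock
 * flags taken on the groups visited before B are cleared again by the
 * cleanup loop that follows.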
2121 */ 2122 failed = iter; 2123 mem_cgroup_iter_break(memcg, iter); 2124 break; 2125 } else 2126 iter->oom_lock = true; 2127 } 2128 2129 if (!failed) 2130 return true; 2131 2132 /* 2133 * OK, we failed to lock the whole subtree so we have to clean up 2134 * what we set up to the failing subtree 2135 */ 2136 for_each_mem_cgroup_tree(iter, memcg) { 2137 if (iter == failed) { 2138 mem_cgroup_iter_break(memcg, iter); 2139 break; 2140 } 2141 iter->oom_lock = false; 2142 } 2143 return false; 2144 } 2145 2146 /* 2147 * Has to be called with memcg_oom_lock 2148 */ 2149 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2150 { 2151 struct mem_cgroup *iter; 2152 2153 for_each_mem_cgroup_tree(iter, memcg) 2154 iter->oom_lock = false; 2155 return 0; 2156 } 2157 2158 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2159 { 2160 struct mem_cgroup *iter; 2161 2162 for_each_mem_cgroup_tree(iter, memcg) 2163 atomic_inc(&iter->under_oom); 2164 } 2165 2166 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2167 { 2168 struct mem_cgroup *iter; 2169 2170 /* 2171 * When a new child is created while the hierarchy is under oom, 2172 * mem_cgroup_oom_lock() may not be called. We have to use 2173 * atomic_add_unless() here. 2174 */ 2175 for_each_mem_cgroup_tree(iter, memcg) 2176 atomic_add_unless(&iter->under_oom, -1, 0); 2177 } 2178 2179 static DEFINE_SPINLOCK(memcg_oom_lock); 2180 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2181 2182 struct oom_wait_info { 2183 struct mem_cgroup *memcg; 2184 wait_queue_t wait; 2185 }; 2186 2187 static int memcg_oom_wake_function(wait_queue_t *wait, 2188 unsigned mode, int sync, void *arg) 2189 { 2190 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2191 struct mem_cgroup *oom_wait_memcg; 2192 struct oom_wait_info *oom_wait_info; 2193 2194 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2195 oom_wait_memcg = oom_wait_info->memcg; 2196 2197 /* 2198 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2199 * Then we can use css_is_ancestor without taking care of RCU. 2200 */ 2201 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2202 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2203 return 0; 2204 return autoremove_wake_function(wait, mode, sync, arg); 2205 } 2206 2207 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2208 { 2209 /* for filtering, pass "memcg" as argument. */ 2210 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2211 } 2212 2213 static void memcg_oom_recover(struct mem_cgroup *memcg) 2214 { 2215 if (memcg && atomic_read(&memcg->under_oom)) 2216 memcg_wakeup_oom(memcg); 2217 } 2218 2219 /* 2220 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 2221 */ 2222 static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, 2223 int order) 2224 { 2225 struct oom_wait_info owait; 2226 bool locked, need_to_kill; 2227 2228 owait.memcg = memcg; 2229 owait.wait.flags = 0; 2230 owait.wait.func = memcg_oom_wake_function; 2231 owait.wait.private = current; 2232 INIT_LIST_HEAD(&owait.wait.task_list); 2233 need_to_kill = true; 2234 mem_cgroup_mark_under_oom(memcg); 2235 2236 /* At first, try to OOM lock hierarchy under memcg.*/ 2237 spin_lock(&memcg_oom_lock); 2238 locked = mem_cgroup_oom_lock(memcg); 2239 /* 2240 * Even if signal_pending(), we can't quit charge() loop without 2241 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 2242 * under OOM is always welcomed, use TASK_KILLABLE here. 
2243 */ 2244 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2245 if (!locked || memcg->oom_kill_disable) 2246 need_to_kill = false; 2247 if (locked) 2248 mem_cgroup_oom_notify(memcg); 2249 spin_unlock(&memcg_oom_lock); 2250 2251 if (need_to_kill) { 2252 finish_wait(&memcg_oom_waitq, &owait.wait); 2253 mem_cgroup_out_of_memory(memcg, mask, order); 2254 } else { 2255 schedule(); 2256 finish_wait(&memcg_oom_waitq, &owait.wait); 2257 } 2258 spin_lock(&memcg_oom_lock); 2259 if (locked) 2260 mem_cgroup_oom_unlock(memcg); 2261 memcg_wakeup_oom(memcg); 2262 spin_unlock(&memcg_oom_lock); 2263 2264 mem_cgroup_unmark_under_oom(memcg); 2265 2266 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2267 return false; 2268 /* Give the dying process a chance to run */ 2269 schedule_timeout_uninterruptible(1); 2270 return true; 2271 } 2272 2273 /* 2274 * Currently used to update mapped file statistics, but the routine can be 2275 * generalized to update other statistics as well. 2276 * 2277 * Notes: Race condition 2278 * 2279 * We usually use page_cgroup_lock() for accessing page_cgroup members, but 2280 * it tends to be costly. Under some conditions, however, we don't need 2281 * to do so _always_. 2282 * 2283 * Considering "charge", lock_page_cgroup() is not required because all 2284 * file-stat operations happen after a page is attached to the radix-tree. There 2285 * is no race with "charge". 2286 * 2287 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2288 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even 2289 * if we race with "uncharge". The statistics themselves are properly handled 2290 * by the flags. 2291 * 2292 * Considering "move", this is the only case where we see a race. To keep the 2293 * race window small, we check moving_account (see mem_cgroup_stolen()) to 2294 * detect whether a race is possible. If it is, we take the lock. 2295 */ 2296 2297 void __mem_cgroup_begin_update_page_stat(struct page *page, 2298 bool *locked, unsigned long *flags) 2299 { 2300 struct mem_cgroup *memcg; 2301 struct page_cgroup *pc; 2302 2303 pc = lookup_page_cgroup(page); 2304 again: 2305 memcg = pc->mem_cgroup; 2306 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2307 return; 2308 /* 2309 * If this memory cgroup is not under account moving, we don't 2310 * need to take move_lock_mem_cgroup(). Because we already hold 2311 * rcu_read_lock(), any calls to move_account will be delayed until 2312 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2313 */ 2314 if (!mem_cgroup_stolen(memcg)) 2315 return; 2316 2317 move_lock_mem_cgroup(memcg, flags); 2318 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2319 move_unlock_mem_cgroup(memcg, flags); 2320 goto again; 2321 } 2322 *locked = true; 2323 } 2324 2325 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2326 { 2327 struct page_cgroup *pc = lookup_page_cgroup(page); 2328 2329 /* 2330 * It's guaranteed that pc->mem_cgroup never changes while the 2331 * lock is held, because any routine that modifies pc->mem_cgroup 2332 * must take move_lock_mem_cgroup().
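 *
 * Usage sketch (illustrative only, assuming the wrappers declared in
 * include/linux/memcontrol.h):
 *
 *	bool locked;
 *	unsigned long flags;
 *
 *	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
 *	mem_cgroup_end_update_page_stat(page, &locked, &flags);
 *
 * which is roughly the pattern the rmap code follows when a file page is
 * mapped or unmapped.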
2333 */ 2334 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2335 } 2336 2337 void mem_cgroup_update_page_stat(struct page *page, 2338 enum mem_cgroup_page_stat_item idx, int val) 2339 { 2340 struct mem_cgroup *memcg; 2341 struct page_cgroup *pc = lookup_page_cgroup(page); 2342 unsigned long uninitialized_var(flags); 2343 2344 if (mem_cgroup_disabled()) 2345 return; 2346 2347 memcg = pc->mem_cgroup; 2348 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2349 return; 2350 2351 switch (idx) { 2352 case MEMCG_NR_FILE_MAPPED: 2353 idx = MEM_CGROUP_STAT_FILE_MAPPED; 2354 break; 2355 default: 2356 BUG(); 2357 } 2358 2359 this_cpu_add(memcg->stat->count[idx], val); 2360 } 2361 2362 /* 2363 * size of first charge trial. "32" comes from vmscan.c's magic value. 2364 * TODO: maybe necessary to use big numbers in big irons. 2365 */ 2366 #define CHARGE_BATCH 32U 2367 struct memcg_stock_pcp { 2368 struct mem_cgroup *cached; /* this never be root cgroup */ 2369 unsigned int nr_pages; 2370 struct work_struct work; 2371 unsigned long flags; 2372 #define FLUSHING_CACHED_CHARGE 0 2373 }; 2374 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2375 static DEFINE_MUTEX(percpu_charge_mutex); 2376 2377 /** 2378 * consume_stock: Try to consume stocked charge on this cpu. 2379 * @memcg: memcg to consume from. 2380 * @nr_pages: how many pages to charge. 2381 * 2382 * The charges will only happen if @memcg matches the current cpu's memcg 2383 * stock, and at least @nr_pages are available in that stock. Failure to 2384 * service an allocation will refill the stock. 2385 * 2386 * returns true if successful, false otherwise. 2387 */ 2388 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2389 { 2390 struct memcg_stock_pcp *stock; 2391 bool ret = true; 2392 2393 if (nr_pages > CHARGE_BATCH) 2394 return false; 2395 2396 stock = &get_cpu_var(memcg_stock); 2397 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2398 stock->nr_pages -= nr_pages; 2399 else /* need to call res_counter_charge */ 2400 ret = false; 2401 put_cpu_var(memcg_stock); 2402 return ret; 2403 } 2404 2405 /* 2406 * Returns stocks cached in percpu to res_counter and reset cached information. 2407 */ 2408 static void drain_stock(struct memcg_stock_pcp *stock) 2409 { 2410 struct mem_cgroup *old = stock->cached; 2411 2412 if (stock->nr_pages) { 2413 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2414 2415 res_counter_uncharge(&old->res, bytes); 2416 if (do_swap_account) 2417 res_counter_uncharge(&old->memsw, bytes); 2418 stock->nr_pages = 0; 2419 } 2420 stock->cached = NULL; 2421 } 2422 2423 /* 2424 * This must be called under preempt disabled or must be called by 2425 * a thread which is pinned to local cpu. 2426 */ 2427 static void drain_local_stock(struct work_struct *dummy) 2428 { 2429 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2430 drain_stock(stock); 2431 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2432 } 2433 2434 static void __init memcg_stock_init(void) 2435 { 2436 int cpu; 2437 2438 for_each_possible_cpu(cpu) { 2439 struct memcg_stock_pcp *stock = 2440 &per_cpu(memcg_stock, cpu); 2441 INIT_WORK(&stock->work, drain_local_stock); 2442 } 2443 } 2444 2445 /* 2446 * Cache charges(val) which is from res_counter, to local per_cpu area. 2447 * This will be consumed by consume_stock() function, later. 
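 *
 * Batching example (illustrative only): __mem_cgroup_try_charge() charges
 * batch = max(CHARGE_BATCH, nr_pages) pages up front, so a single-page
 * charge with CHARGE_BATCH == 32 pulls 32 pages worth of charge from the
 * res_counter and parks the unused 31 pages here; the next 31 single-page
 * charges on this cpu can then be served by consume_stock() without
 * touching the res_counter at all.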
2448 */ 2449 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2450 { 2451 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2452 2453 if (stock->cached != memcg) { /* reset if necessary */ 2454 drain_stock(stock); 2455 stock->cached = memcg; 2456 } 2457 stock->nr_pages += nr_pages; 2458 put_cpu_var(memcg_stock); 2459 } 2460 2461 /* 2462 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2463 * of the hierarchy under it. sync flag says whether we should block 2464 * until the work is done. 2465 */ 2466 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2467 { 2468 int cpu, curcpu; 2469 2470 /* Notify other cpus that system-wide "drain" is running */ 2471 get_online_cpus(); 2472 curcpu = get_cpu(); 2473 for_each_online_cpu(cpu) { 2474 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2475 struct mem_cgroup *memcg; 2476 2477 memcg = stock->cached; 2478 if (!memcg || !stock->nr_pages) 2479 continue; 2480 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2481 continue; 2482 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2483 if (cpu == curcpu) 2484 drain_local_stock(&stock->work); 2485 else 2486 schedule_work_on(cpu, &stock->work); 2487 } 2488 } 2489 put_cpu(); 2490 2491 if (!sync) 2492 goto out; 2493 2494 for_each_online_cpu(cpu) { 2495 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2496 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2497 flush_work(&stock->work); 2498 } 2499 out: 2500 put_online_cpus(); 2501 } 2502 2503 /* 2504 * Tries to drain stocked charges in other cpus. This function is asynchronous 2505 * and just put a work per cpu for draining localy on each cpu. Caller can 2506 * expects some charges will be back to res_counter later but cannot wait for 2507 * it. 2508 */ 2509 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2510 { 2511 /* 2512 * If someone calls draining, avoid adding more kworker runs. 2513 */ 2514 if (!mutex_trylock(&percpu_charge_mutex)) 2515 return; 2516 drain_all_stock(root_memcg, false); 2517 mutex_unlock(&percpu_charge_mutex); 2518 } 2519 2520 /* This is a synchronous drain interface. */ 2521 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2522 { 2523 /* called when force_empty is called */ 2524 mutex_lock(&percpu_charge_mutex); 2525 drain_all_stock(root_memcg, true); 2526 mutex_unlock(&percpu_charge_mutex); 2527 } 2528 2529 /* 2530 * This function drains percpu counter value from DEAD cpu and 2531 * move it to local cpu. Note that this function can be preempted. 
2532 */ 2533 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2534 { 2535 int i; 2536 2537 spin_lock(&memcg->pcp_counter_lock); 2538 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2539 long x = per_cpu(memcg->stat->count[i], cpu); 2540 2541 per_cpu(memcg->stat->count[i], cpu) = 0; 2542 memcg->nocpu_base.count[i] += x; 2543 } 2544 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2545 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2546 2547 per_cpu(memcg->stat->events[i], cpu) = 0; 2548 memcg->nocpu_base.events[i] += x; 2549 } 2550 spin_unlock(&memcg->pcp_counter_lock); 2551 } 2552 2553 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2554 unsigned long action, 2555 void *hcpu) 2556 { 2557 int cpu = (unsigned long)hcpu; 2558 struct memcg_stock_pcp *stock; 2559 struct mem_cgroup *iter; 2560 2561 if (action == CPU_ONLINE) 2562 return NOTIFY_OK; 2563 2564 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2565 return NOTIFY_OK; 2566 2567 for_each_mem_cgroup(iter) 2568 mem_cgroup_drain_pcp_counter(iter, cpu); 2569 2570 stock = &per_cpu(memcg_stock, cpu); 2571 drain_stock(stock); 2572 return NOTIFY_OK; 2573 } 2574 2575 2576 /* See __mem_cgroup_try_charge() for details */ 2577 enum { 2578 CHARGE_OK, /* success */ 2579 CHARGE_RETRY, /* need to retry but retry is not bad */ 2580 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2581 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2582 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2583 }; 2584 2585 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2586 unsigned int nr_pages, unsigned int min_pages, 2587 bool oom_check) 2588 { 2589 unsigned long csize = nr_pages * PAGE_SIZE; 2590 struct mem_cgroup *mem_over_limit; 2591 struct res_counter *fail_res; 2592 unsigned long flags = 0; 2593 int ret; 2594 2595 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2596 2597 if (likely(!ret)) { 2598 if (!do_swap_account) 2599 return CHARGE_OK; 2600 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2601 if (likely(!ret)) 2602 return CHARGE_OK; 2603 2604 res_counter_uncharge(&memcg->res, csize); 2605 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2606 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2607 } else 2608 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2609 /* 2610 * Never reclaim on behalf of optional batching, retry with a 2611 * single page instead. 2612 */ 2613 if (nr_pages > min_pages) 2614 return CHARGE_RETRY; 2615 2616 if (!(gfp_mask & __GFP_WAIT)) 2617 return CHARGE_WOULDBLOCK; 2618 2619 if (gfp_mask & __GFP_NORETRY) 2620 return CHARGE_NOMEM; 2621 2622 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2623 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2624 return CHARGE_RETRY; 2625 /* 2626 * Even though the limit is exceeded at this point, reclaim 2627 * may have been able to free some pages. Retry the charge 2628 * before killing the task. 2629 * 2630 * Only for regular pages, though: huge pages are rather 2631 * unlikely to succeed so close to the limit, and we fall back 2632 * to regular pages anyway in case of failure. 2633 */ 2634 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2635 return CHARGE_RETRY; 2636 2637 /* 2638 * At task move, charge accounts can be doubly counted. So, it's 2639 * better to wait until the end of task_move if something is going on. 
2640 */ 2641 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2642 return CHARGE_RETRY; 2643 2644 /* If we don't need to call the oom-killer at all, return immediately */ 2645 if (!oom_check) 2646 return CHARGE_NOMEM; 2647 /* check OOM */ 2648 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) 2649 return CHARGE_OOM_DIE; 2650 2651 return CHARGE_RETRY; 2652 } 2653 2654 /* 2655 * __mem_cgroup_try_charge() does 2656 * 1. detect the memcg to be charged against from the passed *mm and *ptr, 2657 * 2. update the res_counter, 2658 * 3. call memory reclaim if necessary. 2659 * 2660 * In some special cases, if the task is being killed (fatal_signal_pending()) or 2661 * has TIF_MEMDIE set, this function returns -EINTR while writing root_mem_cgroup 2662 * to *ptr. There are two reasons for this. 1: dying threads should quit as soon 2663 * as possible without any hazards. 2: all pages should have a valid 2664 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2665 * pointer, that is treated as a charge to root_mem_cgroup. 2666 * 2667 * So __mem_cgroup_try_charge() will return 2668 * 0 ... on success, filling *ptr with a valid memcg pointer. 2669 * -ENOMEM ... charge failure because of resource limits. 2670 * -EINTR ... if the thread is being killed. *ptr is filled with root_mem_cgroup. 2671 * 2672 * Unlike the exported interface, an "oom" parameter is added. If oom == true, 2673 * the oom-killer can be invoked. 2674 */ 2675 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2676 gfp_t gfp_mask, 2677 unsigned int nr_pages, 2678 struct mem_cgroup **ptr, 2679 bool oom) 2680 { 2681 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2682 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2683 struct mem_cgroup *memcg = NULL; 2684 int ret; 2685 2686 /* 2687 * Unlike the global VM's OOM kill, we're not in a system-level memory 2688 * shortage here. So, allow dying processes to go ahead, in addition to 2689 * MEMDIE processes. 2690 */ 2691 if (unlikely(test_thread_flag(TIF_MEMDIE) 2692 || fatal_signal_pending(current))) 2693 goto bypass; 2694 2695 /* 2696 * We always charge the cgroup the mm_struct belongs to. 2697 * The mm_struct's mem_cgroup changes on task migration if the 2698 * thread group leader migrates. It's possible that mm is not 2699 * set, if so charge the root memcg (happens for pagecache usage). 2700 */ 2701 if (!*ptr && !mm) 2702 *ptr = root_mem_cgroup; 2703 again: 2704 if (*ptr) { /* css should be a valid one */ 2705 memcg = *ptr; 2706 if (mem_cgroup_is_root(memcg)) 2707 goto done; 2708 if (consume_stock(memcg, nr_pages)) 2709 goto done; 2710 css_get(&memcg->css); 2711 } else { 2712 struct task_struct *p; 2713 2714 rcu_read_lock(); 2715 p = rcu_dereference(mm->owner); 2716 /* 2717 * Because we don't have task_lock(), "p" can exit. 2718 * In that case, "memcg" can point to root, or p can be NULL due 2719 * to a race with swapoff. Then, we have a small risk of 2720 * mis-accounting. But this kind of mis-accounting due to races 2721 * can always happen because we don't hold cgroup_mutex(). Taking 2722 * it would be overkill, so we allow that small race here. 2723 * (*) swapoff etc. will charge against the mm_struct, not the 2724 * task_struct. So, mm->owner can be NULL. 2725 */ 2726 memcg = mem_cgroup_from_task(p); 2727 if (!memcg) 2728 memcg = root_mem_cgroup; 2729 if (mem_cgroup_is_root(memcg)) { 2730 rcu_read_unlock(); 2731 goto done; 2732 } 2733 if (consume_stock(memcg, nr_pages)) { 2734 /* 2735 * It seems dangerous to access memcg without css_get(). 2736 * But considering how consume_stock() works, it's not 2737 * necessary.
If consume_stock success, some charges 2738 * from this memcg are cached on this cpu. So, we 2739 * don't need to call css_get()/css_tryget() before 2740 * calling consume_stock(). 2741 */ 2742 rcu_read_unlock(); 2743 goto done; 2744 } 2745 /* after here, we may be blocked. we need to get refcnt */ 2746 if (!css_tryget(&memcg->css)) { 2747 rcu_read_unlock(); 2748 goto again; 2749 } 2750 rcu_read_unlock(); 2751 } 2752 2753 do { 2754 bool oom_check; 2755 2756 /* If killed, bypass charge */ 2757 if (fatal_signal_pending(current)) { 2758 css_put(&memcg->css); 2759 goto bypass; 2760 } 2761 2762 oom_check = false; 2763 if (oom && !nr_oom_retries) { 2764 oom_check = true; 2765 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2766 } 2767 2768 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, 2769 oom_check); 2770 switch (ret) { 2771 case CHARGE_OK: 2772 break; 2773 case CHARGE_RETRY: /* not in OOM situation but retry */ 2774 batch = nr_pages; 2775 css_put(&memcg->css); 2776 memcg = NULL; 2777 goto again; 2778 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2779 css_put(&memcg->css); 2780 goto nomem; 2781 case CHARGE_NOMEM: /* OOM routine works */ 2782 if (!oom) { 2783 css_put(&memcg->css); 2784 goto nomem; 2785 } 2786 /* If oom, we never return -ENOMEM */ 2787 nr_oom_retries--; 2788 break; 2789 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2790 css_put(&memcg->css); 2791 goto bypass; 2792 } 2793 } while (ret != CHARGE_OK); 2794 2795 if (batch > nr_pages) 2796 refill_stock(memcg, batch - nr_pages); 2797 css_put(&memcg->css); 2798 done: 2799 *ptr = memcg; 2800 return 0; 2801 nomem: 2802 *ptr = NULL; 2803 return -ENOMEM; 2804 bypass: 2805 *ptr = root_mem_cgroup; 2806 return -EINTR; 2807 } 2808 2809 /* 2810 * Somemtimes we have to undo a charge we got by try_charge(). 2811 * This function is for that and do uncharge, put css's refcnt. 2812 * gotten by try_charge(). 2813 */ 2814 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2815 unsigned int nr_pages) 2816 { 2817 if (!mem_cgroup_is_root(memcg)) { 2818 unsigned long bytes = nr_pages * PAGE_SIZE; 2819 2820 res_counter_uncharge(&memcg->res, bytes); 2821 if (do_swap_account) 2822 res_counter_uncharge(&memcg->memsw, bytes); 2823 } 2824 } 2825 2826 /* 2827 * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. 2828 * This is useful when moving usage to parent cgroup. 2829 */ 2830 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2831 unsigned int nr_pages) 2832 { 2833 unsigned long bytes = nr_pages * PAGE_SIZE; 2834 2835 if (mem_cgroup_is_root(memcg)) 2836 return; 2837 2838 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2839 if (do_swap_account) 2840 res_counter_uncharge_until(&memcg->memsw, 2841 memcg->memsw.parent, bytes); 2842 } 2843 2844 /* 2845 * A helper function to get mem_cgroup from ID. must be called under 2846 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2847 * the mem_cgroup is used for charging. (dropping refcnt from swap can be 2848 * called against removed memcg.) 
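 *
 * Usage sketch (illustrative only, mirroring try_get_mem_cgroup_from_page()
 * below):
 *
 *	rcu_read_lock();
 *	memcg = mem_cgroup_lookup(id);
 *	if (memcg && !css_tryget(&memcg->css))
 *		memcg = NULL;
 *	rcu_read_unlock();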
2849 */ 2850 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2851 { 2852 struct cgroup_subsys_state *css; 2853 2854 /* ID 0 is unused ID */ 2855 if (!id) 2856 return NULL; 2857 css = css_lookup(&mem_cgroup_subsys, id); 2858 if (!css) 2859 return NULL; 2860 return mem_cgroup_from_css(css); 2861 } 2862 2863 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2864 { 2865 struct mem_cgroup *memcg = NULL; 2866 struct page_cgroup *pc; 2867 unsigned short id; 2868 swp_entry_t ent; 2869 2870 VM_BUG_ON(!PageLocked(page)); 2871 2872 pc = lookup_page_cgroup(page); 2873 lock_page_cgroup(pc); 2874 if (PageCgroupUsed(pc)) { 2875 memcg = pc->mem_cgroup; 2876 if (memcg && !css_tryget(&memcg->css)) 2877 memcg = NULL; 2878 } else if (PageSwapCache(page)) { 2879 ent.val = page_private(page); 2880 id = lookup_swap_cgroup_id(ent); 2881 rcu_read_lock(); 2882 memcg = mem_cgroup_lookup(id); 2883 if (memcg && !css_tryget(&memcg->css)) 2884 memcg = NULL; 2885 rcu_read_unlock(); 2886 } 2887 unlock_page_cgroup(pc); 2888 return memcg; 2889 } 2890 2891 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2892 struct page *page, 2893 unsigned int nr_pages, 2894 enum charge_type ctype, 2895 bool lrucare) 2896 { 2897 struct page_cgroup *pc = lookup_page_cgroup(page); 2898 struct zone *uninitialized_var(zone); 2899 struct lruvec *lruvec; 2900 bool was_on_lru = false; 2901 bool anon; 2902 2903 lock_page_cgroup(pc); 2904 VM_BUG_ON(PageCgroupUsed(pc)); 2905 /* 2906 * we don't need page_cgroup_lock about tail pages, becase they are not 2907 * accessed by any other context at this point. 2908 */ 2909 2910 /* 2911 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2912 * may already be on some other mem_cgroup's LRU. Take care of it. 2913 */ 2914 if (lrucare) { 2915 zone = page_zone(page); 2916 spin_lock_irq(&zone->lru_lock); 2917 if (PageLRU(page)) { 2918 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2919 ClearPageLRU(page); 2920 del_page_from_lru_list(page, lruvec, page_lru(page)); 2921 was_on_lru = true; 2922 } 2923 } 2924 2925 pc->mem_cgroup = memcg; 2926 /* 2927 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2928 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2929 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2930 * before USED bit, we need memory barrier here. 2931 * See mem_cgroup_add_lru_list(), etc. 2932 */ 2933 smp_wmb(); 2934 SetPageCgroupUsed(pc); 2935 2936 if (lrucare) { 2937 if (was_on_lru) { 2938 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2939 VM_BUG_ON(PageLRU(page)); 2940 SetPageLRU(page); 2941 add_page_to_lru_list(page, lruvec, page_lru(page)); 2942 } 2943 spin_unlock_irq(&zone->lru_lock); 2944 } 2945 2946 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2947 anon = true; 2948 else 2949 anon = false; 2950 2951 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2952 unlock_page_cgroup(pc); 2953 2954 /* 2955 * "charge_statistics" updated event counter. Then, check it. 2956 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2957 * if they exceeds softlimit. 
2958 */ 2959 memcg_check_events(memcg, page); 2960 } 2961 2962 static DEFINE_MUTEX(set_limit_mutex); 2963 2964 #ifdef CONFIG_MEMCG_KMEM 2965 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2966 { 2967 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2968 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 2969 } 2970 2971 /* 2972 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2973 * in the memcg_cache_params struct. 2974 */ 2975 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2976 { 2977 struct kmem_cache *cachep; 2978 2979 VM_BUG_ON(p->is_root_cache); 2980 cachep = p->root_cache; 2981 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; 2982 } 2983 2984 #ifdef CONFIG_SLABINFO 2985 static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, 2986 struct seq_file *m) 2987 { 2988 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2989 struct memcg_cache_params *params; 2990 2991 if (!memcg_can_account_kmem(memcg)) 2992 return -EIO; 2993 2994 print_slabinfo_header(m); 2995 2996 mutex_lock(&memcg->slab_caches_mutex); 2997 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2998 cache_show(memcg_params_to_cache(params), m); 2999 mutex_unlock(&memcg->slab_caches_mutex); 3000 3001 return 0; 3002 } 3003 #endif 3004 3005 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 3006 { 3007 struct res_counter *fail_res; 3008 struct mem_cgroup *_memcg; 3009 int ret = 0; 3010 bool may_oom; 3011 3012 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 3013 if (ret) 3014 return ret; 3015 3016 /* 3017 * Conditions under which we can wait for the oom_killer. Those are 3018 * the same conditions tested by the core page allocator 3019 */ 3020 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); 3021 3022 _memcg = memcg; 3023 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 3024 &_memcg, may_oom); 3025 3026 if (ret == -EINTR) { 3027 /* 3028 * __mem_cgroup_try_charge() chosed to bypass to root due to 3029 * OOM kill or fatal signal. Since our only options are to 3030 * either fail the allocation or charge it to this cgroup, do 3031 * it as a temporary condition. But we can't fail. From a 3032 * kmem/slab perspective, the cache has already been selected, 3033 * by mem_cgroup_kmem_get_cache(), so it is too late to change 3034 * our minds. 3035 * 3036 * This condition will only trigger if the task entered 3037 * memcg_charge_kmem in a sane state, but was OOM-killed during 3038 * __mem_cgroup_try_charge() above. 
Tasks that were already 3039 * dying when the allocation triggers should have been already 3040 * directed to the root cgroup in memcontrol.h 3041 */ 3042 res_counter_charge_nofail(&memcg->res, size, &fail_res); 3043 if (do_swap_account) 3044 res_counter_charge_nofail(&memcg->memsw, size, 3045 &fail_res); 3046 ret = 0; 3047 } else if (ret) 3048 res_counter_uncharge(&memcg->kmem, size); 3049 3050 return ret; 3051 } 3052 3053 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 3054 { 3055 res_counter_uncharge(&memcg->res, size); 3056 if (do_swap_account) 3057 res_counter_uncharge(&memcg->memsw, size); 3058 3059 /* Not down to 0 */ 3060 if (res_counter_uncharge(&memcg->kmem, size)) 3061 return; 3062 3063 if (memcg_kmem_test_and_clear_dead(memcg)) 3064 mem_cgroup_put(memcg); 3065 } 3066 3067 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) 3068 { 3069 if (!memcg) 3070 return; 3071 3072 mutex_lock(&memcg->slab_caches_mutex); 3073 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 3074 mutex_unlock(&memcg->slab_caches_mutex); 3075 } 3076 3077 /* 3078 * helper for acessing a memcg's index. It will be used as an index in the 3079 * child cache array in kmem_cache, and also to derive its name. This function 3080 * will return -1 when this is not a kmem-limited memcg. 3081 */ 3082 int memcg_cache_id(struct mem_cgroup *memcg) 3083 { 3084 return memcg ? memcg->kmemcg_id : -1; 3085 } 3086 3087 /* 3088 * This ends up being protected by the set_limit mutex, during normal 3089 * operation, because that is its main call site. 3090 * 3091 * But when we create a new cache, we can call this as well if its parent 3092 * is kmem-limited. That will have to hold set_limit_mutex as well. 3093 */ 3094 int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3095 { 3096 int num, ret; 3097 3098 num = ida_simple_get(&kmem_limited_groups, 3099 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 3100 if (num < 0) 3101 return num; 3102 /* 3103 * After this point, kmem_accounted (that we test atomically in 3104 * the beginning of this conditional), is no longer 0. This 3105 * guarantees only one process will set the following boolean 3106 * to true. We don't need test_and_set because we're protected 3107 * by the set_limit_mutex anyway. 3108 */ 3109 memcg_kmem_set_activated(memcg); 3110 3111 ret = memcg_update_all_caches(num+1); 3112 if (ret) { 3113 ida_simple_remove(&kmem_limited_groups, num); 3114 memcg_kmem_clear_activated(memcg); 3115 return ret; 3116 } 3117 3118 memcg->kmemcg_id = num; 3119 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 3120 mutex_init(&memcg->slab_caches_mutex); 3121 return 0; 3122 } 3123 3124 static size_t memcg_caches_array_size(int num_groups) 3125 { 3126 ssize_t size; 3127 if (num_groups <= 0) 3128 return 0; 3129 3130 size = 2 * num_groups; 3131 if (size < MEMCG_CACHES_MIN_SIZE) 3132 size = MEMCG_CACHES_MIN_SIZE; 3133 else if (size > MEMCG_CACHES_MAX_SIZE) 3134 size = MEMCG_CACHES_MAX_SIZE; 3135 3136 return size; 3137 } 3138 3139 /* 3140 * We should update the current array size iff all caches updates succeed. This 3141 * can only be done from the slab side. The slab mutex needs to be held when 3142 * calling this. 
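 *
 * Sizing example (illustrative only): memcg_caches_array_size() above
 * doubles the requested group count, so num == 3 asks for a 6-entry
 * per-memcg cache array, subject to the [MEMCG_CACHES_MIN_SIZE,
 * MEMCG_CACHES_MAX_SIZE] clamp.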
3143 */ 3144 void memcg_update_array_size(int num) 3145 { 3146 if (num > memcg_limited_groups_array_size) 3147 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3148 } 3149 3150 static void kmem_cache_destroy_work_func(struct work_struct *w); 3151 3152 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3153 { 3154 struct memcg_cache_params *cur_params = s->memcg_params; 3155 3156 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); 3157 3158 if (num_groups > memcg_limited_groups_array_size) { 3159 int i; 3160 ssize_t size = memcg_caches_array_size(num_groups); 3161 3162 size *= sizeof(void *); 3163 size += sizeof(struct memcg_cache_params); 3164 3165 s->memcg_params = kzalloc(size, GFP_KERNEL); 3166 if (!s->memcg_params) { 3167 s->memcg_params = cur_params; 3168 return -ENOMEM; 3169 } 3170 3171 s->memcg_params->is_root_cache = true; 3172 3173 /* 3174 * There is the chance it will be bigger than 3175 * memcg_limited_groups_array_size, if we failed an allocation 3176 * in a cache, in which case all caches updated before it, will 3177 * have a bigger array. 3178 * 3179 * But if that is the case, the data after 3180 * memcg_limited_groups_array_size is certainly unused 3181 */ 3182 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3183 if (!cur_params->memcg_caches[i]) 3184 continue; 3185 s->memcg_params->memcg_caches[i] = 3186 cur_params->memcg_caches[i]; 3187 } 3188 3189 /* 3190 * Ideally, we would wait until all caches succeed, and only 3191 * then free the old one. But this is not worth the extra 3192 * pointer per-cache we'd have to have for this. 3193 * 3194 * It is not a big deal if some caches are left with a size 3195 * bigger than the others. And all updates will reset this 3196 * anyway. 3197 */ 3198 kfree(cur_params); 3199 } 3200 return 0; 3201 } 3202 3203 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3204 struct kmem_cache *root_cache) 3205 { 3206 size_t size = sizeof(struct memcg_cache_params); 3207 3208 if (!memcg_kmem_enabled()) 3209 return 0; 3210 3211 if (!memcg) 3212 size += memcg_limited_groups_array_size * sizeof(void *); 3213 3214 s->memcg_params = kzalloc(size, GFP_KERNEL); 3215 if (!s->memcg_params) 3216 return -ENOMEM; 3217 3218 INIT_WORK(&s->memcg_params->destroy, 3219 kmem_cache_destroy_work_func); 3220 if (memcg) { 3221 s->memcg_params->memcg = memcg; 3222 s->memcg_params->root_cache = root_cache; 3223 } else 3224 s->memcg_params->is_root_cache = true; 3225 3226 return 0; 3227 } 3228 3229 void memcg_release_cache(struct kmem_cache *s) 3230 { 3231 struct kmem_cache *root; 3232 struct mem_cgroup *memcg; 3233 int id; 3234 3235 /* 3236 * This happens, for instance, when a root cache goes away before we 3237 * add any memcg. 3238 */ 3239 if (!s->memcg_params) 3240 return; 3241 3242 if (s->memcg_params->is_root_cache) 3243 goto out; 3244 3245 memcg = s->memcg_params->memcg; 3246 id = memcg_cache_id(memcg); 3247 3248 root = s->memcg_params->root_cache; 3249 root->memcg_params->memcg_caches[id] = NULL; 3250 3251 mutex_lock(&memcg->slab_caches_mutex); 3252 list_del(&s->memcg_params->list); 3253 mutex_unlock(&memcg->slab_caches_mutex); 3254 3255 mem_cgroup_put(memcg); 3256 out: 3257 kfree(s->memcg_params); 3258 } 3259 3260 /* 3261 * During the creation a new cache, we need to disable our accounting mechanism 3262 * altogether. This is true even if we are not creating, but rather just 3263 * enqueing new caches to be created. 
3264 * 3265 * This is because that process will trigger allocations; some visible, like 3266 * explicit kmallocs to auxiliary data structures, name strings and internal 3267 * cache structures; some well concealed, like INIT_WORK() that can allocate 3268 * objects during debug. 3269 * 3270 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3271 * to it. This may not be a bounded recursion: since the first cache creation 3272 * failed to complete (waiting on the allocation), we'll just try to create the 3273 * cache again, failing at the same point. 3274 * 3275 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3276 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3277 * inside the following two functions. 3278 */ 3279 static inline void memcg_stop_kmem_account(void) 3280 { 3281 VM_BUG_ON(!current->mm); 3282 current->memcg_kmem_skip_account++; 3283 } 3284 3285 static inline void memcg_resume_kmem_account(void) 3286 { 3287 VM_BUG_ON(!current->mm); 3288 current->memcg_kmem_skip_account--; 3289 } 3290 3291 static void kmem_cache_destroy_work_func(struct work_struct *w) 3292 { 3293 struct kmem_cache *cachep; 3294 struct memcg_cache_params *p; 3295 3296 p = container_of(w, struct memcg_cache_params, destroy); 3297 3298 cachep = memcg_params_to_cache(p); 3299 3300 /* 3301 * If we get down to 0 after shrink, we could delete right away. 3302 * However, memcg_release_pages() already puts us back in the workqueue 3303 * in that case. If we proceed deleting, we'll get a dangling 3304 * reference, and removing the object from the workqueue in that case 3305 * is unnecessary complication. We are not a fast path. 3306 * 3307 * Note that this case is fundamentally different from racing with 3308 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in 3309 * kmem_cache_shrink, not only we would be reinserting a dead cache 3310 * into the queue, but doing so from inside the worker racing to 3311 * destroy it. 3312 * 3313 * So if we aren't down to zero, we'll just schedule a worker and try 3314 * again 3315 */ 3316 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3317 kmem_cache_shrink(cachep); 3318 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3319 return; 3320 } else 3321 kmem_cache_destroy(cachep); 3322 } 3323 3324 void mem_cgroup_destroy_cache(struct kmem_cache *cachep) 3325 { 3326 if (!cachep->memcg_params->dead) 3327 return; 3328 3329 /* 3330 * There are many ways in which we can get here. 3331 * 3332 * We can get to a memory-pressure situation while the delayed work is 3333 * still pending to run. The vmscan shrinkers can then release all 3334 * cache memory and get us to destruction. If this is the case, we'll 3335 * be executed twice, which is a bug (the second time will execute over 3336 * bogus data). In this case, cancelling the work should be fine. 3337 * 3338 * But we can also get here from the worker itself, if 3339 * kmem_cache_shrink is enough to shake all the remaining objects and 3340 * get the page count to 0. In this case, we'll deadlock if we try to 3341 * cancel the work (the worker runs with an internal lock held, which 3342 * is the same lock we would hold for cancel_work_sync().) 
3343 * 3344 * Since we can't possibly know who got us here, just refrain from 3345 * running if there is already work pending. 3346 */ 3347 if (work_pending(&cachep->memcg_params->destroy)) 3348 return; 3349 /* 3350 * We have to defer the actual destroying to a workqueue, because 3351 * we might currently be in a context that cannot sleep. 3352 */ 3353 schedule_work(&cachep->memcg_params->destroy); 3354 } 3355 3356 /* 3357 * This lock protects updaters, not readers. We want readers to be as fast as 3358 * they can, and they will either see NULL or a valid cache value. Our model 3359 * allows them to see NULL, in which case the root memcg will be selected. 3360 * 3361 * We need this lock because multiple allocations to the same cache may span 3362 * more than one worker. Only one of them can create the cache. 3363 */ 3364 static DEFINE_MUTEX(memcg_cache_mutex); 3365 3366 /* 3367 * Called with memcg_cache_mutex held 3368 */ 3369 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, 3370 struct kmem_cache *s) 3371 { 3372 struct kmem_cache *new; 3373 static char *tmp_name = NULL; 3374 3375 lockdep_assert_held(&memcg_cache_mutex); 3376 3377 /* 3378 * kmem_cache_create_memcg duplicates the given name, and cgroup_name(), 3379 * which is needed to build this name, requires an RCU context. 3380 * This static temporary buffer is used to prevent a pointless 3381 * short-lived allocation. 3382 */ 3383 if (!tmp_name) { 3384 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3385 if (!tmp_name) 3386 return NULL; 3387 } 3388 3389 rcu_read_lock(); 3390 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3391 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3392 rcu_read_unlock(); 3393 3394 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3395 (s->flags & ~SLAB_PANIC), s->ctor, s); 3396 3397 if (new) 3398 new->allocflags |= __GFP_KMEMCG; 3399 3400 return new; 3401 } 3402 3403 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3404 struct kmem_cache *cachep) 3405 { 3406 struct kmem_cache *new_cachep; 3407 int idx; 3408 3409 BUG_ON(!memcg_can_account_kmem(memcg)); 3410 3411 idx = memcg_cache_id(memcg); 3412 3413 mutex_lock(&memcg_cache_mutex); 3414 new_cachep = cachep->memcg_params->memcg_caches[idx]; 3415 if (new_cachep) 3416 goto out; 3417 3418 new_cachep = kmem_cache_dup(memcg, cachep); 3419 if (new_cachep == NULL) { 3420 new_cachep = cachep; 3421 goto out; 3422 } 3423 3424 mem_cgroup_get(memcg); 3425 atomic_set(&new_cachep->memcg_params->nr_pages, 0); 3426 3427 cachep->memcg_params->memcg_caches[idx] = new_cachep; 3428 /* 3429 * The readers won't take the lock; make sure everybody sees the updated 3430 * value, so they won't put stuff in the queue again for no reason. 3431 */ 3432 wmb(); 3433 out: 3434 mutex_unlock(&memcg_cache_mutex); 3435 return new_cachep; 3436 } 3437 3438 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3439 { 3440 struct kmem_cache *c; 3441 int i; 3442 3443 if (!s->memcg_params) 3444 return; 3445 if (!s->memcg_params->is_root_cache) 3446 return; 3447 3448 /* 3449 * If the cache is being destroyed, we trust that there is no one else 3450 * requesting objects from it. Even if there are, the sanity checks in 3451 * kmem_cache_destroy() should catch this ill case. 3452 * 3453 * Still, we don't want anyone else freeing memcg_caches under our 3454 * noses, which can happen if a new memcg comes to life. As usual, 3455 * we'll take the set_limit_mutex to protect ourselves against this.
3456 */ 3457 mutex_lock(&set_limit_mutex); 3458 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3459 c = s->memcg_params->memcg_caches[i]; 3460 if (!c) 3461 continue; 3462 3463 /* 3464 * We will now manually delete the caches, so to avoid races 3465 * we need to cancel all pending destruction workers and 3466 * proceed with destruction ourselves. 3467 * 3468 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3469 * and that could spawn the workers again: it is likely that 3470 * the cache still have active pages until this very moment. 3471 * This would lead us back to mem_cgroup_destroy_cache. 3472 * 3473 * But that will not execute at all if the "dead" flag is not 3474 * set, so flip it down to guarantee we are in control. 3475 */ 3476 c->memcg_params->dead = false; 3477 cancel_work_sync(&c->memcg_params->destroy); 3478 kmem_cache_destroy(c); 3479 } 3480 mutex_unlock(&set_limit_mutex); 3481 } 3482 3483 struct create_work { 3484 struct mem_cgroup *memcg; 3485 struct kmem_cache *cachep; 3486 struct work_struct work; 3487 }; 3488 3489 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3490 { 3491 struct kmem_cache *cachep; 3492 struct memcg_cache_params *params; 3493 3494 if (!memcg_kmem_is_active(memcg)) 3495 return; 3496 3497 mutex_lock(&memcg->slab_caches_mutex); 3498 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3499 cachep = memcg_params_to_cache(params); 3500 cachep->memcg_params->dead = true; 3501 schedule_work(&cachep->memcg_params->destroy); 3502 } 3503 mutex_unlock(&memcg->slab_caches_mutex); 3504 } 3505 3506 static void memcg_create_cache_work_func(struct work_struct *w) 3507 { 3508 struct create_work *cw; 3509 3510 cw = container_of(w, struct create_work, work); 3511 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3512 /* Drop the reference gotten when we enqueued. */ 3513 css_put(&cw->memcg->css); 3514 kfree(cw); 3515 } 3516 3517 /* 3518 * Enqueue the creation of a per-memcg kmem_cache. 3519 */ 3520 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3521 struct kmem_cache *cachep) 3522 { 3523 struct create_work *cw; 3524 3525 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3526 if (cw == NULL) { 3527 css_put(&memcg->css); 3528 return; 3529 } 3530 3531 cw->memcg = memcg; 3532 cw->cachep = cachep; 3533 3534 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3535 schedule_work(&cw->work); 3536 } 3537 3538 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3539 struct kmem_cache *cachep) 3540 { 3541 /* 3542 * We need to stop accounting when we kmalloc, because if the 3543 * corresponding kmalloc cache is not yet created, the first allocation 3544 * in __memcg_create_cache_enqueue will recurse. 3545 * 3546 * However, it is better to enclose the whole function. Depending on 3547 * the debugging options enabled, INIT_WORK(), for instance, can 3548 * trigger an allocation. This too, will make us recurse. Because at 3549 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3550 * the safest choice is to do it like this, wrapping the whole function. 3551 */ 3552 memcg_stop_kmem_account(); 3553 __memcg_create_cache_enqueue(memcg, cachep); 3554 memcg_resume_kmem_account(); 3555 } 3556 /* 3557 * Return the kmem_cache we're supposed to use for a slab allocation. 3558 * We try to use the current memcg's version of the cache. 
3559 * 3560 * If the cache does not exist yet and we are the first user of it, 3561 * we either create it immediately, if possible, or create it asynchronously 3562 * in a workqueue. 3563 * In the latter case, we will let the current allocation go through with 3564 * the original cache. 3565 * 3566 * Can't be called in interrupt context or from kernel threads. 3567 * This function needs to be called with rcu_read_lock() held. 3568 */ 3569 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3570 gfp_t gfp) 3571 { 3572 struct mem_cgroup *memcg; 3573 int idx; 3574 3575 VM_BUG_ON(!cachep->memcg_params); 3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3577 3578 if (!current->mm || current->memcg_kmem_skip_account) 3579 return cachep; 3580 3581 rcu_read_lock(); 3582 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3583 3584 if (!memcg_can_account_kmem(memcg)) 3585 goto out; 3586 3587 idx = memcg_cache_id(memcg); 3588 3589 /* 3590 * Barrier to make sure we're always seeing the up-to-date value. The 3591 * code updating memcg_caches will issue a write barrier to match this. 3592 */ 3593 read_barrier_depends(); 3594 if (likely(cachep->memcg_params->memcg_caches[idx])) { 3595 cachep = cachep->memcg_params->memcg_caches[idx]; 3596 goto out; 3597 } 3598 3599 /* The corresponding put will be done in the workqueue. */ 3600 if (!css_tryget(&memcg->css)) 3601 goto out; 3602 rcu_read_unlock(); 3603 3604 /* 3605 * If we are in a safe context (can wait, and not in interrupt 3606 * context), we could be predictable and return right away. 3607 * This would guarantee that the allocation being performed 3608 * already belongs in the new cache. 3609 * 3610 * However, there are some clashes that can arise from locking. 3611 * For instance, because we acquire the slab_mutex while doing 3612 * kmem_cache_dup(), no further allocation could happen 3613 * with the slab_mutex held. 3614 * 3615 * Also, because cache creation issues get_online_cpus(), this 3616 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3617 * which ends up reversed during cpu hotplug. (cpuset allocates 3618 * a bunch of GFP_KERNEL memory during cpu-up.) Due to all that, 3619 * it is better to defer everything. 3620 */ 3621 memcg_create_cache_enqueue(memcg, cachep); 3622 return cachep; 3623 out: 3624 rcu_read_unlock(); 3625 return cachep; 3626 } 3627 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3628 3629 /* 3630 * We need to verify if the allocation against current->mm->owner's memcg is 3631 * possible for the given order. But the page is not allocated yet, so we'll 3632 * need a further commit step to do the final arrangements. 3633 * 3634 * It is possible for the task to switch cgroups in the meantime, so at 3635 * commit time, we can't rely on task conversion any longer. We'll then use 3636 * the handle argument to return to the caller which cgroup we should commit 3637 * against. We could also return the memcg directly and avoid the pointer 3638 * passing, but a boolean return value gives better semantics considering 3639 * the compiled-out case as well. 3640 * 3641 * Returning true means the allocation is possible. 3642 */ 3643 bool 3644 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3645 { 3646 struct mem_cgroup *memcg; 3647 int ret; 3648 3649 *_memcg = NULL; 3650 memcg = try_get_mem_cgroup_from_mm(current->mm); 3651 3652 /* 3653 * Very rare case, described in mem_cgroup_from_task().
Unfortunately there 3654 * isn't much we can do without complicating this too much, and it would 3655 * be gfp-dependent anyway. Just let it go 3656 */ 3657 if (unlikely(!memcg)) 3658 return true; 3659 3660 if (!memcg_can_account_kmem(memcg)) { 3661 css_put(&memcg->css); 3662 return true; 3663 } 3664 3665 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3666 if (!ret) 3667 *_memcg = memcg; 3668 3669 css_put(&memcg->css); 3670 return (ret == 0); 3671 } 3672 3673 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3674 int order) 3675 { 3676 struct page_cgroup *pc; 3677 3678 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3679 3680 /* The page allocation failed. Revert */ 3681 if (!page) { 3682 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3683 return; 3684 } 3685 3686 pc = lookup_page_cgroup(page); 3687 lock_page_cgroup(pc); 3688 pc->mem_cgroup = memcg; 3689 SetPageCgroupUsed(pc); 3690 unlock_page_cgroup(pc); 3691 } 3692 3693 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3694 { 3695 struct mem_cgroup *memcg = NULL; 3696 struct page_cgroup *pc; 3697 3698 3699 pc = lookup_page_cgroup(page); 3700 /* 3701 * Fast unlocked return. Theoretically might have changed, have to 3702 * check again after locking. 3703 */ 3704 if (!PageCgroupUsed(pc)) 3705 return; 3706 3707 lock_page_cgroup(pc); 3708 if (PageCgroupUsed(pc)) { 3709 memcg = pc->mem_cgroup; 3710 ClearPageCgroupUsed(pc); 3711 } 3712 unlock_page_cgroup(pc); 3713 3714 /* 3715 * We trust that only if there is a memcg associated with the page, it 3716 * is a valid allocation 3717 */ 3718 if (!memcg) 3719 return; 3720 3721 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3722 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3723 } 3724 #else 3725 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3726 { 3727 } 3728 #endif /* CONFIG_MEMCG_KMEM */ 3729 3730 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3731 3732 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3733 /* 3734 * Because tail pages are not marked as "used", set it. We're under 3735 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3736 * charge/uncharge will be never happen and move_account() is done under 3737 * compound_lock(), so we don't have to take care of races. 3738 */ 3739 void mem_cgroup_split_huge_fixup(struct page *head) 3740 { 3741 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3742 struct page_cgroup *pc; 3743 struct mem_cgroup *memcg; 3744 int i; 3745 3746 if (mem_cgroup_disabled()) 3747 return; 3748 3749 memcg = head_pc->mem_cgroup; 3750 for (i = 1; i < HPAGE_PMD_NR; i++) { 3751 pc = head_pc + i; 3752 pc->mem_cgroup = memcg; 3753 smp_wmb();/* see __commit_charge() */ 3754 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3755 } 3756 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3757 HPAGE_PMD_NR); 3758 } 3759 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3760 3761 /** 3762 * mem_cgroup_move_account - move account of the page 3763 * @page: the page 3764 * @nr_pages: number of regular pages (>1 for huge pages) 3765 * @pc: page_cgroup of the page. 3766 * @from: mem_cgroup which the page is moved from. 3767 * @to: mem_cgroup which the page is moved to. @from != @to. 3768 * 3769 * The caller must confirm following. 3770 * - page is not on LRU (isolate_page() is useful.) 3771 * - compound_lock is held when nr_pages > 1 3772 * 3773 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3774 * from old cgroup. 
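 *
 * For example, mem_cgroup_move_parent() below compensates for this by
 * calling __mem_cgroup_cancel_local_charge() on the child after a
 * successful move, which drops the child's res_counter charge up to
 * (but not including) the parent.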
3775 */ 3776 static int mem_cgroup_move_account(struct page *page, 3777 unsigned int nr_pages, 3778 struct page_cgroup *pc, 3779 struct mem_cgroup *from, 3780 struct mem_cgroup *to) 3781 { 3782 unsigned long flags; 3783 int ret; 3784 bool anon = PageAnon(page); 3785 3786 VM_BUG_ON(from == to); 3787 VM_BUG_ON(PageLRU(page)); 3788 /* 3789 * The page is isolated from LRU. So, collapse function 3790 * will not handle this page. But page splitting can happen. 3791 * Do this check under compound_page_lock(). The caller should 3792 * hold it. 3793 */ 3794 ret = -EBUSY; 3795 if (nr_pages > 1 && !PageTransHuge(page)) 3796 goto out; 3797 3798 lock_page_cgroup(pc); 3799 3800 ret = -EINVAL; 3801 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3802 goto unlock; 3803 3804 move_lock_mem_cgroup(from, &flags); 3805 3806 if (!anon && page_mapped(page)) { 3807 /* Update mapped_file data for mem_cgroup */ 3808 preempt_disable(); 3809 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3810 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3811 preempt_enable(); 3812 } 3813 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3814 3815 /* caller should have done css_get */ 3816 pc->mem_cgroup = to; 3817 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3818 move_unlock_mem_cgroup(from, &flags); 3819 ret = 0; 3820 unlock: 3821 unlock_page_cgroup(pc); 3822 /* 3823 * check events 3824 */ 3825 memcg_check_events(to, page); 3826 memcg_check_events(from, page); 3827 out: 3828 return ret; 3829 } 3830 3831 /** 3832 * mem_cgroup_move_parent - moves page to the parent group 3833 * @page: the page to move 3834 * @pc: page_cgroup of the page 3835 * @child: page's cgroup 3836 * 3837 * move charges to its parent or the root cgroup if the group has no 3838 * parent (aka use_hierarchy==0). 3839 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3840 * mem_cgroup_move_account fails) the failure is always temporary and 3841 * it signals a race with a page removal/uncharge or migration. In the 3842 * first case the page is on the way out and it will vanish from the LRU 3843 * on the next attempt and the call should be retried later. 3844 * Isolation from the LRU fails only if page has been isolated from 3845 * the LRU since we looked at it and that usually means either global 3846 * reclaim or migration going on. The page will either get back to the 3847 * LRU or vanish. 3848 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3849 * (!PageCgroupUsed) or moved to a different group. The page will 3850 * disappear in the next attempt. 3851 */ 3852 static int mem_cgroup_move_parent(struct page *page, 3853 struct page_cgroup *pc, 3854 struct mem_cgroup *child) 3855 { 3856 struct mem_cgroup *parent; 3857 unsigned int nr_pages; 3858 unsigned long uninitialized_var(flags); 3859 int ret; 3860 3861 VM_BUG_ON(mem_cgroup_is_root(child)); 3862 3863 ret = -EBUSY; 3864 if (!get_page_unless_zero(page)) 3865 goto out; 3866 if (isolate_lru_page(page)) 3867 goto put; 3868 3869 nr_pages = hpage_nr_pages(page); 3870 3871 parent = parent_mem_cgroup(child); 3872 /* 3873 * If no parent, move charges to root cgroup. 
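 *
 * Note that mem_cgroup_move_account() neither charges the parent nor
 * uncharges the child (see its comment above); on success the child's
 * local charge is dropped via __mem_cgroup_cancel_local_charge() below.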
3874 */ 3875 if (!parent) 3876 parent = root_mem_cgroup; 3877 3878 if (nr_pages > 1) { 3879 VM_BUG_ON(!PageTransHuge(page)); 3880 flags = compound_lock_irqsave(page); 3881 } 3882 3883 ret = mem_cgroup_move_account(page, nr_pages, 3884 pc, child, parent); 3885 if (!ret) 3886 __mem_cgroup_cancel_local_charge(child, nr_pages); 3887 3888 if (nr_pages > 1) 3889 compound_unlock_irqrestore(page, flags); 3890 putback_lru_page(page); 3891 put: 3892 put_page(page); 3893 out: 3894 return ret; 3895 } 3896 3897 /* 3898 * Charge the memory controller for page usage. 3899 * Return 3900 * 0 if the charge was successful 3901 * < 0 if the cgroup is over its limit 3902 */ 3903 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 3904 gfp_t gfp_mask, enum charge_type ctype) 3905 { 3906 struct mem_cgroup *memcg = NULL; 3907 unsigned int nr_pages = 1; 3908 bool oom = true; 3909 int ret; 3910 3911 if (PageTransHuge(page)) { 3912 nr_pages <<= compound_order(page); 3913 VM_BUG_ON(!PageTransHuge(page)); 3914 /* 3915 * Never OOM-kill a process for a huge page. The 3916 * fault handler will fall back to regular pages. 3917 */ 3918 oom = false; 3919 } 3920 3921 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 3922 if (ret == -ENOMEM) 3923 return ret; 3924 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 3925 return 0; 3926 } 3927 3928 int mem_cgroup_newpage_charge(struct page *page, 3929 struct mm_struct *mm, gfp_t gfp_mask) 3930 { 3931 if (mem_cgroup_disabled()) 3932 return 0; 3933 VM_BUG_ON(page_mapped(page)); 3934 VM_BUG_ON(page->mapping && !PageAnon(page)); 3935 VM_BUG_ON(!mm); 3936 return mem_cgroup_charge_common(page, mm, gfp_mask, 3937 MEM_CGROUP_CHARGE_TYPE_ANON); 3938 } 3939 3940 /* 3941 * While swap-in, try_charge -> commit or cancel, the page is locked. 3942 * And when try_charge() successfully returns, one refcnt to memcg without 3943 * struct page_cgroup is acquired. This refcnt will be consumed by 3944 * "commit()" or removed by "cancel()" 3945 */ 3946 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 3947 struct page *page, 3948 gfp_t mask, 3949 struct mem_cgroup **memcgp) 3950 { 3951 struct mem_cgroup *memcg; 3952 struct page_cgroup *pc; 3953 int ret; 3954 3955 pc = lookup_page_cgroup(page); 3956 /* 3957 * Every swap fault against a single page tries to charge the 3958 * page, bail as early as possible. shmem_unuse() encounters 3959 * already charged pages, too. The USED bit is protected by 3960 * the page lock, which serializes swap cache removal, which 3961 * in turn serializes uncharging. 
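 *
 * The memcg to charge is picked as follows: with do_swap_account
 * enabled we first try the memcg that try_get_mem_cgroup_from_page()
 * finds for this swapcache page; otherwise, or if that lookup fails,
 * we fall back to charging the memcg of the mm passed in.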
3962 */ 3963 if (PageCgroupUsed(pc)) 3964 return 0; 3965 if (!do_swap_account) 3966 goto charge_cur_mm; 3967 memcg = try_get_mem_cgroup_from_page(page); 3968 if (!memcg) 3969 goto charge_cur_mm; 3970 *memcgp = memcg; 3971 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 3972 css_put(&memcg->css); 3973 if (ret == -EINTR) 3974 ret = 0; 3975 return ret; 3976 charge_cur_mm: 3977 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 3978 if (ret == -EINTR) 3979 ret = 0; 3980 return ret; 3981 } 3982 3983 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 3984 gfp_t gfp_mask, struct mem_cgroup **memcgp) 3985 { 3986 *memcgp = NULL; 3987 if (mem_cgroup_disabled()) 3988 return 0; 3989 /* 3990 * A racing thread's fault, or swapoff, may have already 3991 * updated the pte, and even removed the page from swap cache: in 3992 * those cases unuse_pte()'s pte_same() test will fail; but 3993 * there's also a KSM case which does need to charge the page. 3994 */ 3995 if (!PageSwapCache(page)) { 3996 int ret; 3997 3998 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); 3999 if (ret == -EINTR) 4000 ret = 0; 4001 return ret; 4002 } 4003 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 4004 } 4005 4006 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 4007 { 4008 if (mem_cgroup_disabled()) 4009 return; 4010 if (!memcg) 4011 return; 4012 __mem_cgroup_cancel_charge(memcg, 1); 4013 } 4014 4015 static void 4016 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 4017 enum charge_type ctype) 4018 { 4019 if (mem_cgroup_disabled()) 4020 return; 4021 if (!memcg) 4022 return; 4023 4024 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 4025 /* 4026 * Now swap is on-memory. This means this page may be 4027 * counted both as mem and swap -- a double count. 4028 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 4029 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 4030 * may call delete_from_swap_cache() before we reach here. 4031 */ 4032 if (do_swap_account && PageSwapCache(page)) { 4033 swp_entry_t ent = {.val = page_private(page)}; 4034 mem_cgroup_uncharge_swap(ent); 4035 } 4036 } 4037 4038 void mem_cgroup_commit_charge_swapin(struct page *page, 4039 struct mem_cgroup *memcg) 4040 { 4041 __mem_cgroup_commit_charge_swapin(page, memcg, 4042 MEM_CGROUP_CHARGE_TYPE_ANON); 4043 } 4044 4045 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 4046 gfp_t gfp_mask) 4047 { 4048 struct mem_cgroup *memcg = NULL; 4049 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4050 int ret; 4051 4052 if (mem_cgroup_disabled()) 4053 return 0; 4054 if (PageCompound(page)) 4055 return 0; 4056 4057 if (!PageSwapCache(page)) 4058 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 4059 else { /* page is swapcache/shmem */ 4060 ret = __mem_cgroup_try_charge_swapin(mm, page, 4061 gfp_mask, &memcg); 4062 if (!ret) 4063 __mem_cgroup_commit_charge_swapin(page, memcg, type); 4064 } 4065 return ret; 4066 } 4067 4068 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 4069 unsigned int nr_pages, 4070 const enum charge_type ctype) 4071 { 4072 struct memcg_batch_info *batch = NULL; 4073 bool uncharge_memsw = true; 4074 4075 /* If swapout, usage of swap doesn't decrease */ 4076 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 4077 uncharge_memsw = false; 4078 4079 batch = &current->memcg_batch; 4080 /* 4081 * Usually, we do css_get() when we remember a memcg pointer.
4082 * But in this case, we keep res->usage until end of a series of 4083 * uncharges. Then, it's ok to ignore memcg's refcnt. 4084 */ 4085 if (!batch->memcg) 4086 batch->memcg = memcg; 4087 /* 4088 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 4089 * In those cases, all pages freed continuously can be expected to be in 4090 * the same cgroup and we have chance to coalesce uncharges. 4091 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 4092 * because we want to do uncharge as soon as possible. 4093 */ 4094 4095 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 4096 goto direct_uncharge; 4097 4098 if (nr_pages > 1) 4099 goto direct_uncharge; 4100 4101 /* 4102 * In typical case, batch->memcg == mem. This means we can 4103 * merge a series of uncharges to an uncharge of res_counter. 4104 * If not, we uncharge res_counter ony by one. 4105 */ 4106 if (batch->memcg != memcg) 4107 goto direct_uncharge; 4108 /* remember freed charge and uncharge it later */ 4109 batch->nr_pages++; 4110 if (uncharge_memsw) 4111 batch->memsw_nr_pages++; 4112 return; 4113 direct_uncharge: 4114 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4115 if (uncharge_memsw) 4116 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4117 if (unlikely(batch->memcg != memcg)) 4118 memcg_oom_recover(memcg); 4119 } 4120 4121 /* 4122 * uncharge if !page_mapped(page) 4123 */ 4124 static struct mem_cgroup * 4125 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4126 bool end_migration) 4127 { 4128 struct mem_cgroup *memcg = NULL; 4129 unsigned int nr_pages = 1; 4130 struct page_cgroup *pc; 4131 bool anon; 4132 4133 if (mem_cgroup_disabled()) 4134 return NULL; 4135 4136 if (PageTransHuge(page)) { 4137 nr_pages <<= compound_order(page); 4138 VM_BUG_ON(!PageTransHuge(page)); 4139 } 4140 /* 4141 * Check if our page_cgroup is valid 4142 */ 4143 pc = lookup_page_cgroup(page); 4144 if (unlikely(!PageCgroupUsed(pc))) 4145 return NULL; 4146 4147 lock_page_cgroup(pc); 4148 4149 memcg = pc->mem_cgroup; 4150 4151 if (!PageCgroupUsed(pc)) 4152 goto unlock_out; 4153 4154 anon = PageAnon(page); 4155 4156 switch (ctype) { 4157 case MEM_CGROUP_CHARGE_TYPE_ANON: 4158 /* 4159 * Generally PageAnon tells if it's the anon statistics to be 4160 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4161 * used before page reached the stage of being marked PageAnon. 4162 */ 4163 anon = true; 4164 /* fallthrough */ 4165 case MEM_CGROUP_CHARGE_TYPE_DROP: 4166 /* See mem_cgroup_prepare_migration() */ 4167 if (page_mapped(page)) 4168 goto unlock_out; 4169 /* 4170 * Pages under migration may not be uncharged. But 4171 * end_migration() /must/ be the one uncharging the 4172 * unused post-migration page and so it has to call 4173 * here with the migration bit still set. See the 4174 * res_counter handling below. 4175 */ 4176 if (!end_migration && PageCgroupMigration(pc)) 4177 goto unlock_out; 4178 break; 4179 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4180 if (!PageAnon(page)) { /* Shared memory */ 4181 if (page->mapping && !page_is_file_cache(page)) 4182 goto unlock_out; 4183 } else if (page_mapped(page)) /* Anon */ 4184 goto unlock_out; 4185 break; 4186 default: 4187 break; 4188 } 4189 4190 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4191 4192 ClearPageCgroupUsed(pc); 4193 /* 4194 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4195 * freed from LRU. This is safe because uncharged page is expected not 4196 * to be reused (freed soon). 
Exception is SwapCache, it's handled by 4197 * special functions. 4198 */ 4199 4200 unlock_page_cgroup(pc); 4201 /* 4202 * even after unlock, we have memcg->res.usage here and this memcg 4203 * will never be freed. 4204 */ 4205 memcg_check_events(memcg, page); 4206 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4207 mem_cgroup_swap_statistics(memcg, true); 4208 mem_cgroup_get(memcg); 4209 } 4210 /* 4211 * Migration does not charge the res_counter for the 4212 * replacement page, so leave it alone when phasing out the 4213 * page that is unused after the migration. 4214 */ 4215 if (!end_migration && !mem_cgroup_is_root(memcg)) 4216 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 4217 4218 return memcg; 4219 4220 unlock_out: 4221 unlock_page_cgroup(pc); 4222 return NULL; 4223 } 4224 4225 void mem_cgroup_uncharge_page(struct page *page) 4226 { 4227 /* early check. */ 4228 if (page_mapped(page)) 4229 return; 4230 VM_BUG_ON(page->mapping && !PageAnon(page)); 4231 /* 4232 * If the page is in swap cache, uncharge should be deferred 4233 * to the swap path, which also properly accounts swap usage 4234 * and handles memcg lifetime. 4235 * 4236 * Note that this check is not stable and reclaim may add the 4237 * page to swap cache at any time after this. However, if the 4238 * page is not in swap cache by the time page->mapcount hits 4239 * 0, there won't be any page table references to the swap 4240 * slot, and reclaim will free it and not actually write the 4241 * page to disk. 4242 */ 4243 if (PageSwapCache(page)) 4244 return; 4245 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); 4246 } 4247 4248 void mem_cgroup_uncharge_cache_page(struct page *page) 4249 { 4250 VM_BUG_ON(page_mapped(page)); 4251 VM_BUG_ON(page->mapping); 4252 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4253 } 4254 4255 /* 4256 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate. 4257 * In those cases, pages are freed continuously and we can expect pages 4258 * are in the same memcg. Each of these callers itself limits the number of 4259 * pages freed at once, and then uncharge_start/end() is called properly. 4260 * This may be called multiple (nested) times in a context. 4261 */ 4262 4263 void mem_cgroup_uncharge_start(void) 4264 { 4265 current->memcg_batch.do_batch++; 4266 /* We can nest. */ 4267 if (current->memcg_batch.do_batch == 1) { 4268 current->memcg_batch.memcg = NULL; 4269 current->memcg_batch.nr_pages = 0; 4270 current->memcg_batch.memsw_nr_pages = 0; 4271 } 4272 } 4273 4274 void mem_cgroup_uncharge_end(void) 4275 { 4276 struct memcg_batch_info *batch = &current->memcg_batch; 4277 4278 if (!batch->do_batch) 4279 return; 4280 4281 batch->do_batch--; 4282 if (batch->do_batch) /* If stacked, do nothing. */ 4283 return; 4284 4285 if (!batch->memcg) 4286 return; 4287 /* 4288 * This "batch->memcg" is valid without any css_get/put etc... 4289 * because we hide charges behind us. 4290 */ 4291 if (batch->nr_pages) 4292 res_counter_uncharge(&batch->memcg->res, 4293 batch->nr_pages * PAGE_SIZE); 4294 if (batch->memsw_nr_pages) 4295 res_counter_uncharge(&batch->memcg->memsw, 4296 batch->memsw_nr_pages * PAGE_SIZE); 4297 memcg_oom_recover(batch->memcg); 4298 /* forget this pointer (for sanity check) */ 4299 batch->memcg = NULL; 4300 } 4301 4302 #ifdef CONFIG_SWAP 4303 /* 4304 * called after __delete_from_swap_cache() and drop "page" account.
4305 * memcg information is recorded to swap_cgroup of "ent" 4306 */ 4307 void 4308 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 4309 { 4310 struct mem_cgroup *memcg; 4311 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 4312 4313 if (!swapout) /* this was a swap cache but the swap is unused ! */ 4314 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 4315 4316 memcg = __mem_cgroup_uncharge_common(page, ctype, false); 4317 4318 /* 4319 * record memcg information, if swapout && memcg != NULL, 4320 * mem_cgroup_get() was called in uncharge(). 4321 */ 4322 if (do_swap_account && swapout && memcg) 4323 swap_cgroup_record(ent, css_id(&memcg->css)); 4324 } 4325 #endif 4326 4327 #ifdef CONFIG_MEMCG_SWAP 4328 /* 4329 * called from swap_entry_free(). remove record in swap_cgroup and 4330 * uncharge "memsw" account. 4331 */ 4332 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4333 { 4334 struct mem_cgroup *memcg; 4335 unsigned short id; 4336 4337 if (!do_swap_account) 4338 return; 4339 4340 id = swap_cgroup_record(ent, 0); 4341 rcu_read_lock(); 4342 memcg = mem_cgroup_lookup(id); 4343 if (memcg) { 4344 /* 4345 * We uncharge this because swap is freed. 4346 * This memcg can be obsolete one. We avoid calling css_tryget 4347 */ 4348 if (!mem_cgroup_is_root(memcg)) 4349 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4350 mem_cgroup_swap_statistics(memcg, false); 4351 mem_cgroup_put(memcg); 4352 } 4353 rcu_read_unlock(); 4354 } 4355 4356 /** 4357 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4358 * @entry: swap entry to be moved 4359 * @from: mem_cgroup which the entry is moved from 4360 * @to: mem_cgroup which the entry is moved to 4361 * 4362 * It succeeds only when the swap_cgroup's record for this entry is the same 4363 * as the mem_cgroup's id of @from. 4364 * 4365 * Returns 0 on success, -EINVAL on failure. 4366 * 4367 * The caller must have charged to @to, IOW, called res_counter_charge() about 4368 * both res and memsw, and called css_get(). 4369 */ 4370 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4371 struct mem_cgroup *from, struct mem_cgroup *to) 4372 { 4373 unsigned short old_id, new_id; 4374 4375 old_id = css_id(&from->css); 4376 new_id = css_id(&to->css); 4377 4378 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4379 mem_cgroup_swap_statistics(from, false); 4380 mem_cgroup_swap_statistics(to, true); 4381 /* 4382 * This function is only called from task migration context now. 4383 * It postpones res_counter and refcount handling till the end 4384 * of task migration(mem_cgroup_clear_mc()) for performance 4385 * improvement. But we cannot postpone mem_cgroup_get(to) 4386 * because if the process that has been moved to @to does 4387 * swap-in, the refcount of @to might be decreased to 0. 4388 */ 4389 mem_cgroup_get(to); 4390 return 0; 4391 } 4392 return -EINVAL; 4393 } 4394 #else 4395 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4396 struct mem_cgroup *from, struct mem_cgroup *to) 4397 { 4398 return -EINVAL; 4399 } 4400 #endif 4401 4402 /* 4403 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4404 * page belongs to. 
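 *
 * The migration code drives the protocol, roughly:
 *
 *	mem_cgroup_prepare_migration(page, newpage, &memcg);
 *	... copy the page and switch the mapping over ...
 *	mem_cgroup_end_migration(memcg, page, newpage, migration_ok);
 *
 * prepare() commits @newpage to the old page's memcg without charging
 * the res_counter again; end() uncharges whichever of the two pages
 * ended up unused.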
4405 */ 4406 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4407 struct mem_cgroup **memcgp) 4408 { 4409 struct mem_cgroup *memcg = NULL; 4410 unsigned int nr_pages = 1; 4411 struct page_cgroup *pc; 4412 enum charge_type ctype; 4413 4414 *memcgp = NULL; 4415 4416 if (mem_cgroup_disabled()) 4417 return; 4418 4419 if (PageTransHuge(page)) 4420 nr_pages <<= compound_order(page); 4421 4422 pc = lookup_page_cgroup(page); 4423 lock_page_cgroup(pc); 4424 if (PageCgroupUsed(pc)) { 4425 memcg = pc->mem_cgroup; 4426 css_get(&memcg->css); 4427 /* 4428 * When migrating an anonymous page, its mapcount goes down 4429 * to 0 and uncharge() will be called. But, even if it's fully 4430 * unmapped, migration may fail and this page has to be 4431 * charged again. We set the MIGRATION flag here and delay the 4432 * uncharge until end_migration() is called. 4433 * 4434 * Corner Case Thinking 4435 * A) 4436 * The old page was mapped as Anon and is unmapped and freed 4437 * while migration is ongoing. 4438 * If unmap finds the old page, uncharge() of it will be delayed 4439 * until end_migration(). If unmap finds a new page, it's 4440 * uncharged when unmap takes its mapcount from 1 to 0. If unmap code 4441 * finds swap_migration_entry, the new page will not be mapped 4442 * and end_migration() will find it (mapcount==0). 4443 * 4444 * B) 4445 * When the old page was mapped but migration fails, the kernel 4446 * remaps it. A charge for it is kept by the MIGRATION flag even 4447 * if mapcount goes down to 0. We can do the remap successfully 4448 * without charging it again. 4449 * 4450 * C) 4451 * The "old" page is under lock_page() until the end of 4452 * migration, so, the old page itself will not be swapped-out. 4453 * If the new page is swapped out before end_migration, our 4454 * hook into the usual swap-out path will catch the event. 4455 */ 4456 if (PageAnon(page)) 4457 SetPageCgroupMigration(pc); 4458 } 4459 unlock_page_cgroup(pc); 4460 /* 4461 * If the page is not charged at this point, 4462 * we return here. 4463 */ 4464 if (!memcg) 4465 return; 4466 4467 *memcgp = memcg; 4468 /* 4469 * We charge the new page before it's used/mapped. So, even if unlock_page() 4470 * is called before end_migration, we can catch all events on this new 4471 * page. In the case the new page is migrated but not remapped, the new page's 4472 * mapcount will finally be 0 and we call uncharge in end_migration(). 4473 */ 4474 if (PageAnon(page)) 4475 ctype = MEM_CGROUP_CHARGE_TYPE_ANON; 4476 else 4477 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 4478 /* 4479 * The page is committed to the memcg, but it's not actually 4480 * charged to the res_counter since we plan on replacing the 4481 * old one and only one page is going to be left afterwards. 4482 */ 4483 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); 4484 } 4485 4486 /* remove redundant charge if migration failed */ 4487 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 4488 struct page *oldpage, struct page *newpage, bool migration_ok) 4489 { 4490 struct page *used, *unused; 4491 struct page_cgroup *pc; 4492 bool anon; 4493 4494 if (!memcg) 4495 return; 4496 4497 if (!migration_ok) { 4498 used = oldpage; 4499 unused = newpage; 4500 } else { 4501 used = newpage; 4502 unused = oldpage; 4503 } 4504 anon = PageAnon(used); 4505 __mem_cgroup_uncharge_common(unused, 4506 anon ?
MEM_CGROUP_CHARGE_TYPE_ANON 4507 : MEM_CGROUP_CHARGE_TYPE_CACHE, 4508 true); 4509 css_put(&memcg->css); 4510 /* 4511 * We disallowed uncharge of pages under migration because the mapcount 4512 * of the page goes down to zero, temporarily. 4513 * Clear the flag and check whether the page should still be charged. 4514 */ 4515 pc = lookup_page_cgroup(oldpage); 4516 lock_page_cgroup(pc); 4517 ClearPageCgroupMigration(pc); 4518 unlock_page_cgroup(pc); 4519 4520 /* 4521 * If a page is a file cache, radix-tree replacement is atomic 4522 * and we can skip this check. When it was an Anon page, its mapcount 4523 * goes down to 0. But because we added the MIGRATION flag, it's not 4524 * uncharged yet. There are several cases but the page->mapcount check 4525 * and USED bit check in mem_cgroup_uncharge_page() will do enough 4526 * checking. (see prepare_charge() also) 4527 */ 4528 if (anon) 4529 mem_cgroup_uncharge_page(used); 4530 } 4531 4532 /* 4533 * When replacing page cache, the newpage is not under any memcg but it's on 4534 * the LRU. So, this function doesn't touch the res_counter but handles the LRU 4535 * in the correct way. Both pages are locked so we cannot race with uncharge. 4536 */ 4537 void mem_cgroup_replace_page_cache(struct page *oldpage, 4538 struct page *newpage) 4539 { 4540 struct mem_cgroup *memcg = NULL; 4541 struct page_cgroup *pc; 4542 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4543 4544 if (mem_cgroup_disabled()) 4545 return; 4546 4547 pc = lookup_page_cgroup(oldpage); 4548 /* fix accounting on old pages */ 4549 lock_page_cgroup(pc); 4550 if (PageCgroupUsed(pc)) { 4551 memcg = pc->mem_cgroup; 4552 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4553 ClearPageCgroupUsed(pc); 4554 } 4555 unlock_page_cgroup(pc); 4556 4557 /* 4558 * When called from shmem_replace_page(), in some cases the 4559 * oldpage has already been charged, and in some cases not. 4560 */ 4561 if (!memcg) 4562 return; 4563 /* 4564 * Even if newpage->mapping was NULL before starting replacement, 4565 * the newpage may be on the LRU (or a pagevec for the LRU) already. We lock 4566 * the LRU while we overwrite pc->mem_cgroup. 4567 */ 4568 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4569 } 4570 4571 #ifdef CONFIG_DEBUG_VM 4572 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4573 { 4574 struct page_cgroup *pc; 4575 4576 pc = lookup_page_cgroup(page); 4577 /* 4578 * Can be NULL while feeding pages into the page allocator for 4579 * the first time, i.e. during boot or memory hotplug; 4580 * or when mem_cgroup_disabled(). 4581 */ 4582 if (likely(pc) && PageCgroupUsed(pc)) 4583 return pc; 4584 return NULL; 4585 } 4586 4587 bool mem_cgroup_bad_page_check(struct page *page) 4588 { 4589 if (mem_cgroup_disabled()) 4590 return false; 4591 4592 return lookup_page_cgroup_used(page) != NULL; 4593 } 4594 4595 void mem_cgroup_print_bad_page(struct page *page) 4596 { 4597 struct page_cgroup *pc; 4598 4599 pc = lookup_page_cgroup_used(page); 4600 if (pc) { 4601 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4602 pc, pc->flags, pc->mem_cgroup); 4603 } 4604 } 4605 #endif 4606 4607 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4608 unsigned long long val) 4609 { 4610 int retry_count; 4611 u64 memswlimit, memlimit; 4612 int ret = 0; 4613 int children = mem_cgroup_count_children(memcg); 4614 u64 curusage, oldusage; 4615 int enlarge; 4616 4617 /* 4618 * For keeping hierarchical_reclaim simple, how long we should retry 4619 * depends on the callers.
We set our retry-count to be function 4620 * of # of children which we should visit in this loop. 4621 */ 4622 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4623 4624 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4625 4626 enlarge = 0; 4627 while (retry_count) { 4628 if (signal_pending(current)) { 4629 ret = -EINTR; 4630 break; 4631 } 4632 /* 4633 * Rather than hide all in some function, I do this in 4634 * open coded manner. You see what this really does. 4635 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4636 */ 4637 mutex_lock(&set_limit_mutex); 4638 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4639 if (memswlimit < val) { 4640 ret = -EINVAL; 4641 mutex_unlock(&set_limit_mutex); 4642 break; 4643 } 4644 4645 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4646 if (memlimit < val) 4647 enlarge = 1; 4648 4649 ret = res_counter_set_limit(&memcg->res, val); 4650 if (!ret) { 4651 if (memswlimit == val) 4652 memcg->memsw_is_minimum = true; 4653 else 4654 memcg->memsw_is_minimum = false; 4655 } 4656 mutex_unlock(&set_limit_mutex); 4657 4658 if (!ret) 4659 break; 4660 4661 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4662 MEM_CGROUP_RECLAIM_SHRINK); 4663 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4664 /* Usage is reduced ? */ 4665 if (curusage >= oldusage) 4666 retry_count--; 4667 else 4668 oldusage = curusage; 4669 } 4670 if (!ret && enlarge) 4671 memcg_oom_recover(memcg); 4672 4673 return ret; 4674 } 4675 4676 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4677 unsigned long long val) 4678 { 4679 int retry_count; 4680 u64 memlimit, memswlimit, oldusage, curusage; 4681 int children = mem_cgroup_count_children(memcg); 4682 int ret = -EBUSY; 4683 int enlarge = 0; 4684 4685 /* see mem_cgroup_resize_res_limit */ 4686 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4687 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4688 while (retry_count) { 4689 if (signal_pending(current)) { 4690 ret = -EINTR; 4691 break; 4692 } 4693 /* 4694 * Rather than hide all in some function, I do this in 4695 * open coded manner. You see what this really does. 4696 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4697 */ 4698 mutex_lock(&set_limit_mutex); 4699 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4700 if (memlimit > val) { 4701 ret = -EINVAL; 4702 mutex_unlock(&set_limit_mutex); 4703 break; 4704 } 4705 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4706 if (memswlimit < val) 4707 enlarge = 1; 4708 ret = res_counter_set_limit(&memcg->memsw, val); 4709 if (!ret) { 4710 if (memlimit == val) 4711 memcg->memsw_is_minimum = true; 4712 else 4713 memcg->memsw_is_minimum = false; 4714 } 4715 mutex_unlock(&set_limit_mutex); 4716 4717 if (!ret) 4718 break; 4719 4720 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4721 MEM_CGROUP_RECLAIM_NOSWAP | 4722 MEM_CGROUP_RECLAIM_SHRINK); 4723 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4724 /* Usage is reduced ? 
*/ 4725 if (curusage >= oldusage) 4726 retry_count--; 4727 else 4728 oldusage = curusage; 4729 } 4730 if (!ret && enlarge) 4731 memcg_oom_recover(memcg); 4732 return ret; 4733 } 4734 4735 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4736 gfp_t gfp_mask, 4737 unsigned long *total_scanned) 4738 { 4739 unsigned long nr_reclaimed = 0; 4740 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4741 unsigned long reclaimed; 4742 int loop = 0; 4743 struct mem_cgroup_tree_per_zone *mctz; 4744 unsigned long long excess; 4745 unsigned long nr_scanned; 4746 4747 if (order > 0) 4748 return 0; 4749 4750 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4751 /* 4752 * This loop can run for a while, especially if mem_cgroups continuously 4753 * keep exceeding their soft limit and putting the system under 4754 * pressure 4755 */ 4756 do { 4757 if (next_mz) 4758 mz = next_mz; 4759 else 4760 mz = mem_cgroup_largest_soft_limit_node(mctz); 4761 if (!mz) 4762 break; 4763 4764 nr_scanned = 0; 4765 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4766 gfp_mask, &nr_scanned); 4767 nr_reclaimed += reclaimed; 4768 *total_scanned += nr_scanned; 4769 spin_lock(&mctz->lock); 4770 4771 /* 4772 * If we failed to reclaim anything from this memory cgroup 4773 * it is time to move on to the next cgroup 4774 */ 4775 next_mz = NULL; 4776 if (!reclaimed) { 4777 do { 4778 /* 4779 * Loop until we find yet another one. 4780 * 4781 * By the time we get the soft_limit lock 4782 * again, someone might have added the 4783 * group back on the RB tree. Iterate to 4784 * make sure we get a different memcg. 4785 * mem_cgroup_largest_soft_limit_node returns 4786 * NULL if no other cgroup is present on 4787 * the tree 4788 */ 4789 next_mz = 4790 __mem_cgroup_largest_soft_limit_node(mctz); 4791 if (next_mz == mz) 4792 css_put(&next_mz->memcg->css); 4793 else /* next_mz == NULL or other memcg */ 4794 break; 4795 } while (1); 4796 } 4797 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4798 excess = res_counter_soft_limit_excess(&mz->memcg->res); 4799 /* 4800 * One school of thought says that we should not add 4801 * back the node to the tree if reclaim returns 0. 4802 * But our reclaim could return 0 simply because, due 4803 * to priority, we are exposing a smaller subset of 4804 * memory to reclaim from. Consider this as a longer 4805 * term TODO. 4806 */ 4807 /* If excess == 0, no tree ops */ 4808 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4809 spin_unlock(&mctz->lock); 4810 css_put(&mz->memcg->css); 4811 loop++; 4812 /* 4813 * Could not reclaim anything and there are no more 4814 * mem cgroups to try or we seem to be looping without 4815 * reclaiming anything. 4816 */ 4817 if (!nr_reclaimed && 4818 (next_mz == NULL || 4819 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 4820 break; 4821 } while (!nr_reclaimed); 4822 if (next_mz) 4823 css_put(&next_mz->memcg->css); 4824 return nr_reclaimed; 4825 } 4826 4827 /** 4828 * mem_cgroup_force_empty_list - clears LRU of a group 4829 * @memcg: group to clear 4830 * @node: NUMA node 4831 * @zid: zone id 4832 * @lru: lru to clear 4833 * 4834 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4835 * reclaim the pages themselves - pages are moved to the parent (or root) 4836 * group.
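 *
 * The list is walked from its tail; when mem_cgroup_move_parent() fails
 * for a page (lock contention, or an obsolete page_cgroup), the page is
 * remembered in 'busy' and, if it is still at the tail on the next
 * iteration, rotated to the other end of the list so that the walk can
 * make progress instead of spinning on it.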
4837 */ 4838 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4839 int node, int zid, enum lru_list lru) 4840 { 4841 struct lruvec *lruvec; 4842 unsigned long flags; 4843 struct list_head *list; 4844 struct page *busy; 4845 struct zone *zone; 4846 4847 zone = &NODE_DATA(node)->node_zones[zid]; 4848 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4849 list = &lruvec->lists[lru]; 4850 4851 busy = NULL; 4852 do { 4853 struct page_cgroup *pc; 4854 struct page *page; 4855 4856 spin_lock_irqsave(&zone->lru_lock, flags); 4857 if (list_empty(list)) { 4858 spin_unlock_irqrestore(&zone->lru_lock, flags); 4859 break; 4860 } 4861 page = list_entry(list->prev, struct page, lru); 4862 if (busy == page) { 4863 list_move(&page->lru, list); 4864 busy = NULL; 4865 spin_unlock_irqrestore(&zone->lru_lock, flags); 4866 continue; 4867 } 4868 spin_unlock_irqrestore(&zone->lru_lock, flags); 4869 4870 pc = lookup_page_cgroup(page); 4871 4872 if (mem_cgroup_move_parent(page, pc, memcg)) { 4873 /* found lock contention or "pc" is obsolete. */ 4874 busy = page; 4875 cond_resched(); 4876 } else 4877 busy = NULL; 4878 } while (!list_empty(list)); 4879 } 4880 4881 /* 4882 * make mem_cgroup's charge to be 0 if there is no task by moving 4883 * all the charges and pages to the parent. 4884 * This enables deleting this mem_cgroup. 4885 * 4886 * Caller is responsible for holding css reference on the memcg. 4887 */ 4888 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4889 { 4890 int node, zid; 4891 u64 usage; 4892 4893 do { 4894 /* This is for making all *used* pages to be on LRU. */ 4895 lru_add_drain_all(); 4896 drain_all_stock_sync(memcg); 4897 mem_cgroup_start_move(memcg); 4898 for_each_node_state(node, N_MEMORY) { 4899 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4900 enum lru_list lru; 4901 for_each_lru(lru) { 4902 mem_cgroup_force_empty_list(memcg, 4903 node, zid, lru); 4904 } 4905 } 4906 } 4907 mem_cgroup_end_move(memcg); 4908 memcg_oom_recover(memcg); 4909 cond_resched(); 4910 4911 /* 4912 * Kernel memory may not necessarily be trackable to a specific 4913 * process. So they are not migrated, and therefore we can't 4914 * expect their value to drop to 0 here. 4915 * Having res filled up with kmem only is enough. 4916 * 4917 * This is a safety check because mem_cgroup_force_empty_list 4918 * could have raced with mem_cgroup_replace_page_cache callers 4919 * so the lru seemed empty but the page could have been added 4920 * right after the check. RES_USAGE should be safe as we always 4921 * charge before adding to the LRU. 4922 */ 4923 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4924 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4925 } while (usage > 0); 4926 } 4927 4928 /* 4929 * This mainly exists for tests during the setting of set of use_hierarchy. 4930 * Since this is the very setting we are changing, the current hierarchy value 4931 * is meaningless 4932 */ 4933 static inline bool __memcg_has_children(struct mem_cgroup *memcg) 4934 { 4935 struct cgroup *pos; 4936 4937 /* bounce at first found */ 4938 cgroup_for_each_child(pos, memcg->css.cgroup) 4939 return true; 4940 return false; 4941 } 4942 4943 /* 4944 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed 4945 * to be already dead (as in mem_cgroup_force_empty, for instance). This is 4946 * from mem_cgroup_count_children(), in the sense that we don't really care how 4947 * many children we have; we only need to know if we have any. 
It also counts 4948 * any memcg without hierarchy as infertile. 4949 */ 4950 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4951 { 4952 return memcg->use_hierarchy && __memcg_has_children(memcg); 4953 } 4954 4955 /* 4956 * Reclaims as many pages from the given memcg as possible and moves 4957 * the rest to the parent. 4958 * 4959 * Caller is responsible for holding css reference for memcg. 4960 */ 4961 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4962 { 4963 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4964 struct cgroup *cgrp = memcg->css.cgroup; 4965 4966 /* returns EBUSY if there is a task or if we come here twice. */ 4967 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 4968 return -EBUSY; 4969 4970 /* we call try-to-free pages for make this cgroup empty */ 4971 lru_add_drain_all(); 4972 /* try to free all pages in this cgroup */ 4973 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4974 int progress; 4975 4976 if (signal_pending(current)) 4977 return -EINTR; 4978 4979 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4980 false); 4981 if (!progress) { 4982 nr_retries--; 4983 /* maybe some writeback is necessary */ 4984 congestion_wait(BLK_RW_ASYNC, HZ/10); 4985 } 4986 4987 } 4988 lru_add_drain(); 4989 mem_cgroup_reparent_charges(memcg); 4990 4991 return 0; 4992 } 4993 4994 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4995 { 4996 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4997 int ret; 4998 4999 if (mem_cgroup_is_root(memcg)) 5000 return -EINVAL; 5001 css_get(&memcg->css); 5002 ret = mem_cgroup_force_empty(memcg); 5003 css_put(&memcg->css); 5004 5005 return ret; 5006 } 5007 5008 5009 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 5010 { 5011 return mem_cgroup_from_cont(cont)->use_hierarchy; 5012 } 5013 5014 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 5015 u64 val) 5016 { 5017 int retval = 0; 5018 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5019 struct cgroup *parent = cont->parent; 5020 struct mem_cgroup *parent_memcg = NULL; 5021 5022 if (parent) 5023 parent_memcg = mem_cgroup_from_cont(parent); 5024 5025 mutex_lock(&memcg_create_mutex); 5026 5027 if (memcg->use_hierarchy == val) 5028 goto out; 5029 5030 /* 5031 * If parent's use_hierarchy is set, we can't make any modifications 5032 * in the child subtrees. If it is unset, then the change can 5033 * occur, provided the current cgroup has no children. 5034 * 5035 * For the root cgroup, parent_mem is NULL, we allow value to be 5036 * set if there are no children. 5037 */ 5038 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 5039 (val == 1 || val == 0)) { 5040 if (!__memcg_has_children(memcg)) 5041 memcg->use_hierarchy = val; 5042 else 5043 retval = -EBUSY; 5044 } else 5045 retval = -EINVAL; 5046 5047 out: 5048 mutex_unlock(&memcg_create_mutex); 5049 5050 return retval; 5051 } 5052 5053 5054 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 5055 enum mem_cgroup_stat_index idx) 5056 { 5057 struct mem_cgroup *iter; 5058 long val = 0; 5059 5060 /* Per-cpu values can be negative, use a signed accumulator */ 5061 for_each_mem_cgroup_tree(iter, memcg) 5062 val += mem_cgroup_read_stat(iter, idx); 5063 5064 if (val < 0) /* race ? 
*/ 5065 val = 0; 5066 return val; 5067 } 5068 5069 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 5070 { 5071 u64 val; 5072 5073 if (!mem_cgroup_is_root(memcg)) { 5074 if (!swap) 5075 return res_counter_read_u64(&memcg->res, RES_USAGE); 5076 else 5077 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 5078 } 5079 5080 /* 5081 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 5082 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 5083 */ 5084 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 5085 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 5086 5087 if (swap) 5088 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 5089 5090 return val << PAGE_SHIFT; 5091 } 5092 5093 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, 5094 struct file *file, char __user *buf, 5095 size_t nbytes, loff_t *ppos) 5096 { 5097 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5098 char str[64]; 5099 u64 val; 5100 int name, len; 5101 enum res_type type; 5102 5103 type = MEMFILE_TYPE(cft->private); 5104 name = MEMFILE_ATTR(cft->private); 5105 5106 switch (type) { 5107 case _MEM: 5108 if (name == RES_USAGE) 5109 val = mem_cgroup_usage(memcg, false); 5110 else 5111 val = res_counter_read_u64(&memcg->res, name); 5112 break; 5113 case _MEMSWAP: 5114 if (name == RES_USAGE) 5115 val = mem_cgroup_usage(memcg, true); 5116 else 5117 val = res_counter_read_u64(&memcg->memsw, name); 5118 break; 5119 case _KMEM: 5120 val = res_counter_read_u64(&memcg->kmem, name); 5121 break; 5122 default: 5123 BUG(); 5124 } 5125 5126 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5127 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5128 } 5129 5130 static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) 5131 { 5132 int ret = -EINVAL; 5133 #ifdef CONFIG_MEMCG_KMEM 5134 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5135 /* 5136 * For simplicity, we won't allow this to be disabled. It also can't 5137 * be changed if the cgroup has children already, or if tasks had 5138 * already joined. 5139 * 5140 * If tasks join before we set the limit, a person looking at 5141 * kmem.usage_in_bytes will have no way to determine when it took 5142 * place, which makes the value quite meaningless. 5143 * 5144 * After it first became limited, changes in the value of the limit are 5145 * of course permitted. 5146 */ 5147 mutex_lock(&memcg_create_mutex); 5148 mutex_lock(&set_limit_mutex); 5149 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 5150 if (cgroup_task_count(cont) || memcg_has_children(memcg)) { 5151 ret = -EBUSY; 5152 goto out; 5153 } 5154 ret = res_counter_set_limit(&memcg->kmem, val); 5155 VM_BUG_ON(ret); 5156 5157 ret = memcg_update_cache_sizes(memcg); 5158 if (ret) { 5159 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5160 goto out; 5161 } 5162 static_key_slow_inc(&memcg_kmem_enabled_key); 5163 /* 5164 * setting the active bit after the inc will guarantee no one 5165 * starts accounting before all call sites are patched 5166 */ 5167 memcg_kmem_set_active(memcg); 5168 5169 /* 5170 * kmem charges can outlive the cgroup. In the case of slab 5171 * pages, for instance, a page contain objects from various 5172 * processes, so it is unfeasible to migrate them away. We 5173 * need to reference count the memcg because of that. 
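 *
 * The matching mem_cgroup_put() is done either in kmem_cgroup_destroy(),
 * if kmem usage is already zero by then, or later from the kmem
 * uncharge path; see the comment in kmem_cgroup_destroy() about racing
 * with memcg_uncharge_kmem().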
5174 */ 5175 mem_cgroup_get(memcg); 5176 } else 5177 ret = res_counter_set_limit(&memcg->kmem, val); 5178 out: 5179 mutex_unlock(&set_limit_mutex); 5180 mutex_unlock(&memcg_create_mutex); 5181 #endif 5182 return ret; 5183 } 5184 5185 #ifdef CONFIG_MEMCG_KMEM 5186 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5187 { 5188 int ret = 0; 5189 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5190 if (!parent) 5191 goto out; 5192 5193 memcg->kmem_account_flags = parent->kmem_account_flags; 5194 /* 5195 * When that happen, we need to disable the static branch only on those 5196 * memcgs that enabled it. To achieve this, we would be forced to 5197 * complicate the code by keeping track of which memcgs were the ones 5198 * that actually enabled limits, and which ones got it from its 5199 * parents. 5200 * 5201 * It is a lot simpler just to do static_key_slow_inc() on every child 5202 * that is accounted. 5203 */ 5204 if (!memcg_kmem_is_active(memcg)) 5205 goto out; 5206 5207 /* 5208 * destroy(), called if we fail, will issue static_key_slow_inc() and 5209 * mem_cgroup_put() if kmem is enabled. We have to either call them 5210 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find 5211 * this more consistent, since it always leads to the same destroy path 5212 */ 5213 mem_cgroup_get(memcg); 5214 static_key_slow_inc(&memcg_kmem_enabled_key); 5215 5216 mutex_lock(&set_limit_mutex); 5217 ret = memcg_update_cache_sizes(memcg); 5218 mutex_unlock(&set_limit_mutex); 5219 out: 5220 return ret; 5221 } 5222 #endif /* CONFIG_MEMCG_KMEM */ 5223 5224 /* 5225 * The user of this function is... 5226 * RES_LIMIT. 5227 */ 5228 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 5229 const char *buffer) 5230 { 5231 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5232 enum res_type type; 5233 int name; 5234 unsigned long long val; 5235 int ret; 5236 5237 type = MEMFILE_TYPE(cft->private); 5238 name = MEMFILE_ATTR(cft->private); 5239 5240 switch (name) { 5241 case RES_LIMIT: 5242 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5243 ret = -EINVAL; 5244 break; 5245 } 5246 /* This function does all necessary parse...reuse it */ 5247 ret = res_counter_memparse_write_strategy(buffer, &val); 5248 if (ret) 5249 break; 5250 if (type == _MEM) 5251 ret = mem_cgroup_resize_limit(memcg, val); 5252 else if (type == _MEMSWAP) 5253 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5254 else if (type == _KMEM) 5255 ret = memcg_update_kmem_limit(cont, val); 5256 else 5257 return -EINVAL; 5258 break; 5259 case RES_SOFT_LIMIT: 5260 ret = res_counter_memparse_write_strategy(buffer, &val); 5261 if (ret) 5262 break; 5263 /* 5264 * For memsw, soft limits are hard to implement in terms 5265 * of semantics, for now, we support soft limits for 5266 * control without swap 5267 */ 5268 if (type == _MEM) 5269 ret = res_counter_set_soft_limit(&memcg->res, val); 5270 else 5271 ret = -EINVAL; 5272 break; 5273 default: 5274 ret = -EINVAL; /* should be BUG() ? 
*/ 5275 break; 5276 } 5277 return ret; 5278 } 5279 5280 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5281 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5282 { 5283 struct cgroup *cgroup; 5284 unsigned long long min_limit, min_memsw_limit, tmp; 5285 5286 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5287 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5288 cgroup = memcg->css.cgroup; 5289 if (!memcg->use_hierarchy) 5290 goto out; 5291 5292 while (cgroup->parent) { 5293 cgroup = cgroup->parent; 5294 memcg = mem_cgroup_from_cont(cgroup); 5295 if (!memcg->use_hierarchy) 5296 break; 5297 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5298 min_limit = min(min_limit, tmp); 5299 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5300 min_memsw_limit = min(min_memsw_limit, tmp); 5301 } 5302 out: 5303 *mem_limit = min_limit; 5304 *memsw_limit = min_memsw_limit; 5305 } 5306 5307 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5308 { 5309 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5310 int name; 5311 enum res_type type; 5312 5313 type = MEMFILE_TYPE(event); 5314 name = MEMFILE_ATTR(event); 5315 5316 switch (name) { 5317 case RES_MAX_USAGE: 5318 if (type == _MEM) 5319 res_counter_reset_max(&memcg->res); 5320 else if (type == _MEMSWAP) 5321 res_counter_reset_max(&memcg->memsw); 5322 else if (type == _KMEM) 5323 res_counter_reset_max(&memcg->kmem); 5324 else 5325 return -EINVAL; 5326 break; 5327 case RES_FAILCNT: 5328 if (type == _MEM) 5329 res_counter_reset_failcnt(&memcg->res); 5330 else if (type == _MEMSWAP) 5331 res_counter_reset_failcnt(&memcg->memsw); 5332 else if (type == _KMEM) 5333 res_counter_reset_failcnt(&memcg->kmem); 5334 else 5335 return -EINVAL; 5336 break; 5337 } 5338 5339 return 0; 5340 } 5341 5342 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 5343 struct cftype *cft) 5344 { 5345 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 5346 } 5347 5348 #ifdef CONFIG_MMU 5349 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5350 struct cftype *cft, u64 val) 5351 { 5352 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5353 5354 if (val >= (1 << NR_MOVE_TYPE)) 5355 return -EINVAL; 5356 5357 /* 5358 * No kind of locking is needed in here, because ->can_attach() will 5359 * check this value once in the beginning of the process, and then carry 5360 * on with stale data. This means that changes to this value will only 5361 * affect task migrations starting after the change. 
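 *
 * The value written is a bitmask: 0 disables charge moving altogether,
 * each bit below NR_MOVE_TYPE enables one type of charge to be moved
 * along with the task, and anything else is rejected with -EINVAL.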
5362 */ 5363 memcg->move_charge_at_immigrate = val; 5364 return 0; 5365 } 5366 #else 5367 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5368 struct cftype *cft, u64 val) 5369 { 5370 return -ENOSYS; 5371 } 5372 #endif 5373 5374 #ifdef CONFIG_NUMA 5375 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, 5376 struct seq_file *m) 5377 { 5378 int nid; 5379 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5380 unsigned long node_nr; 5381 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5382 5383 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5384 seq_printf(m, "total=%lu", total_nr); 5385 for_each_node_state(nid, N_MEMORY) { 5386 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5387 seq_printf(m, " N%d=%lu", nid, node_nr); 5388 } 5389 seq_putc(m, '\n'); 5390 5391 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5392 seq_printf(m, "file=%lu", file_nr); 5393 for_each_node_state(nid, N_MEMORY) { 5394 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5395 LRU_ALL_FILE); 5396 seq_printf(m, " N%d=%lu", nid, node_nr); 5397 } 5398 seq_putc(m, '\n'); 5399 5400 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5401 seq_printf(m, "anon=%lu", anon_nr); 5402 for_each_node_state(nid, N_MEMORY) { 5403 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5404 LRU_ALL_ANON); 5405 seq_printf(m, " N%d=%lu", nid, node_nr); 5406 } 5407 seq_putc(m, '\n'); 5408 5409 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); 5410 seq_printf(m, "unevictable=%lu", unevictable_nr); 5411 for_each_node_state(nid, N_MEMORY) { 5412 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5413 BIT(LRU_UNEVICTABLE)); 5414 seq_printf(m, " N%d=%lu", nid, node_nr); 5415 } 5416 seq_putc(m, '\n'); 5417 return 0; 5418 } 5419 #endif /* CONFIG_NUMA */ 5420 5421 static inline void mem_cgroup_lru_names_not_uptodate(void) 5422 { 5423 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5424 } 5425 5426 static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, 5427 struct seq_file *m) 5428 { 5429 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5430 struct mem_cgroup *mi; 5431 unsigned int i; 5432 5433 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5434 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5435 continue; 5436 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5437 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5438 } 5439 5440 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5441 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5442 mem_cgroup_read_events(memcg, i)); 5443 5444 for (i = 0; i < NR_LRU_LISTS; i++) 5445 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5446 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5447 5448 /* Hierarchical information */ 5449 { 5450 unsigned long long limit, memsw_limit; 5451 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5452 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5453 if (do_swap_account) 5454 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5455 memsw_limit); 5456 } 5457 5458 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5459 long long val = 0; 5460 5461 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5462 continue; 5463 for_each_mem_cgroup_tree(mi, memcg) 5464 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5465 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5466 } 5467 5468 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5469 unsigned long long val = 0; 5470 5471 for_each_mem_cgroup_tree(mi, memcg) 5472 val += 
mem_cgroup_read_events(mi, i); 5473 seq_printf(m, "total_%s %llu\n", 5474 mem_cgroup_events_names[i], val); 5475 } 5476 5477 for (i = 0; i < NR_LRU_LISTS; i++) { 5478 unsigned long long val = 0; 5479 5480 for_each_mem_cgroup_tree(mi, memcg) 5481 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5482 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5483 } 5484 5485 #ifdef CONFIG_DEBUG_VM 5486 { 5487 int nid, zid; 5488 struct mem_cgroup_per_zone *mz; 5489 struct zone_reclaim_stat *rstat; 5490 unsigned long recent_rotated[2] = {0, 0}; 5491 unsigned long recent_scanned[2] = {0, 0}; 5492 5493 for_each_online_node(nid) 5494 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5495 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5496 rstat = &mz->lruvec.reclaim_stat; 5497 5498 recent_rotated[0] += rstat->recent_rotated[0]; 5499 recent_rotated[1] += rstat->recent_rotated[1]; 5500 recent_scanned[0] += rstat->recent_scanned[0]; 5501 recent_scanned[1] += rstat->recent_scanned[1]; 5502 } 5503 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5504 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5505 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5506 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5507 } 5508 #endif 5509 5510 return 0; 5511 } 5512 5513 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 5514 { 5515 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5516 5517 return mem_cgroup_swappiness(memcg); 5518 } 5519 5520 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 5521 u64 val) 5522 { 5523 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5524 struct mem_cgroup *parent; 5525 5526 if (val > 100) 5527 return -EINVAL; 5528 5529 if (cgrp->parent == NULL) 5530 return -EINVAL; 5531 5532 parent = mem_cgroup_from_cont(cgrp->parent); 5533 5534 mutex_lock(&memcg_create_mutex); 5535 5536 /* If under hierarchy, only empty-root can set this value */ 5537 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5538 mutex_unlock(&memcg_create_mutex); 5539 return -EINVAL; 5540 } 5541 5542 memcg->swappiness = val; 5543 5544 mutex_unlock(&memcg_create_mutex); 5545 5546 return 0; 5547 } 5548 5549 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5550 { 5551 struct mem_cgroup_threshold_ary *t; 5552 u64 usage; 5553 int i; 5554 5555 rcu_read_lock(); 5556 if (!swap) 5557 t = rcu_dereference(memcg->thresholds.primary); 5558 else 5559 t = rcu_dereference(memcg->memsw_thresholds.primary); 5560 5561 if (!t) 5562 goto unlock; 5563 5564 usage = mem_cgroup_usage(memcg, swap); 5565 5566 /* 5567 * current_threshold points to threshold just below or equal to usage. 5568 * If it's not true, a threshold was crossed after last 5569 * call of __mem_cgroup_threshold(). 5570 */ 5571 i = t->current_threshold; 5572 5573 /* 5574 * Iterate backward over array of thresholds starting from 5575 * current_threshold and check if a threshold is crossed. 5576 * If none of thresholds below usage is crossed, we read 5577 * only one element of the array here. 5578 */ 5579 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5580 eventfd_signal(t->entries[i].eventfd, 1); 5581 5582 /* i = current_threshold + 1 */ 5583 i++; 5584 5585 /* 5586 * Iterate forward over array of thresholds starting from 5587 * current_threshold+1 and check if a threshold is crossed. 5588 * If none of thresholds above usage is crossed, we read 5589 * only one element of the array here. 
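 *
 * For example, with thresholds {4M, 8M, 16M} and current_threshold
 * pointing at 8M: if usage has grown to 20M, the backward walk above
 * does nothing, the walk below signals the 16M eventfd, and
 * current_threshold ends up pointing at the 16M entry.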
5590 */ 5591 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5592 eventfd_signal(t->entries[i].eventfd, 1); 5593 5594 /* Update current_threshold */ 5595 t->current_threshold = i - 1; 5596 unlock: 5597 rcu_read_unlock(); 5598 } 5599 5600 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5601 { 5602 while (memcg) { 5603 __mem_cgroup_threshold(memcg, false); 5604 if (do_swap_account) 5605 __mem_cgroup_threshold(memcg, true); 5606 5607 memcg = parent_mem_cgroup(memcg); 5608 } 5609 } 5610 5611 static int compare_thresholds(const void *a, const void *b) 5612 { 5613 const struct mem_cgroup_threshold *_a = a; 5614 const struct mem_cgroup_threshold *_b = b; 5615 5616 return _a->threshold - _b->threshold; 5617 } 5618 5619 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5620 { 5621 struct mem_cgroup_eventfd_list *ev; 5622 5623 list_for_each_entry(ev, &memcg->oom_notify, list) 5624 eventfd_signal(ev->eventfd, 1); 5625 return 0; 5626 } 5627 5628 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5629 { 5630 struct mem_cgroup *iter; 5631 5632 for_each_mem_cgroup_tree(iter, memcg) 5633 mem_cgroup_oom_notify_cb(iter); 5634 } 5635 5636 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 5637 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5638 { 5639 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5640 struct mem_cgroup_thresholds *thresholds; 5641 struct mem_cgroup_threshold_ary *new; 5642 enum res_type type = MEMFILE_TYPE(cft->private); 5643 u64 threshold, usage; 5644 int i, size, ret; 5645 5646 ret = res_counter_memparse_write_strategy(args, &threshold); 5647 if (ret) 5648 return ret; 5649 5650 mutex_lock(&memcg->thresholds_lock); 5651 5652 if (type == _MEM) 5653 thresholds = &memcg->thresholds; 5654 else if (type == _MEMSWAP) 5655 thresholds = &memcg->memsw_thresholds; 5656 else 5657 BUG(); 5658 5659 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5660 5661 /* Check if a threshold crossed before adding a new one */ 5662 if (thresholds->primary) 5663 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5664 5665 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5666 5667 /* Allocate memory for new array of thresholds */ 5668 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5669 GFP_KERNEL); 5670 if (!new) { 5671 ret = -ENOMEM; 5672 goto unlock; 5673 } 5674 new->size = size; 5675 5676 /* Copy thresholds (if any) to new array */ 5677 if (thresholds->primary) { 5678 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5679 sizeof(struct mem_cgroup_threshold)); 5680 } 5681 5682 /* Add new threshold */ 5683 new->entries[size - 1].eventfd = eventfd; 5684 new->entries[size - 1].threshold = threshold; 5685 5686 /* Sort thresholds. Registering of new threshold isn't time-critical */ 5687 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5688 compare_thresholds, NULL); 5689 5690 /* Find current threshold */ 5691 new->current_threshold = -1; 5692 for (i = 0; i < size; i++) { 5693 if (new->entries[i].threshold <= usage) { 5694 /* 5695 * new->current_threshold will not be used until 5696 * rcu_assign_pointer(), so it's safe to increment 5697 * it here. 
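 *
 * (Readers in __mem_cgroup_threshold() only ever see either the old
 * primary array or the fully initialized new one: the switch-over
 * below is rcu_assign_pointer() followed by synchronize_rcu(), and
 * the old primary array is kept around as ->spare for the next
 * update.)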
5698 */ 5699 ++new->current_threshold; 5700 } else 5701 break; 5702 } 5703 5704 /* Free old spare buffer and save old primary buffer as spare */ 5705 kfree(thresholds->spare); 5706 thresholds->spare = thresholds->primary; 5707 5708 rcu_assign_pointer(thresholds->primary, new); 5709 5710 /* To be sure that nobody uses thresholds */ 5711 synchronize_rcu(); 5712 5713 unlock: 5714 mutex_unlock(&memcg->thresholds_lock); 5715 5716 return ret; 5717 } 5718 5719 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 5720 struct cftype *cft, struct eventfd_ctx *eventfd) 5721 { 5722 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5723 struct mem_cgroup_thresholds *thresholds; 5724 struct mem_cgroup_threshold_ary *new; 5725 enum res_type type = MEMFILE_TYPE(cft->private); 5726 u64 usage; 5727 int i, j, size; 5728 5729 mutex_lock(&memcg->thresholds_lock); 5730 if (type == _MEM) 5731 thresholds = &memcg->thresholds; 5732 else if (type == _MEMSWAP) 5733 thresholds = &memcg->memsw_thresholds; 5734 else 5735 BUG(); 5736 5737 if (!thresholds->primary) 5738 goto unlock; 5739 5740 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5741 5742 /* Check if a threshold crossed before removing */ 5743 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5744 5745 /* Calculate new number of threshold */ 5746 size = 0; 5747 for (i = 0; i < thresholds->primary->size; i++) { 5748 if (thresholds->primary->entries[i].eventfd != eventfd) 5749 size++; 5750 } 5751 5752 new = thresholds->spare; 5753 5754 /* Set thresholds array to NULL if we don't have thresholds */ 5755 if (!size) { 5756 kfree(new); 5757 new = NULL; 5758 goto swap_buffers; 5759 } 5760 5761 new->size = size; 5762 5763 /* Copy thresholds and find current threshold */ 5764 new->current_threshold = -1; 5765 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5766 if (thresholds->primary->entries[i].eventfd == eventfd) 5767 continue; 5768 5769 new->entries[j] = thresholds->primary->entries[i]; 5770 if (new->entries[j].threshold <= usage) { 5771 /* 5772 * new->current_threshold will not be used 5773 * until rcu_assign_pointer(), so it's safe to increment 5774 * it here. 5775 */ 5776 ++new->current_threshold; 5777 } 5778 j++; 5779 } 5780 5781 swap_buffers: 5782 /* Swap primary and spare array */ 5783 thresholds->spare = thresholds->primary; 5784 /* If all events are unregistered, free the spare array */ 5785 if (!new) { 5786 kfree(thresholds->spare); 5787 thresholds->spare = NULL; 5788 } 5789 5790 rcu_assign_pointer(thresholds->primary, new); 5791 5792 /* To be sure that nobody uses thresholds */ 5793 synchronize_rcu(); 5794 unlock: 5795 mutex_unlock(&memcg->thresholds_lock); 5796 } 5797 5798 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 5799 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5800 { 5801 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5802 struct mem_cgroup_eventfd_list *event; 5803 enum res_type type = MEMFILE_TYPE(cft->private); 5804 5805 BUG_ON(type != _OOM_TYPE); 5806 event = kmalloc(sizeof(*event), GFP_KERNEL); 5807 if (!event) 5808 return -ENOMEM; 5809 5810 spin_lock(&memcg_oom_lock); 5811 5812 event->eventfd = eventfd; 5813 list_add(&event->list, &memcg->oom_notify); 5814 5815 /* already in OOM ? 
*/ 5816 if (atomic_read(&memcg->under_oom)) 5817 eventfd_signal(eventfd, 1); 5818 spin_unlock(&memcg_oom_lock); 5819 5820 return 0; 5821 } 5822 5823 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 5824 struct cftype *cft, struct eventfd_ctx *eventfd) 5825 { 5826 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5827 struct mem_cgroup_eventfd_list *ev, *tmp; 5828 enum res_type type = MEMFILE_TYPE(cft->private); 5829 5830 BUG_ON(type != _OOM_TYPE); 5831 5832 spin_lock(&memcg_oom_lock); 5833 5834 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 5835 if (ev->eventfd == eventfd) { 5836 list_del(&ev->list); 5837 kfree(ev); 5838 } 5839 } 5840 5841 spin_unlock(&memcg_oom_lock); 5842 } 5843 5844 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 5845 struct cftype *cft, struct cgroup_map_cb *cb) 5846 { 5847 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5848 5849 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5850 5851 if (atomic_read(&memcg->under_oom)) 5852 cb->fill(cb, "under_oom", 1); 5853 else 5854 cb->fill(cb, "under_oom", 0); 5855 return 0; 5856 } 5857 5858 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 5859 struct cftype *cft, u64 val) 5860 { 5861 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5862 struct mem_cgroup *parent; 5863 5864 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5865 if (!cgrp->parent || !((val == 0) || (val == 1))) 5866 return -EINVAL; 5867 5868 parent = mem_cgroup_from_cont(cgrp->parent); 5869 5870 mutex_lock(&memcg_create_mutex); 5871 /* oom-kill-disable is a flag for subhierarchy. */ 5872 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5873 mutex_unlock(&memcg_create_mutex); 5874 return -EINVAL; 5875 } 5876 memcg->oom_kill_disable = val; 5877 if (!val) 5878 memcg_oom_recover(memcg); 5879 mutex_unlock(&memcg_create_mutex); 5880 return 0; 5881 } 5882 5883 #ifdef CONFIG_MEMCG_KMEM 5884 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5885 { 5886 int ret; 5887 5888 memcg->kmemcg_id = -1; 5889 ret = memcg_propagate_kmem(memcg); 5890 if (ret) 5891 return ret; 5892 5893 return mem_cgroup_sockets_init(memcg, ss); 5894 } 5895 5896 static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5897 { 5898 mem_cgroup_sockets_destroy(memcg); 5899 5900 memcg_kmem_mark_dead(memcg); 5901 5902 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5903 return; 5904 5905 /* 5906 * Charges already down to 0, undo mem_cgroup_get() done in the charge 5907 * path here, being careful not to race with memcg_uncharge_kmem: it is 5908 * possible that the charges went down to 0 between mark_dead and the 5909 * res_counter read, so in that case, we don't need the put 5910 */ 5911 if (memcg_kmem_test_and_clear_dead(memcg)) 5912 mem_cgroup_put(memcg); 5913 } 5914 #else 5915 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5916 { 5917 return 0; 5918 } 5919 5920 static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5921 { 5922 } 5923 #endif 5924 5925 static struct cftype mem_cgroup_files[] = { 5926 { 5927 .name = "usage_in_bytes", 5928 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5929 .read = mem_cgroup_read, 5930 .register_event = mem_cgroup_usage_register_event, 5931 .unregister_event = mem_cgroup_usage_unregister_event, 5932 }, 5933 { 5934 .name = "max_usage_in_bytes", 5935 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5936 .trigger = mem_cgroup_reset, 5937 .read = mem_cgroup_read, 5938 }, 5939 { 5940 .name = 
"limit_in_bytes", 5941 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5942 .write_string = mem_cgroup_write, 5943 .read = mem_cgroup_read, 5944 }, 5945 { 5946 .name = "soft_limit_in_bytes", 5947 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5948 .write_string = mem_cgroup_write, 5949 .read = mem_cgroup_read, 5950 }, 5951 { 5952 .name = "failcnt", 5953 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5954 .trigger = mem_cgroup_reset, 5955 .read = mem_cgroup_read, 5956 }, 5957 { 5958 .name = "stat", 5959 .read_seq_string = memcg_stat_show, 5960 }, 5961 { 5962 .name = "force_empty", 5963 .trigger = mem_cgroup_force_empty_write, 5964 }, 5965 { 5966 .name = "use_hierarchy", 5967 .flags = CFTYPE_INSANE, 5968 .write_u64 = mem_cgroup_hierarchy_write, 5969 .read_u64 = mem_cgroup_hierarchy_read, 5970 }, 5971 { 5972 .name = "swappiness", 5973 .read_u64 = mem_cgroup_swappiness_read, 5974 .write_u64 = mem_cgroup_swappiness_write, 5975 }, 5976 { 5977 .name = "move_charge_at_immigrate", 5978 .read_u64 = mem_cgroup_move_charge_read, 5979 .write_u64 = mem_cgroup_move_charge_write, 5980 }, 5981 { 5982 .name = "oom_control", 5983 .read_map = mem_cgroup_oom_control_read, 5984 .write_u64 = mem_cgroup_oom_control_write, 5985 .register_event = mem_cgroup_oom_register_event, 5986 .unregister_event = mem_cgroup_oom_unregister_event, 5987 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5988 }, 5989 { 5990 .name = "pressure_level", 5991 .register_event = vmpressure_register_event, 5992 .unregister_event = vmpressure_unregister_event, 5993 }, 5994 #ifdef CONFIG_NUMA 5995 { 5996 .name = "numa_stat", 5997 .read_seq_string = memcg_numa_stat_show, 5998 }, 5999 #endif 6000 #ifdef CONFIG_MEMCG_KMEM 6001 { 6002 .name = "kmem.limit_in_bytes", 6003 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6004 .write_string = mem_cgroup_write, 6005 .read = mem_cgroup_read, 6006 }, 6007 { 6008 .name = "kmem.usage_in_bytes", 6009 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6010 .read = mem_cgroup_read, 6011 }, 6012 { 6013 .name = "kmem.failcnt", 6014 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6015 .trigger = mem_cgroup_reset, 6016 .read = mem_cgroup_read, 6017 }, 6018 { 6019 .name = "kmem.max_usage_in_bytes", 6020 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6021 .trigger = mem_cgroup_reset, 6022 .read = mem_cgroup_read, 6023 }, 6024 #ifdef CONFIG_SLABINFO 6025 { 6026 .name = "kmem.slabinfo", 6027 .read_seq_string = mem_cgroup_slabinfo_read, 6028 }, 6029 #endif 6030 #endif 6031 { }, /* terminate */ 6032 }; 6033 6034 #ifdef CONFIG_MEMCG_SWAP 6035 static struct cftype memsw_cgroup_files[] = { 6036 { 6037 .name = "memsw.usage_in_bytes", 6038 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6039 .read = mem_cgroup_read, 6040 .register_event = mem_cgroup_usage_register_event, 6041 .unregister_event = mem_cgroup_usage_unregister_event, 6042 }, 6043 { 6044 .name = "memsw.max_usage_in_bytes", 6045 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6046 .trigger = mem_cgroup_reset, 6047 .read = mem_cgroup_read, 6048 }, 6049 { 6050 .name = "memsw.limit_in_bytes", 6051 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6052 .write_string = mem_cgroup_write, 6053 .read = mem_cgroup_read, 6054 }, 6055 { 6056 .name = "memsw.failcnt", 6057 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6058 .trigger = mem_cgroup_reset, 6059 .read = mem_cgroup_read, 6060 }, 6061 { }, /* terminate */ 6062 }; 6063 #endif 6064 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6065 { 6066 struct mem_cgroup_per_node *pn; 6067 
struct mem_cgroup_per_zone *mz; 6068 int zone, tmp = node; 6069 /* 6070 * This routine is called against possible nodes. 6071 * But it's BUG to call kmalloc() against offline node. 6072 * 6073 * TODO: this routine can waste much memory for nodes which will 6074 * never be onlined. It's better to use memory hotplug callback 6075 * function. 6076 */ 6077 if (!node_state(node, N_NORMAL_MEMORY)) 6078 tmp = -1; 6079 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6080 if (!pn) 6081 return 1; 6082 6083 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6084 mz = &pn->zoneinfo[zone]; 6085 lruvec_init(&mz->lruvec); 6086 mz->usage_in_excess = 0; 6087 mz->on_tree = false; 6088 mz->memcg = memcg; 6089 } 6090 memcg->info.nodeinfo[node] = pn; 6091 return 0; 6092 } 6093 6094 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6095 { 6096 kfree(memcg->info.nodeinfo[node]); 6097 } 6098 6099 static struct mem_cgroup *mem_cgroup_alloc(void) 6100 { 6101 struct mem_cgroup *memcg; 6102 size_t size = memcg_size(); 6103 6104 /* Can be very big if nr_node_ids is very big */ 6105 if (size < PAGE_SIZE) 6106 memcg = kzalloc(size, GFP_KERNEL); 6107 else 6108 memcg = vzalloc(size); 6109 6110 if (!memcg) 6111 return NULL; 6112 6113 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6114 if (!memcg->stat) 6115 goto out_free; 6116 spin_lock_init(&memcg->pcp_counter_lock); 6117 return memcg; 6118 6119 out_free: 6120 if (size < PAGE_SIZE) 6121 kfree(memcg); 6122 else 6123 vfree(memcg); 6124 return NULL; 6125 } 6126 6127 /* 6128 * At destroying mem_cgroup, references from swap_cgroup can remain. 6129 * (scanning all at force_empty is too costly...) 6130 * 6131 * Instead of clearing all references at force_empty, we remember 6132 * the number of reference from swap_cgroup and free mem_cgroup when 6133 * it goes down to 0. 6134 * 6135 * Removal of cgroup itself succeeds regardless of refs from swap. 6136 */ 6137 6138 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6139 { 6140 int node; 6141 size_t size = memcg_size(); 6142 6143 mem_cgroup_remove_from_trees(memcg); 6144 free_css_id(&mem_cgroup_subsys, &memcg->css); 6145 6146 for_each_node(node) 6147 free_mem_cgroup_per_zone_info(memcg, node); 6148 6149 free_percpu(memcg->stat); 6150 6151 /* 6152 * We need to make sure that (at least for now), the jump label 6153 * destruction code runs outside of the cgroup lock. This is because 6154 * get_online_cpus(), which is called from the static_branch update, 6155 * can't be called inside the cgroup_lock. cpusets are the ones 6156 * enforcing this dependency, so if they ever change, we might as well. 6157 * 6158 * schedule_work() will guarantee this happens. Be careful if you need 6159 * to move this code around, and make sure it is outside 6160 * the cgroup_lock. 6161 */ 6162 disarm_static_keys(memcg); 6163 if (size < PAGE_SIZE) 6164 kfree(memcg); 6165 else 6166 vfree(memcg); 6167 } 6168 6169 6170 /* 6171 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, 6172 * but in process context. The work_freeing structure is overlaid 6173 * on the rcu_freeing structure, which itself is overlaid on memsw. 
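 *
 * The chain is __mem_cgroup_put() -> call_rcu(free_rcu) -> free_rcu()
 * queueing free_work() with schedule_work(); free_work() then runs
 * __mem_cgroup_free() from the workqueue, i.e. in process context rather
 * than from the RCU (softirq) callback.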
6174 */ 6175 static void free_work(struct work_struct *work) 6176 { 6177 struct mem_cgroup *memcg; 6178 6179 memcg = container_of(work, struct mem_cgroup, work_freeing); 6180 __mem_cgroup_free(memcg); 6181 } 6182 6183 static void free_rcu(struct rcu_head *rcu_head) 6184 { 6185 struct mem_cgroup *memcg; 6186 6187 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 6188 INIT_WORK(&memcg->work_freeing, free_work); 6189 schedule_work(&memcg->work_freeing); 6190 } 6191 6192 static void mem_cgroup_get(struct mem_cgroup *memcg) 6193 { 6194 atomic_inc(&memcg->refcnt); 6195 } 6196 6197 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) 6198 { 6199 if (atomic_sub_and_test(count, &memcg->refcnt)) { 6200 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 6201 call_rcu(&memcg->rcu_freeing, free_rcu); 6202 if (parent) 6203 mem_cgroup_put(parent); 6204 } 6205 } 6206 6207 static void mem_cgroup_put(struct mem_cgroup *memcg) 6208 { 6209 __mem_cgroup_put(memcg, 1); 6210 } 6211 6212 /* 6213 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 6214 */ 6215 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 6216 { 6217 if (!memcg->res.parent) 6218 return NULL; 6219 return mem_cgroup_from_res_counter(memcg->res.parent, res); 6220 } 6221 EXPORT_SYMBOL(parent_mem_cgroup); 6222 6223 static void __init mem_cgroup_soft_limit_tree_init(void) 6224 { 6225 struct mem_cgroup_tree_per_node *rtpn; 6226 struct mem_cgroup_tree_per_zone *rtpz; 6227 int tmp, node, zone; 6228 6229 for_each_node(node) { 6230 tmp = node; 6231 if (!node_state(node, N_NORMAL_MEMORY)) 6232 tmp = -1; 6233 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6234 BUG_ON(!rtpn); 6235 6236 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6237 6238 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6239 rtpz = &rtpn->rb_tree_per_zone[zone]; 6240 rtpz->rb_root = RB_ROOT; 6241 spin_lock_init(&rtpz->lock); 6242 } 6243 } 6244 } 6245 6246 static struct cgroup_subsys_state * __ref 6247 mem_cgroup_css_alloc(struct cgroup *cont) 6248 { 6249 struct mem_cgroup *memcg; 6250 long error = -ENOMEM; 6251 int node; 6252 6253 memcg = mem_cgroup_alloc(); 6254 if (!memcg) 6255 return ERR_PTR(error); 6256 6257 for_each_node(node) 6258 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 6259 goto free_out; 6260 6261 /* root ? 
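	 * Only the root memcg gets its res_counters initialized here, with no
	 * parent; children are wired up to their parent's counters later in
	 * mem_cgroup_css_online(), once use_hierarchy is known.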
	 */
	if (cont->parent == NULL) {
		root_mem_cgroup = memcg;
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
		res_counter_init(&memcg->kmem, NULL);
	}

	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);
	atomic_set(&memcg->refcnt, 1);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);
	vmpressure_init(&memcg->vmpressure);

	return &memcg->css;

free_out:
	__mem_cgroup_free(memcg);
	return ERR_PTR(error);
}

static int
mem_cgroup_css_online(struct cgroup *cont)
{
	struct mem_cgroup *memcg, *parent;
	int error = 0;

	if (!cont->parent)
		return 0;

	mutex_lock(&memcg_create_mutex);
	memcg = mem_cgroup_from_cont(cont);
	parent = mem_cgroup_from_cont(cont->parent);

	memcg->use_hierarchy = parent->use_hierarchy;
	memcg->oom_kill_disable = parent->oom_kill_disable;
	memcg->swappiness = mem_cgroup_swappiness(parent);

	if (parent->use_hierarchy) {
		res_counter_init(&memcg->res, &parent->res);
		res_counter_init(&memcg->memsw, &parent->memsw);
		res_counter_init(&memcg->kmem, &parent->kmem);

		/*
		 * We increment the refcount of the parent to ensure that we
		 * can safely access it on res_counter_charge/uncharge.
		 * This refcount will be decremented when freeing this
		 * mem_cgroup (see mem_cgroup_put()).
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
		res_counter_init(&memcg->kmem, NULL);
		/*
		 * A deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			mem_cgroup_subsys.broken_hierarchy = true;
	}

	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
	mutex_unlock(&memcg_create_mutex);
	if (error) {
		/*
		 * We call put now because our (and the parent's) refcounts
		 * are already in place. mem_cgroup_put() will internally
		 * call __mem_cgroup_free(), so we can return directly.
		 */
		mem_cgroup_put(memcg);
		if (parent->use_hierarchy)
			mem_cgroup_put(parent);
	}
	return error;
}

/*
 * Notify all parents that a group from their hierarchy is gone.
 */
static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = memcg;

	while ((parent = parent_mem_cgroup(parent)))
		mem_cgroup_iter_invalidate(parent);

	/*
	 * If the root memcg is not hierarchical we have to check it
	 * explicitly.
	 */
	if (!root_mem_cgroup->use_hierarchy)
		mem_cgroup_iter_invalidate(root_mem_cgroup);
}

static void mem_cgroup_css_offline(struct cgroup *cont)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	mem_cgroup_invalidate_reclaim_iterators(memcg);
	mem_cgroup_reparent_charges(memcg);
	mem_cgroup_destroy_all_caches(memcg);
}

static void mem_cgroup_css_free(struct cgroup *cont)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	kmem_cgroup_destroy(memcg);

	mem_cgroup_put(memcg);
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration.
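 *
 * Rough flow: mem_cgroup_can_attach() records mc.from/mc.to and precharges
 * mc.to for every movable page found by walking the task's page tables;
 * the attach callback (mem_cgroup_move_task) then performs the moves in
 * mem_cgroup_move_charge(), and mem_cgroup_clear_mc() drops whatever
 * precharge is left over.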
*/ 6379 #define PRECHARGE_COUNT_AT_ONCE 256 6380 static int mem_cgroup_do_precharge(unsigned long count) 6381 { 6382 int ret = 0; 6383 int batch_count = PRECHARGE_COUNT_AT_ONCE; 6384 struct mem_cgroup *memcg = mc.to; 6385 6386 if (mem_cgroup_is_root(memcg)) { 6387 mc.precharge += count; 6388 /* we don't need css_get for root */ 6389 return ret; 6390 } 6391 /* try to charge at once */ 6392 if (count > 1) { 6393 struct res_counter *dummy; 6394 /* 6395 * "memcg" cannot be under rmdir() because we've already checked 6396 * by cgroup_lock_live_cgroup() that it is not removed and we 6397 * are still under the same cgroup_mutex. So we can postpone 6398 * css_get(). 6399 */ 6400 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 6401 goto one_by_one; 6402 if (do_swap_account && res_counter_charge(&memcg->memsw, 6403 PAGE_SIZE * count, &dummy)) { 6404 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 6405 goto one_by_one; 6406 } 6407 mc.precharge += count; 6408 return ret; 6409 } 6410 one_by_one: 6411 /* fall back to one by one charge */ 6412 while (count--) { 6413 if (signal_pending(current)) { 6414 ret = -EINTR; 6415 break; 6416 } 6417 if (!batch_count--) { 6418 batch_count = PRECHARGE_COUNT_AT_ONCE; 6419 cond_resched(); 6420 } 6421 ret = __mem_cgroup_try_charge(NULL, 6422 GFP_KERNEL, 1, &memcg, false); 6423 if (ret) 6424 /* mem_cgroup_clear_mc() will do uncharge later */ 6425 return ret; 6426 mc.precharge++; 6427 } 6428 return ret; 6429 } 6430 6431 /** 6432 * get_mctgt_type - get target type of moving charge 6433 * @vma: the vma the pte to be checked belongs 6434 * @addr: the address corresponding to the pte to be checked 6435 * @ptent: the pte to be checked 6436 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6437 * 6438 * Returns 6439 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 6440 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 6441 * move charge. if @target is not NULL, the page is stored in target->page 6442 * with extra refcnt got(Callers should handle it). 6443 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 6444 * target for charge migration. if @target is not NULL, the entry is stored 6445 * in target->ent. 6446 * 6447 * Called with pte lock held. 6448 */ 6449 union mc_target { 6450 struct page *page; 6451 swp_entry_t ent; 6452 }; 6453 6454 enum mc_target_type { 6455 MC_TARGET_NONE = 0, 6456 MC_TARGET_PAGE, 6457 MC_TARGET_SWAP, 6458 }; 6459 6460 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 6461 unsigned long addr, pte_t ptent) 6462 { 6463 struct page *page = vm_normal_page(vma, addr, ptent); 6464 6465 if (!page || !page_mapped(page)) 6466 return NULL; 6467 if (PageAnon(page)) { 6468 /* we don't move shared anon */ 6469 if (!move_anon()) 6470 return NULL; 6471 } else if (!move_file()) 6472 /* we ignore mapcount for file pages */ 6473 return NULL; 6474 if (!get_page_unless_zero(page)) 6475 return NULL; 6476 6477 return page; 6478 } 6479 6480 #ifdef CONFIG_SWAP 6481 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6482 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6483 { 6484 struct page *page = NULL; 6485 swp_entry_t ent = pte_to_swp_entry(ptent); 6486 6487 if (!move_anon() || non_swap_entry(ent)) 6488 return NULL; 6489 /* 6490 * Because lookup_swap_cache() updates some statistics counter, 6491 * we call find_get_page() with swapper_space directly. 
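	 *
	 * The returned page may be NULL when the entry is not in swap cache;
	 * the caller can still move the charge on the swap entry alone
	 * (MC_TARGET_SWAP).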
6492 */ 6493 page = find_get_page(swap_address_space(ent), ent.val); 6494 if (do_swap_account) 6495 entry->val = ent.val; 6496 6497 return page; 6498 } 6499 #else 6500 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6501 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6502 { 6503 return NULL; 6504 } 6505 #endif 6506 6507 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 6508 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6509 { 6510 struct page *page = NULL; 6511 struct address_space *mapping; 6512 pgoff_t pgoff; 6513 6514 if (!vma->vm_file) /* anonymous vma */ 6515 return NULL; 6516 if (!move_file()) 6517 return NULL; 6518 6519 mapping = vma->vm_file->f_mapping; 6520 if (pte_none(ptent)) 6521 pgoff = linear_page_index(vma, addr); 6522 else /* pte_file(ptent) is true */ 6523 pgoff = pte_to_pgoff(ptent); 6524 6525 /* page is moved even if it's not RSS of this task(page-faulted). */ 6526 page = find_get_page(mapping, pgoff); 6527 6528 #ifdef CONFIG_SWAP 6529 /* shmem/tmpfs may report page out on swap: account for that too. */ 6530 if (radix_tree_exceptional_entry(page)) { 6531 swp_entry_t swap = radix_to_swp_entry(page); 6532 if (do_swap_account) 6533 *entry = swap; 6534 page = find_get_page(swap_address_space(swap), swap.val); 6535 } 6536 #endif 6537 return page; 6538 } 6539 6540 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6541 unsigned long addr, pte_t ptent, union mc_target *target) 6542 { 6543 struct page *page = NULL; 6544 struct page_cgroup *pc; 6545 enum mc_target_type ret = MC_TARGET_NONE; 6546 swp_entry_t ent = { .val = 0 }; 6547 6548 if (pte_present(ptent)) 6549 page = mc_handle_present_pte(vma, addr, ptent); 6550 else if (is_swap_pte(ptent)) 6551 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 6552 else if (pte_none(ptent) || pte_file(ptent)) 6553 page = mc_handle_file_pte(vma, addr, ptent, &ent); 6554 6555 if (!page && !ent.val) 6556 return ret; 6557 if (page) { 6558 pc = lookup_page_cgroup(page); 6559 /* 6560 * Do only loose check w/o page_cgroup lock. 6561 * mem_cgroup_move_account() checks the pc is valid or not under 6562 * the lock. 6563 */ 6564 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6565 ret = MC_TARGET_PAGE; 6566 if (target) 6567 target->page = page; 6568 } 6569 if (!ret || !target) 6570 put_page(page); 6571 } 6572 /* There is a swap entry and a page doesn't exist or isn't charged */ 6573 if (ent.val && !ret && 6574 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 6575 ret = MC_TARGET_SWAP; 6576 if (target) 6577 target->ent = ent; 6578 } 6579 return ret; 6580 } 6581 6582 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6583 /* 6584 * We don't consider swapping or file mapped pages because THP does not 6585 * support them for now. 6586 * Caller should make sure that pmd_trans_huge(pmd) is true. 
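 *
 * Consequently this only ever returns MC_TARGET_NONE or MC_TARGET_PAGE
 * (never MC_TARGET_SWAP), taking an extra reference on the head page
 * when @target is supplied.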
6587 */ 6588 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6589 unsigned long addr, pmd_t pmd, union mc_target *target) 6590 { 6591 struct page *page = NULL; 6592 struct page_cgroup *pc; 6593 enum mc_target_type ret = MC_TARGET_NONE; 6594 6595 page = pmd_page(pmd); 6596 VM_BUG_ON(!page || !PageHead(page)); 6597 if (!move_anon()) 6598 return ret; 6599 pc = lookup_page_cgroup(page); 6600 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6601 ret = MC_TARGET_PAGE; 6602 if (target) { 6603 get_page(page); 6604 target->page = page; 6605 } 6606 } 6607 return ret; 6608 } 6609 #else 6610 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6611 unsigned long addr, pmd_t pmd, union mc_target *target) 6612 { 6613 return MC_TARGET_NONE; 6614 } 6615 #endif 6616 6617 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6618 unsigned long addr, unsigned long end, 6619 struct mm_walk *walk) 6620 { 6621 struct vm_area_struct *vma = walk->private; 6622 pte_t *pte; 6623 spinlock_t *ptl; 6624 6625 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6626 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6627 mc.precharge += HPAGE_PMD_NR; 6628 spin_unlock(&vma->vm_mm->page_table_lock); 6629 return 0; 6630 } 6631 6632 if (pmd_trans_unstable(pmd)) 6633 return 0; 6634 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6635 for (; addr != end; pte++, addr += PAGE_SIZE) 6636 if (get_mctgt_type(vma, addr, *pte, NULL)) 6637 mc.precharge++; /* increment precharge temporarily */ 6638 pte_unmap_unlock(pte - 1, ptl); 6639 cond_resched(); 6640 6641 return 0; 6642 } 6643 6644 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6645 { 6646 unsigned long precharge; 6647 struct vm_area_struct *vma; 6648 6649 down_read(&mm->mmap_sem); 6650 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6651 struct mm_walk mem_cgroup_count_precharge_walk = { 6652 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6653 .mm = mm, 6654 .private = vma, 6655 }; 6656 if (is_vm_hugetlb_page(vma)) 6657 continue; 6658 walk_page_range(vma->vm_start, vma->vm_end, 6659 &mem_cgroup_count_precharge_walk); 6660 } 6661 up_read(&mm->mmap_sem); 6662 6663 precharge = mc.precharge; 6664 mc.precharge = 0; 6665 6666 return precharge; 6667 } 6668 6669 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6670 { 6671 unsigned long precharge = mem_cgroup_count_precharge(mm); 6672 6673 VM_BUG_ON(mc.moving_task); 6674 mc.moving_task = current; 6675 return mem_cgroup_do_precharge(precharge); 6676 } 6677 6678 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6679 static void __mem_cgroup_clear_mc(void) 6680 { 6681 struct mem_cgroup *from = mc.from; 6682 struct mem_cgroup *to = mc.to; 6683 6684 /* we must uncharge all the leftover precharges from mc.to */ 6685 if (mc.precharge) { 6686 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 6687 mc.precharge = 0; 6688 } 6689 /* 6690 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6691 * we must uncharge here. 
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
		}
		/* we've already done mem_cgroup_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	mem_cgroup_end_move(from);
}

static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	int ret = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
	unsigned long move_charge_at_immigrate;

	/*
	 * We are now committed to this value, whatever it is. Changes in this
	 * tunable will only affect upcoming migrations, not the current one.
	 * So we need to save it, and keep it going.
	 */
	move_charge_at_immigrate = memcg->move_charge_at_immigrate;
	if (move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == memcg);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move an owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			mem_cgroup_start_move(from);
			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = memcg;
			mc.immigrate_flags = move_charge_at_immigrate;
			spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;
	struct page_cgroup *pc;

	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for the page table
	 *    lock to be unlocked in __split_huge_page_splitting(), where the
	 *    main part of the thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(&vma->vm_mm->page_table_lock);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				pc = lookup_page_cgroup(page);
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							pc, mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try to charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in the
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone holding the mmap_sem may be waiting on the waitq.
		 * So we cancel all extra charges, wake up all waiters, and
		 * retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * A non-zero ret means we have consumed all precharges
			 * and failed in doing an additional charge. Just
			 * abandon here.
6927 */ 6928 break; 6929 } 6930 up_read(&mm->mmap_sem); 6931 } 6932 6933 static void mem_cgroup_move_task(struct cgroup *cont, 6934 struct cgroup_taskset *tset) 6935 { 6936 struct task_struct *p = cgroup_taskset_first(tset); 6937 struct mm_struct *mm = get_task_mm(p); 6938 6939 if (mm) { 6940 if (mc.to) 6941 mem_cgroup_move_charge(mm); 6942 mmput(mm); 6943 } 6944 if (mc.to) 6945 mem_cgroup_clear_mc(); 6946 } 6947 #else /* !CONFIG_MMU */ 6948 static int mem_cgroup_can_attach(struct cgroup *cgroup, 6949 struct cgroup_taskset *tset) 6950 { 6951 return 0; 6952 } 6953 static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6954 struct cgroup_taskset *tset) 6955 { 6956 } 6957 static void mem_cgroup_move_task(struct cgroup *cont, 6958 struct cgroup_taskset *tset) 6959 { 6960 } 6961 #endif 6962 6963 /* 6964 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6965 * to verify sane_behavior flag on each mount attempt. 6966 */ 6967 static void mem_cgroup_bind(struct cgroup *root) 6968 { 6969 /* 6970 * use_hierarchy is forced with sane_behavior. cgroup core 6971 * guarantees that @root doesn't have any children, so turning it 6972 * on for the root memcg is enough. 6973 */ 6974 if (cgroup_sane_behavior(root)) 6975 mem_cgroup_from_cont(root)->use_hierarchy = true; 6976 } 6977 6978 struct cgroup_subsys mem_cgroup_subsys = { 6979 .name = "memory", 6980 .subsys_id = mem_cgroup_subsys_id, 6981 .css_alloc = mem_cgroup_css_alloc, 6982 .css_online = mem_cgroup_css_online, 6983 .css_offline = mem_cgroup_css_offline, 6984 .css_free = mem_cgroup_css_free, 6985 .can_attach = mem_cgroup_can_attach, 6986 .cancel_attach = mem_cgroup_cancel_attach, 6987 .attach = mem_cgroup_move_task, 6988 .bind = mem_cgroup_bind, 6989 .base_cftypes = mem_cgroup_files, 6990 .early_init = 0, 6991 .use_id = 1, 6992 }; 6993 6994 #ifdef CONFIG_MEMCG_SWAP 6995 static int __init enable_swap_account(char *s) 6996 { 6997 /* consider enabled if no parameter or 1 is given */ 6998 if (!strcmp(s, "1")) 6999 really_do_swap_account = 1; 7000 else if (!strcmp(s, "0")) 7001 really_do_swap_account = 0; 7002 return 1; 7003 } 7004 __setup("swapaccount=", enable_swap_account); 7005 7006 static void __init memsw_file_init(void) 7007 { 7008 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); 7009 } 7010 7011 static void __init enable_swap_cgroup(void) 7012 { 7013 if (!mem_cgroup_disabled() && really_do_swap_account) { 7014 do_swap_account = 1; 7015 memsw_file_init(); 7016 } 7017 } 7018 7019 #else 7020 static void __init enable_swap_cgroup(void) 7021 { 7022 } 7023 #endif 7024 7025 /* 7026 * subsys_initcall() for memory controller. 7027 * 7028 * Some parts like hotcpu_notifier() have to be initialized from this context 7029 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 7030 * everything that doesn't depend on a specific mem_cgroup structure should 7031 * be initialized from here. 7032 */ 7033 static int __init mem_cgroup_init(void) 7034 { 7035 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7036 enable_swap_cgroup(); 7037 mem_cgroup_soft_limit_tree_init(); 7038 memcg_stock_init(); 7039 return 0; 7040 } 7041 subsys_initcall(mem_cgroup_init); 7042