1 /* memcontrol.c - Memory Controller 2 * 3 * Copyright IBM Corporation, 2007 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 5 * 6 * Copyright 2007 OpenVZ SWsoft Inc 7 * Author: Pavel Emelianov <xemul@openvz.org> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 */ 19 20 #include <linux/res_counter.h> 21 #include <linux/memcontrol.h> 22 #include <linux/cgroup.h> 23 #include <linux/mm.h> 24 #include <linux/pagemap.h> 25 #include <linux/smp.h> 26 #include <linux/page-flags.h> 27 #include <linux/backing-dev.h> 28 #include <linux/bit_spinlock.h> 29 #include <linux/rcupdate.h> 30 #include <linux/mutex.h> 31 #include <linux/slab.h> 32 #include <linux/swap.h> 33 #include <linux/spinlock.h> 34 #include <linux/fs.h> 35 #include <linux/seq_file.h> 36 #include <linux/vmalloc.h> 37 #include <linux/mm_inline.h> 38 #include <linux/page_cgroup.h> 39 #include "internal.h" 40 41 #include <asm/uaccess.h> 42 43 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 44 #define MEM_CGROUP_RECLAIM_RETRIES 5 45 46 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 47 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ 48 int do_swap_account __read_mostly; 49 static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 50 #else 51 #define do_swap_account (0) 52 #endif 53 54 static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 55 56 /* 57 * Statistics for memory cgroup. 58 */ 59 enum mem_cgroup_stat_index { 60 /* 61 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 62 */ 63 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 64 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 65 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 66 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 67 68 MEM_CGROUP_STAT_NSTATS, 69 }; 70 71 struct mem_cgroup_stat_cpu { 72 s64 count[MEM_CGROUP_STAT_NSTATS]; 73 } ____cacheline_aligned_in_smp; 74 75 struct mem_cgroup_stat { 76 struct mem_cgroup_stat_cpu cpustat[0]; 77 }; 78 79 /* 80 * For accounting under irq disable, no need for increment preempt count. 81 */ 82 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, 83 enum mem_cgroup_stat_index idx, int val) 84 { 85 stat->count[idx] += val; 86 } 87 88 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 89 enum mem_cgroup_stat_index idx) 90 { 91 int cpu; 92 s64 ret = 0; 93 for_each_possible_cpu(cpu) 94 ret += stat->cpustat[cpu].count[idx]; 95 return ret; 96 } 97 98 /* 99 * per-zone information in memory controller. 
100 */ 101 struct mem_cgroup_per_zone { 102 /* 103 * spin_lock to protect the per cgroup LRU 104 */ 105 struct list_head lists[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS]; 107 108 struct zone_reclaim_stat reclaim_stat; 109 }; 110 /* Macro for accessing counter */ 111 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 112 113 struct mem_cgroup_per_node { 114 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 115 }; 116 117 struct mem_cgroup_lru_info { 118 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 119 }; 120 121 /* 122 * The memory controller data structure. The memory controller controls both 123 * page cache and RSS per cgroup. We would eventually like to provide 124 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 125 * to help the administrator determine what knobs to tune. 126 * 127 * TODO: Add a water mark for the memory controller. Reclaim will begin when 128 * we hit the water mark. May be even add a low water mark, such that 129 * no reclaim occurs from a cgroup at it's low water mark, this is 130 * a feature that will be implemented much later in the future. 131 */ 132 struct mem_cgroup { 133 struct cgroup_subsys_state css; 134 /* 135 * the counter to account for memory usage 136 */ 137 struct res_counter res; 138 /* 139 * the counter to account for mem+swap usage. 140 */ 141 struct res_counter memsw; 142 /* 143 * Per cgroup active and inactive list, similar to the 144 * per zone LRU lists. 145 */ 146 struct mem_cgroup_lru_info info; 147 148 /* 149 protect against reclaim related member. 150 */ 151 spinlock_t reclaim_param_lock; 152 153 int prev_priority; /* for recording reclaim priority */ 154 155 /* 156 * While reclaiming in a hiearchy, we cache the last child we 157 * reclaimed from. Protected by hierarchy_mutex 158 */ 159 struct mem_cgroup *last_scanned_child; 160 /* 161 * Should the accounting and control be hierarchical, per subtree? 162 */ 163 bool use_hierarchy; 164 unsigned long last_oom_jiffies; 165 atomic_t refcnt; 166 167 unsigned int swappiness; 168 169 /* 170 * statistics. This must be placed at the end of memcg. 171 */ 172 struct mem_cgroup_stat stat; 173 }; 174 175 enum charge_type { 176 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 177 MEM_CGROUP_CHARGE_TYPE_MAPPED, 178 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 180 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 181 NR_CHARGE_TYPE, 182 }; 183 184 /* only for here (for easy reading.) */ 185 #define PCGF_CACHE (1UL << PCG_CACHE) 186 #define PCGF_USED (1UL << PCG_USED) 187 #define PCGF_LOCK (1UL << PCG_LOCK) 188 static const unsigned long 189 pcg_default_flags[NR_CHARGE_TYPE] = { 190 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ 191 PCGF_USED | PCGF_LOCK, /* Anon */ 192 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 193 0, /* FORCE */ 194 }; 195 196 /* for encoding cft->private value on file */ 197 #define _MEM (0) 198 #define _MEMSWAP (1) 199 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 200 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 201 #define MEMFILE_ATTR(val) ((val) & 0xffff) 202 203 static void mem_cgroup_get(struct mem_cgroup *mem); 204 static void mem_cgroup_put(struct mem_cgroup *mem); 205 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 206 207 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 208 struct page_cgroup *pc, 209 bool charge) 210 { 211 int val = (charge)? 
1 : -1; 212 struct mem_cgroup_stat *stat = &mem->stat; 213 struct mem_cgroup_stat_cpu *cpustat; 214 int cpu = get_cpu(); 215 216 cpustat = &stat->cpustat[cpu]; 217 if (PageCgroupCache(pc)) 218 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 219 else 220 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 221 222 if (charge) 223 __mem_cgroup_stat_add_safe(cpustat, 224 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 225 else 226 __mem_cgroup_stat_add_safe(cpustat, 227 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 228 put_cpu(); 229 } 230 231 static struct mem_cgroup_per_zone * 232 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 233 { 234 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 235 } 236 237 static struct mem_cgroup_per_zone * 238 page_cgroup_zoneinfo(struct page_cgroup *pc) 239 { 240 struct mem_cgroup *mem = pc->mem_cgroup; 241 int nid = page_cgroup_nid(pc); 242 int zid = page_cgroup_zid(pc); 243 244 if (!mem) 245 return NULL; 246 247 return mem_cgroup_zoneinfo(mem, nid, zid); 248 } 249 250 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 251 enum lru_list idx) 252 { 253 int nid, zid; 254 struct mem_cgroup_per_zone *mz; 255 u64 total = 0; 256 257 for_each_online_node(nid) 258 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 259 mz = mem_cgroup_zoneinfo(mem, nid, zid); 260 total += MEM_CGROUP_ZSTAT(mz, idx); 261 } 262 return total; 263 } 264 265 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 266 { 267 return container_of(cgroup_subsys_state(cont, 268 mem_cgroup_subsys_id), struct mem_cgroup, 269 css); 270 } 271 272 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 273 { 274 /* 275 * mm_update_next_owner() may clear mm->owner to NULL 276 * if it races with swapoff, page migration, etc. 277 * So this can be called with p == NULL. 278 */ 279 if (unlikely(!p)) 280 return NULL; 281 282 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 283 struct mem_cgroup, css); 284 } 285 286 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 287 { 288 struct mem_cgroup *mem = NULL; 289 /* 290 * Because we have no locks, mm->owner's may be being moved to other 291 * cgroup. We use css_tryget() here even if this looks 292 * pessimistic (rather than adding locks here). 293 */ 294 rcu_read_lock(); 295 do { 296 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 297 if (unlikely(!mem)) 298 break; 299 } while (!css_tryget(&mem->css)); 300 rcu_read_unlock(); 301 return mem; 302 } 303 304 static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) 305 { 306 if (!mem) 307 return true; 308 return css_is_removed(&mem->css); 309 } 310 311 /* 312 * Following LRU functions are allowed to be used without PCG_LOCK. 313 * Operations are called by routine of global LRU independently from memcg. 314 * What we have to take care of here is validness of pc->mem_cgroup. 315 * 316 * Changes to pc->mem_cgroup happens when 317 * 1. charge 318 * 2. moving account 319 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 320 * It is added to LRU before charge. 321 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 322 * When moving account, the page is not on LRU. It's isolated. 
323 */ 324 325 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 326 { 327 struct page_cgroup *pc; 328 struct mem_cgroup *mem; 329 struct mem_cgroup_per_zone *mz; 330 331 if (mem_cgroup_disabled()) 332 return; 333 pc = lookup_page_cgroup(page); 334 /* can happen while we handle swapcache. */ 335 if (list_empty(&pc->lru) || !pc->mem_cgroup) 336 return; 337 /* 338 * We don't check PCG_USED bit. It's cleared when the "page" is finally 339 * removed from global LRU. 340 */ 341 mz = page_cgroup_zoneinfo(pc); 342 mem = pc->mem_cgroup; 343 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 344 list_del_init(&pc->lru); 345 return; 346 } 347 348 void mem_cgroup_del_lru(struct page *page) 349 { 350 mem_cgroup_del_lru_list(page, page_lru(page)); 351 } 352 353 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 354 { 355 struct mem_cgroup_per_zone *mz; 356 struct page_cgroup *pc; 357 358 if (mem_cgroup_disabled()) 359 return; 360 361 pc = lookup_page_cgroup(page); 362 /* 363 * Used bit is set without atomic ops but after smp_wmb(). 364 * For making pc->mem_cgroup visible, insert smp_rmb() here. 365 */ 366 smp_rmb(); 367 /* unused page is not rotated. */ 368 if (!PageCgroupUsed(pc)) 369 return; 370 mz = page_cgroup_zoneinfo(pc); 371 list_move(&pc->lru, &mz->lists[lru]); 372 } 373 374 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 375 { 376 struct page_cgroup *pc; 377 struct mem_cgroup_per_zone *mz; 378 379 if (mem_cgroup_disabled()) 380 return; 381 pc = lookup_page_cgroup(page); 382 /* 383 * Used bit is set without atomic ops but after smp_wmb(). 384 * For making pc->mem_cgroup visible, insert smp_rmb() here. 385 */ 386 smp_rmb(); 387 if (!PageCgroupUsed(pc)) 388 return; 389 390 mz = page_cgroup_zoneinfo(pc); 391 MEM_CGROUP_ZSTAT(mz, lru) += 1; 392 list_add(&pc->lru, &mz->lists[lru]); 393 } 394 395 /* 396 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 397 * lru because the page may.be reused after it's fully uncharged (because of 398 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 399 * it again. This function is only used to charge SwapCache. It's done under 400 * lock_page and expected that zone->lru_lock is never held. 401 */ 402 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 403 { 404 unsigned long flags; 405 struct zone *zone = page_zone(page); 406 struct page_cgroup *pc = lookup_page_cgroup(page); 407 408 spin_lock_irqsave(&zone->lru_lock, flags); 409 /* 410 * Forget old LRU when this page_cgroup is *not* used. This Used bit 411 * is guarded by lock_page() because the page is SwapCache. 
412 */ 413 if (!PageCgroupUsed(pc)) 414 mem_cgroup_del_lru_list(page, page_lru(page)); 415 spin_unlock_irqrestore(&zone->lru_lock, flags); 416 } 417 418 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 419 { 420 unsigned long flags; 421 struct zone *zone = page_zone(page); 422 struct page_cgroup *pc = lookup_page_cgroup(page); 423 424 spin_lock_irqsave(&zone->lru_lock, flags); 425 /* link when the page is linked to LRU but page_cgroup isn't */ 426 if (PageLRU(page) && list_empty(&pc->lru)) 427 mem_cgroup_add_lru_list(page, page_lru(page)); 428 spin_unlock_irqrestore(&zone->lru_lock, flags); 429 } 430 431 432 void mem_cgroup_move_lists(struct page *page, 433 enum lru_list from, enum lru_list to) 434 { 435 if (mem_cgroup_disabled()) 436 return; 437 mem_cgroup_del_lru_list(page, from); 438 mem_cgroup_add_lru_list(page, to); 439 } 440 441 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 442 { 443 int ret; 444 445 task_lock(task); 446 ret = task->mm && mm_match_cgroup(task->mm, mem); 447 task_unlock(task); 448 return ret; 449 } 450 451 /* 452 * Calculate mapped_ratio under memory controller. This will be used in 453 * vmscan.c for deteremining we have to reclaim mapped pages. 454 */ 455 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) 456 { 457 long total, rss; 458 459 /* 460 * usage is recorded in bytes. But, here, we assume the number of 461 * physical pages can be represented by "long" on any arch. 462 */ 463 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; 464 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 465 return (int)((rss * 100L) / total); 466 } 467 468 /* 469 * prev_priority control...this will be used in memory reclaim path. 470 */ 471 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 472 { 473 int prev_priority; 474 475 spin_lock(&mem->reclaim_param_lock); 476 prev_priority = mem->prev_priority; 477 spin_unlock(&mem->reclaim_param_lock); 478 479 return prev_priority; 480 } 481 482 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 483 { 484 spin_lock(&mem->reclaim_param_lock); 485 if (priority < mem->prev_priority) 486 mem->prev_priority = priority; 487 spin_unlock(&mem->reclaim_param_lock); 488 } 489 490 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 491 { 492 spin_lock(&mem->reclaim_param_lock); 493 mem->prev_priority = priority; 494 spin_unlock(&mem->reclaim_param_lock); 495 } 496 497 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 498 { 499 unsigned long active; 500 unsigned long inactive; 501 unsigned long gb; 502 unsigned long inactive_ratio; 503 504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 506 507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 508 if (gb) 509 inactive_ratio = int_sqrt(10 * gb); 510 else 511 inactive_ratio = 1; 512 513 if (present_pages) { 514 present_pages[0] = inactive; 515 present_pages[1] = active; 516 } 517 518 return inactive_ratio; 519 } 520 521 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 522 { 523 unsigned long active; 524 unsigned long inactive; 525 unsigned long present_pages[2]; 526 unsigned long inactive_ratio; 527 528 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 529 530 inactive = present_pages[0]; 531 active = present_pages[1]; 532 533 if (inactive * inactive_ratio < active) 534 return 1; 535 536 return 0; 537 } 538 539 unsigned long 
mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 540 struct zone *zone, 541 enum lru_list lru) 542 { 543 int nid = zone->zone_pgdat->node_id; 544 int zid = zone_idx(zone); 545 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 546 547 return MEM_CGROUP_ZSTAT(mz, lru); 548 } 549 550 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 551 struct zone *zone) 552 { 553 int nid = zone->zone_pgdat->node_id; 554 int zid = zone_idx(zone); 555 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 556 557 return &mz->reclaim_stat; 558 } 559 560 struct zone_reclaim_stat * 561 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 562 { 563 struct page_cgroup *pc; 564 struct mem_cgroup_per_zone *mz; 565 566 if (mem_cgroup_disabled()) 567 return NULL; 568 569 pc = lookup_page_cgroup(page); 570 /* 571 * Used bit is set without atomic ops but after smp_wmb(). 572 * For making pc->mem_cgroup visible, insert smp_rmb() here. 573 */ 574 smp_rmb(); 575 if (!PageCgroupUsed(pc)) 576 return NULL; 577 578 mz = page_cgroup_zoneinfo(pc); 579 if (!mz) 580 return NULL; 581 582 return &mz->reclaim_stat; 583 } 584 585 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 586 struct list_head *dst, 587 unsigned long *scanned, int order, 588 int mode, struct zone *z, 589 struct mem_cgroup *mem_cont, 590 int active, int file) 591 { 592 unsigned long nr_taken = 0; 593 struct page *page; 594 unsigned long scan; 595 LIST_HEAD(pc_list); 596 struct list_head *src; 597 struct page_cgroup *pc, *tmp; 598 int nid = z->zone_pgdat->node_id; 599 int zid = zone_idx(z); 600 struct mem_cgroup_per_zone *mz; 601 int lru = LRU_FILE * !!file + !!active; 602 603 BUG_ON(!mem_cont); 604 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 605 src = &mz->lists[lru]; 606 607 scan = 0; 608 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 609 if (scan >= nr_to_scan) 610 break; 611 612 page = pc->page; 613 if (unlikely(!PageCgroupUsed(pc))) 614 continue; 615 if (unlikely(!PageLRU(page))) 616 continue; 617 618 scan++; 619 if (__isolate_lru_page(page, mode, file) == 0) { 620 list_move(&page->lru, dst); 621 nr_taken++; 622 } 623 } 624 625 *scanned = scan; 626 return nr_taken; 627 } 628 629 #define mem_cgroup_from_res_counter(counter, member) \ 630 container_of(counter, struct mem_cgroup, member) 631 632 /* 633 * This routine finds the DFS walk successor. 
This routine should be 634 * called with hierarchy_mutex held 635 */ 636 static struct mem_cgroup * 637 __mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) 638 { 639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 640 641 curr_cgroup = curr->css.cgroup; 642 root_cgroup = root_mem->css.cgroup; 643 644 if (!list_empty(&curr_cgroup->children)) { 645 /* 646 * Walk down to children 647 */ 648 cgroup = list_entry(curr_cgroup->children.next, 649 struct cgroup, sibling); 650 curr = mem_cgroup_from_cont(cgroup); 651 goto done; 652 } 653 654 visit_parent: 655 if (curr_cgroup == root_cgroup) { 656 /* caller handles NULL case */ 657 curr = NULL; 658 goto done; 659 } 660 661 /* 662 * Goto next sibling 663 */ 664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { 665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, 666 sibling); 667 curr = mem_cgroup_from_cont(cgroup); 668 goto done; 669 } 670 671 /* 672 * Go up to next parent and next parent's sibling if need be 673 */ 674 curr_cgroup = curr_cgroup->parent; 675 goto visit_parent; 676 677 done: 678 return curr; 679 } 680 681 /* 682 * Visit the first child (need not be the first child as per the ordering 683 * of the cgroup list, since we track last_scanned_child) of @mem and use 684 * that to reclaim free pages from. 685 */ 686 static struct mem_cgroup * 687 mem_cgroup_get_next_node(struct mem_cgroup *root_mem) 688 { 689 struct cgroup *cgroup; 690 struct mem_cgroup *orig, *next; 691 bool obsolete; 692 693 /* 694 * Scan all children under the mem_cgroup mem 695 */ 696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 697 698 orig = root_mem->last_scanned_child; 699 obsolete = mem_cgroup_is_obsolete(orig); 700 701 if (list_empty(&root_mem->css.cgroup->children)) { 702 /* 703 * root_mem might have children before and last_scanned_child 704 * may point to one of them. We put it later. 705 */ 706 if (orig) 707 VM_BUG_ON(!obsolete); 708 next = NULL; 709 goto done; 710 } 711 712 if (!orig || obsolete) { 713 cgroup = list_first_entry(&root_mem->css.cgroup->children, 714 struct cgroup, sibling); 715 next = mem_cgroup_from_cont(cgroup); 716 } else 717 next = __mem_cgroup_get_next_node(orig, root_mem); 718 719 done: 720 if (next) 721 mem_cgroup_get(next); 722 root_mem->last_scanned_child = next; 723 if (orig) 724 mem_cgroup_put(orig); 725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 726 return (next) ? next : root_mem; 727 } 728 729 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 730 { 731 if (do_swap_account) { 732 if (res_counter_check_under_limit(&mem->res) && 733 res_counter_check_under_limit(&mem->memsw)) 734 return true; 735 } else 736 if (res_counter_check_under_limit(&mem->res)) 737 return true; 738 return false; 739 } 740 741 static unsigned int get_swappiness(struct mem_cgroup *memcg) 742 { 743 struct cgroup *cgrp = memcg->css.cgroup; 744 unsigned int swappiness; 745 746 /* root ? */ 747 if (cgrp->parent == NULL) 748 return vm_swappiness; 749 750 spin_lock(&memcg->reclaim_param_lock); 751 swappiness = memcg->swappiness; 752 spin_unlock(&memcg->reclaim_param_lock); 753 754 return swappiness; 755 } 756 757 /* 758 * Dance down the hierarchy if needed to reclaim memory. We remember the 759 * last child we reclaimed from, so that we don't end up penalizing 760 * one child extensively based on its position in the children list. 761 * 762 * root_mem is the original ancestor that we've been reclaim from. 
763 */ 764 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 765 gfp_t gfp_mask, bool noswap) 766 { 767 struct mem_cgroup *next_mem; 768 int ret = 0; 769 770 /* 771 * Reclaim unconditionally and don't check for return value. 772 * We need to reclaim in the current group and down the tree. 773 * One might think about checking for children before reclaiming, 774 * but there might be left over accounting, even after children 775 * have left. 776 */ 777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 778 get_swappiness(root_mem)); 779 if (mem_cgroup_check_under_limit(root_mem)) 780 return 1; /* indicate reclaim has succeeded */ 781 if (!root_mem->use_hierarchy) 782 return ret; 783 784 next_mem = mem_cgroup_get_next_node(root_mem); 785 786 while (next_mem != root_mem) { 787 if (mem_cgroup_is_obsolete(next_mem)) { 788 next_mem = mem_cgroup_get_next_node(root_mem); 789 continue; 790 } 791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 792 get_swappiness(next_mem)); 793 if (mem_cgroup_check_under_limit(root_mem)) 794 return 1; /* indicate reclaim has succeeded */ 795 next_mem = mem_cgroup_get_next_node(root_mem); 796 } 797 return ret; 798 } 799 800 bool mem_cgroup_oom_called(struct task_struct *task) 801 { 802 bool ret = false; 803 struct mem_cgroup *mem; 804 struct mm_struct *mm; 805 806 rcu_read_lock(); 807 mm = task->mm; 808 if (!mm) 809 mm = &init_mm; 810 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 811 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) 812 ret = true; 813 rcu_read_unlock(); 814 return ret; 815 } 816 /* 817 * Unlike exported interface, "oom" parameter is added. if oom==true, 818 * oom-killer can be invoked. 819 */ 820 static int __mem_cgroup_try_charge(struct mm_struct *mm, 821 gfp_t gfp_mask, struct mem_cgroup **memcg, 822 bool oom) 823 { 824 struct mem_cgroup *mem, *mem_over_limit; 825 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 826 struct res_counter *fail_res; 827 828 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 829 /* Don't account this! */ 830 *memcg = NULL; 831 return 0; 832 } 833 834 /* 835 * We always charge the cgroup the mm_struct belongs to. 836 * The mm_struct's mem_cgroup changes on task migration if the 837 * thread group leader migrates. It's possible that mm is not 838 * set, if so charge the init_mm (happens for pagecache usage). 839 */ 840 mem = *memcg; 841 if (likely(!mem)) { 842 mem = try_get_mem_cgroup_from_mm(mm); 843 *memcg = mem; 844 } else { 845 css_get(&mem->css); 846 } 847 if (unlikely(!mem)) 848 return 0; 849 850 VM_BUG_ON(mem_cgroup_is_obsolete(mem)); 851 852 while (1) { 853 int ret; 854 bool noswap = false; 855 856 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 857 if (likely(!ret)) { 858 if (!do_swap_account) 859 break; 860 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 861 &fail_res); 862 if (likely(!ret)) 863 break; 864 /* mem+swap counter fails */ 865 res_counter_uncharge(&mem->res, PAGE_SIZE); 866 noswap = true; 867 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 868 memsw); 869 } else 870 /* mem counter fails */ 871 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 872 res); 873 874 if (!(gfp_mask & __GFP_WAIT)) 875 goto nomem; 876 877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 878 noswap); 879 if (ret) 880 continue; 881 882 /* 883 * try_to_free_mem_cgroup_pages() might not give us a full 884 * picture of reclaim. Some pages are reclaimed and might be 885 * moved to swap cache or just unmapped from the cgroup. 
886 * Check the limit again to see if the reclaim reduced the 887 * current usage of the cgroup before giving up 888 * 889 */ 890 if (mem_cgroup_check_under_limit(mem_over_limit)) 891 continue; 892 893 if (!nr_retries--) { 894 if (oom) { 895 mutex_lock(&memcg_tasklist); 896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 897 mutex_unlock(&memcg_tasklist); 898 mem_over_limit->last_oom_jiffies = jiffies; 899 } 900 goto nomem; 901 } 902 } 903 return 0; 904 nomem: 905 css_put(&mem->css); 906 return -ENOMEM; 907 } 908 909 static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 910 { 911 struct mem_cgroup *mem; 912 swp_entry_t ent; 913 914 if (!PageSwapCache(page)) 915 return NULL; 916 917 ent.val = page_private(page); 918 mem = lookup_swap_cgroup(ent); 919 if (!mem) 920 return NULL; 921 if (!css_tryget(&mem->css)) 922 return NULL; 923 return mem; 924 } 925 926 /* 927 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 928 * USED state. If already USED, uncharge and return. 929 */ 930 931 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 932 struct page_cgroup *pc, 933 enum charge_type ctype) 934 { 935 /* try_charge() can return NULL to *memcg, taking care of it. */ 936 if (!mem) 937 return; 938 939 lock_page_cgroup(pc); 940 if (unlikely(PageCgroupUsed(pc))) { 941 unlock_page_cgroup(pc); 942 res_counter_uncharge(&mem->res, PAGE_SIZE); 943 if (do_swap_account) 944 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 945 css_put(&mem->css); 946 return; 947 } 948 pc->mem_cgroup = mem; 949 smp_wmb(); 950 pc->flags = pcg_default_flags[ctype]; 951 952 mem_cgroup_charge_statistics(mem, pc, true); 953 954 unlock_page_cgroup(pc); 955 } 956 957 /** 958 * mem_cgroup_move_account - move account of the page 959 * @pc: page_cgroup of the page. 960 * @from: mem_cgroup which the page is moved from. 961 * @to: mem_cgroup which the page is moved to. @from != @to. 962 * 963 * The caller must confirm following. 964 * - page is not on LRU (isolate_page() is useful.) 965 * 966 * returns 0 at success, 967 * returns -EBUSY when lock is busy or "pc" is unstable. 968 * 969 * This function does "uncharge" from old cgroup but doesn't do "charge" to 970 * new cgroup. It should be done by a caller. 971 */ 972 973 static int mem_cgroup_move_account(struct page_cgroup *pc, 974 struct mem_cgroup *from, struct mem_cgroup *to) 975 { 976 struct mem_cgroup_per_zone *from_mz, *to_mz; 977 int nid, zid; 978 int ret = -EBUSY; 979 980 VM_BUG_ON(from == to); 981 VM_BUG_ON(PageLRU(pc->page)); 982 983 nid = page_cgroup_nid(pc); 984 zid = page_cgroup_zid(pc); 985 from_mz = mem_cgroup_zoneinfo(from, nid, zid); 986 to_mz = mem_cgroup_zoneinfo(to, nid, zid); 987 988 if (!trylock_page_cgroup(pc)) 989 return ret; 990 991 if (!PageCgroupUsed(pc)) 992 goto out; 993 994 if (pc->mem_cgroup != from) 995 goto out; 996 997 res_counter_uncharge(&from->res, PAGE_SIZE); 998 mem_cgroup_charge_statistics(from, pc, false); 999 if (do_swap_account) 1000 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1001 css_put(&from->css); 1002 1003 css_get(&to->css); 1004 pc->mem_cgroup = to; 1005 mem_cgroup_charge_statistics(to, pc, true); 1006 ret = 0; 1007 out: 1008 unlock_page_cgroup(pc); 1009 return ret; 1010 } 1011 1012 /* 1013 * move charges to its parent. 
1014 */ 1015 1016 static int mem_cgroup_move_parent(struct page_cgroup *pc, 1017 struct mem_cgroup *child, 1018 gfp_t gfp_mask) 1019 { 1020 struct page *page = pc->page; 1021 struct cgroup *cg = child->css.cgroup; 1022 struct cgroup *pcg = cg->parent; 1023 struct mem_cgroup *parent; 1024 int ret; 1025 1026 /* Is ROOT ? */ 1027 if (!pcg) 1028 return -EINVAL; 1029 1030 1031 parent = mem_cgroup_from_cont(pcg); 1032 1033 1034 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1035 if (ret || !parent) 1036 return ret; 1037 1038 if (!get_page_unless_zero(page)) { 1039 ret = -EBUSY; 1040 goto uncharge; 1041 } 1042 1043 ret = isolate_lru_page(page); 1044 1045 if (ret) 1046 goto cancel; 1047 1048 ret = mem_cgroup_move_account(pc, child, parent); 1049 1050 putback_lru_page(page); 1051 if (!ret) { 1052 put_page(page); 1053 /* drop extra refcnt by try_charge() */ 1054 css_put(&parent->css); 1055 return 0; 1056 } 1057 1058 cancel: 1059 put_page(page); 1060 uncharge: 1061 /* drop extra refcnt by try_charge() */ 1062 css_put(&parent->css); 1063 /* uncharge if move fails */ 1064 res_counter_uncharge(&parent->res, PAGE_SIZE); 1065 if (do_swap_account) 1066 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1067 return ret; 1068 } 1069 1070 /* 1071 * Charge the memory controller for page usage. 1072 * Return 1073 * 0 if the charge was successful 1074 * < 0 if the cgroup is over its limit 1075 */ 1076 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1077 gfp_t gfp_mask, enum charge_type ctype, 1078 struct mem_cgroup *memcg) 1079 { 1080 struct mem_cgroup *mem; 1081 struct page_cgroup *pc; 1082 int ret; 1083 1084 pc = lookup_page_cgroup(page); 1085 /* can happen at boot */ 1086 if (unlikely(!pc)) 1087 return 0; 1088 prefetchw(pc); 1089 1090 mem = memcg; 1091 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1092 if (ret || !mem) 1093 return ret; 1094 1095 __mem_cgroup_commit_charge(mem, pc, ctype); 1096 return 0; 1097 } 1098 1099 int mem_cgroup_newpage_charge(struct page *page, 1100 struct mm_struct *mm, gfp_t gfp_mask) 1101 { 1102 if (mem_cgroup_disabled()) 1103 return 0; 1104 if (PageCompound(page)) 1105 return 0; 1106 /* 1107 * If already mapped, we don't have to account. 1108 * If page cache, page->mapping has address_space. 1109 * But page->mapping may have out-of-use anon_vma pointer, 1110 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 1111 * is NULL. 1112 */ 1113 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 1114 return 0; 1115 if (unlikely(!mm)) 1116 mm = &init_mm; 1117 return mem_cgroup_charge_common(page, mm, gfp_mask, 1118 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1119 } 1120 1121 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1122 gfp_t gfp_mask) 1123 { 1124 struct mem_cgroup *mem = NULL; 1125 int ret; 1126 1127 if (mem_cgroup_disabled()) 1128 return 0; 1129 if (PageCompound(page)) 1130 return 0; 1131 /* 1132 * Corner case handling. This is called from add_to_page_cache() 1133 * in usual. But some FS (shmem) precharges this page before calling it 1134 * and call add_to_page_cache() with GFP_NOWAIT. 1135 * 1136 * For GFP_NOWAIT case, the page may be pre-charged before calling 1137 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1138 * charge twice. (It works but has to pay a bit larger cost.) 1139 * And when the page is SwapCache, it should take swap information 1140 * into account. This is under lock_page() now. 
1141 */ 1142 if (!(gfp_mask & __GFP_WAIT)) { 1143 struct page_cgroup *pc; 1144 1145 1146 pc = lookup_page_cgroup(page); 1147 if (!pc) 1148 return 0; 1149 lock_page_cgroup(pc); 1150 if (PageCgroupUsed(pc)) { 1151 unlock_page_cgroup(pc); 1152 return 0; 1153 } 1154 unlock_page_cgroup(pc); 1155 } 1156 1157 if (do_swap_account && PageSwapCache(page)) { 1158 mem = try_get_mem_cgroup_from_swapcache(page); 1159 if (mem) 1160 mm = NULL; 1161 else 1162 mem = NULL; 1163 /* SwapCache may be still linked to LRU now. */ 1164 mem_cgroup_lru_del_before_commit_swapcache(page); 1165 } 1166 1167 if (unlikely(!mm && !mem)) 1168 mm = &init_mm; 1169 1170 if (page_is_file_cache(page)) 1171 return mem_cgroup_charge_common(page, mm, gfp_mask, 1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1173 1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1176 if (mem) 1177 css_put(&mem->css); 1178 if (PageSwapCache(page)) 1179 mem_cgroup_lru_add_after_commit_swapcache(page); 1180 1181 if (do_swap_account && !ret && PageSwapCache(page)) { 1182 swp_entry_t ent = {.val = page_private(page)}; 1183 /* avoid double counting */ 1184 mem = swap_cgroup_record(ent, NULL); 1185 if (mem) { 1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1187 mem_cgroup_put(mem); 1188 } 1189 } 1190 return ret; 1191 } 1192 1193 /* 1194 * While swap-in, try_charge -> commit or cancel, the page is locked. 1195 * And when try_charge() successfully returns, one refcnt to memcg without 1196 * struct page_cgroup is aquired. This refcnt will be cumsumed by 1197 * "commit()" or removed by "cancel()" 1198 */ 1199 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1200 struct page *page, 1201 gfp_t mask, struct mem_cgroup **ptr) 1202 { 1203 struct mem_cgroup *mem; 1204 int ret; 1205 1206 if (mem_cgroup_disabled()) 1207 return 0; 1208 1209 if (!do_swap_account) 1210 goto charge_cur_mm; 1211 /* 1212 * A racing thread's fault, or swapoff, may have already updated 1213 * the pte, and even removed page from swap cache: return success 1214 * to go on to do_swap_page()'s pte_same() test, which should fail. 1215 */ 1216 if (!PageSwapCache(page)) 1217 return 0; 1218 mem = try_get_mem_cgroup_from_swapcache(page); 1219 if (!mem) 1220 goto charge_cur_mm; 1221 *ptr = mem; 1222 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1223 /* drop extra refcnt from tryget */ 1224 css_put(&mem->css); 1225 return ret; 1226 charge_cur_mm: 1227 if (unlikely(!mm)) 1228 mm = &init_mm; 1229 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1230 } 1231 1232 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1233 { 1234 struct page_cgroup *pc; 1235 1236 if (mem_cgroup_disabled()) 1237 return; 1238 if (!ptr) 1239 return; 1240 pc = lookup_page_cgroup(page); 1241 mem_cgroup_lru_del_before_commit_swapcache(page); 1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1243 mem_cgroup_lru_add_after_commit_swapcache(page); 1244 /* 1245 * Now swap is on-memory. This means this page may be 1246 * counted both as mem and swap....double count. 1247 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 1248 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 1249 * may call delete_from_swap_cache() before reach here. 
1250 */ 1251 if (do_swap_account && PageSwapCache(page)) { 1252 swp_entry_t ent = {.val = page_private(page)}; 1253 struct mem_cgroup *memcg; 1254 memcg = swap_cgroup_record(ent, NULL); 1255 if (memcg) { 1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1257 mem_cgroup_put(memcg); 1258 } 1259 1260 } 1261 /* add this page(page_cgroup) to the LRU we want. */ 1262 1263 } 1264 1265 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1266 { 1267 if (mem_cgroup_disabled()) 1268 return; 1269 if (!mem) 1270 return; 1271 res_counter_uncharge(&mem->res, PAGE_SIZE); 1272 if (do_swap_account) 1273 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1274 css_put(&mem->css); 1275 } 1276 1277 1278 /* 1279 * uncharge if !page_mapped(page) 1280 */ 1281 static struct mem_cgroup * 1282 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1283 { 1284 struct page_cgroup *pc; 1285 struct mem_cgroup *mem = NULL; 1286 struct mem_cgroup_per_zone *mz; 1287 1288 if (mem_cgroup_disabled()) 1289 return NULL; 1290 1291 if (PageSwapCache(page)) 1292 return NULL; 1293 1294 /* 1295 * Check if our page_cgroup is valid 1296 */ 1297 pc = lookup_page_cgroup(page); 1298 if (unlikely(!pc || !PageCgroupUsed(pc))) 1299 return NULL; 1300 1301 lock_page_cgroup(pc); 1302 1303 mem = pc->mem_cgroup; 1304 1305 if (!PageCgroupUsed(pc)) 1306 goto unlock_out; 1307 1308 switch (ctype) { 1309 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1310 if (page_mapped(page)) 1311 goto unlock_out; 1312 break; 1313 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 1314 if (!PageAnon(page)) { /* Shared memory */ 1315 if (page->mapping && !page_is_file_cache(page)) 1316 goto unlock_out; 1317 } else if (page_mapped(page)) /* Anon */ 1318 goto unlock_out; 1319 break; 1320 default: 1321 break; 1322 } 1323 1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1327 1328 mem_cgroup_charge_statistics(mem, pc, false); 1329 ClearPageCgroupUsed(pc); 1330 /* 1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1332 * freed from LRU. This is safe because uncharged page is expected not 1333 * to be reused (freed soon). Exception is SwapCache, it's handled by 1334 * special functions. 1335 */ 1336 1337 mz = page_cgroup_zoneinfo(pc); 1338 unlock_page_cgroup(pc); 1339 1340 /* at swapout, this memcg will be accessed to record to swap */ 1341 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1342 css_put(&mem->css); 1343 1344 return mem; 1345 1346 unlock_out: 1347 unlock_page_cgroup(pc); 1348 return NULL; 1349 } 1350 1351 void mem_cgroup_uncharge_page(struct page *page) 1352 { 1353 /* early check. */ 1354 if (page_mapped(page)) 1355 return; 1356 if (page->mapping && !PageAnon(page)) 1357 return; 1358 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1359 } 1360 1361 void mem_cgroup_uncharge_cache_page(struct page *page) 1362 { 1363 VM_BUG_ON(page_mapped(page)); 1364 VM_BUG_ON(page->mapping); 1365 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1366 } 1367 1368 /* 1369 * called from __delete_from_swap_cache() and drop "page" account. 
1370 * memcg information is recorded to swap_cgroup of "ent" 1371 */ 1372 void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1373 { 1374 struct mem_cgroup *memcg; 1375 1376 memcg = __mem_cgroup_uncharge_common(page, 1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT); 1378 /* record memcg information */ 1379 if (do_swap_account && memcg) { 1380 swap_cgroup_record(ent, memcg); 1381 mem_cgroup_get(memcg); 1382 } 1383 if (memcg) 1384 css_put(&memcg->css); 1385 } 1386 1387 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1388 /* 1389 * called from swap_entry_free(). remove record in swap_cgroup and 1390 * uncharge "memsw" account. 1391 */ 1392 void mem_cgroup_uncharge_swap(swp_entry_t ent) 1393 { 1394 struct mem_cgroup *memcg; 1395 1396 if (!do_swap_account) 1397 return; 1398 1399 memcg = swap_cgroup_record(ent, NULL); 1400 if (memcg) { 1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1402 mem_cgroup_put(memcg); 1403 } 1404 } 1405 #endif 1406 1407 /* 1408 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 1409 * page belongs to. 1410 */ 1411 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 1412 { 1413 struct page_cgroup *pc; 1414 struct mem_cgroup *mem = NULL; 1415 int ret = 0; 1416 1417 if (mem_cgroup_disabled()) 1418 return 0; 1419 1420 pc = lookup_page_cgroup(page); 1421 lock_page_cgroup(pc); 1422 if (PageCgroupUsed(pc)) { 1423 mem = pc->mem_cgroup; 1424 css_get(&mem->css); 1425 } 1426 unlock_page_cgroup(pc); 1427 1428 if (mem) { 1429 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 1430 css_put(&mem->css); 1431 } 1432 *ptr = mem; 1433 return ret; 1434 } 1435 1436 /* remove redundant charge if migration failed*/ 1437 void mem_cgroup_end_migration(struct mem_cgroup *mem, 1438 struct page *oldpage, struct page *newpage) 1439 { 1440 struct page *target, *unused; 1441 struct page_cgroup *pc; 1442 enum charge_type ctype; 1443 1444 if (!mem) 1445 return; 1446 1447 /* at migration success, oldpage->mapping is NULL. */ 1448 if (oldpage->mapping) { 1449 target = oldpage; 1450 unused = NULL; 1451 } else { 1452 target = newpage; 1453 unused = oldpage; 1454 } 1455 1456 if (PageAnon(target)) 1457 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 1458 else if (page_is_file_cache(target)) 1459 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 1460 else 1461 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 1462 1463 /* unused page is not on radix-tree now. */ 1464 if (unused) 1465 __mem_cgroup_uncharge_common(unused, ctype); 1466 1467 pc = lookup_page_cgroup(target); 1468 /* 1469 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 1470 * So, double-counting is effectively avoided. 1471 */ 1472 __mem_cgroup_commit_charge(mem, pc, ctype); 1473 1474 /* 1475 * Both of oldpage and newpage are still under lock_page(). 1476 * Then, we don't have to care about race in radix-tree. 1477 * But we have to be careful that this page is unmapped or not. 1478 * 1479 * There is a case for !page_mapped(). At the start of 1480 * migration, oldpage was mapped. But now, it's zapped. 1481 * But we know *target* page is not freed/reused under us. 1482 * mem_cgroup_uncharge_page() does all necessary checks. 1483 */ 1484 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1485 mem_cgroup_uncharge_page(target); 1486 } 1487 1488 /* 1489 * A call to try to shrink memory usage under specified resource controller. 1490 * This is typically used for page reclaiming for shmem for reducing side 1491 * effect of page allocation from shmem, which is used by some mem_cgroup. 
1492 */ 1493 int mem_cgroup_shrink_usage(struct page *page, 1494 struct mm_struct *mm, 1495 gfp_t gfp_mask) 1496 { 1497 struct mem_cgroup *mem = NULL; 1498 int progress = 0; 1499 int retry = MEM_CGROUP_RECLAIM_RETRIES; 1500 1501 if (mem_cgroup_disabled()) 1502 return 0; 1503 if (page) 1504 mem = try_get_mem_cgroup_from_swapcache(page); 1505 if (!mem && mm) 1506 mem = try_get_mem_cgroup_from_mm(mm); 1507 if (unlikely(!mem)) 1508 return 0; 1509 1510 do { 1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); 1512 progress += mem_cgroup_check_under_limit(mem); 1513 } while (!progress && --retry); 1514 1515 css_put(&mem->css); 1516 if (!retry) 1517 return -ENOMEM; 1518 return 0; 1519 } 1520 1521 static DEFINE_MUTEX(set_limit_mutex); 1522 1523 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 1524 unsigned long long val) 1525 { 1526 1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1528 int progress; 1529 u64 memswlimit; 1530 int ret = 0; 1531 1532 while (retry_count) { 1533 if (signal_pending(current)) { 1534 ret = -EINTR; 1535 break; 1536 } 1537 /* 1538 * Rather than hide all in some function, I do this in 1539 * open coded manner. You see what this really does. 1540 * We have to guarantee mem->res.limit < mem->memsw.limit. 1541 */ 1542 mutex_lock(&set_limit_mutex); 1543 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1544 if (memswlimit < val) { 1545 ret = -EINVAL; 1546 mutex_unlock(&set_limit_mutex); 1547 break; 1548 } 1549 ret = res_counter_set_limit(&memcg->res, val); 1550 mutex_unlock(&set_limit_mutex); 1551 1552 if (!ret) 1553 break; 1554 1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 1556 false); 1557 if (!progress) retry_count--; 1558 } 1559 1560 return ret; 1561 } 1562 1563 int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1564 unsigned long long val) 1565 { 1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1567 u64 memlimit, oldusage, curusage; 1568 int ret; 1569 1570 if (!do_swap_account) 1571 return -EINVAL; 1572 1573 while (retry_count) { 1574 if (signal_pending(current)) { 1575 ret = -EINTR; 1576 break; 1577 } 1578 /* 1579 * Rather than hide all in some function, I do this in 1580 * open coded manner. You see what this really does. 1581 * We have to guarantee mem->res.limit < mem->memsw.limit. 1582 */ 1583 mutex_lock(&set_limit_mutex); 1584 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1585 if (memlimit > val) { 1586 ret = -EINVAL; 1587 mutex_unlock(&set_limit_mutex); 1588 break; 1589 } 1590 ret = res_counter_set_limit(&memcg->memsw, val); 1591 mutex_unlock(&set_limit_mutex); 1592 1593 if (!ret) 1594 break; 1595 1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); 1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1599 if (curusage >= oldusage) 1600 retry_count--; 1601 } 1602 return ret; 1603 } 1604 1605 /* 1606 * This routine traverse page_cgroup in given list and drop them all. 1607 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
1608 */ 1609 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1610 int node, int zid, enum lru_list lru) 1611 { 1612 struct zone *zone; 1613 struct mem_cgroup_per_zone *mz; 1614 struct page_cgroup *pc, *busy; 1615 unsigned long flags, loop; 1616 struct list_head *list; 1617 int ret = 0; 1618 1619 zone = &NODE_DATA(node)->node_zones[zid]; 1620 mz = mem_cgroup_zoneinfo(mem, node, zid); 1621 list = &mz->lists[lru]; 1622 1623 loop = MEM_CGROUP_ZSTAT(mz, lru); 1624 /* give some margin against EBUSY etc...*/ 1625 loop += 256; 1626 busy = NULL; 1627 while (loop--) { 1628 ret = 0; 1629 spin_lock_irqsave(&zone->lru_lock, flags); 1630 if (list_empty(list)) { 1631 spin_unlock_irqrestore(&zone->lru_lock, flags); 1632 break; 1633 } 1634 pc = list_entry(list->prev, struct page_cgroup, lru); 1635 if (busy == pc) { 1636 list_move(&pc->lru, list); 1637 busy = 0; 1638 spin_unlock_irqrestore(&zone->lru_lock, flags); 1639 continue; 1640 } 1641 spin_unlock_irqrestore(&zone->lru_lock, flags); 1642 1643 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 1644 if (ret == -ENOMEM) 1645 break; 1646 1647 if (ret == -EBUSY || ret == -EINVAL) { 1648 /* found lock contention or "pc" is obsolete. */ 1649 busy = pc; 1650 cond_resched(); 1651 } else 1652 busy = NULL; 1653 } 1654 1655 if (!ret && !list_empty(list)) 1656 return -EBUSY; 1657 return ret; 1658 } 1659 1660 /* 1661 * make mem_cgroup's charge to be 0 if there is no task. 1662 * This enables deleting this mem_cgroup. 1663 */ 1664 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 1665 { 1666 int ret; 1667 int node, zid, shrink; 1668 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1669 struct cgroup *cgrp = mem->css.cgroup; 1670 1671 css_get(&mem->css); 1672 1673 shrink = 0; 1674 /* should free all ? */ 1675 if (free_all) 1676 goto try_to_free; 1677 move_account: 1678 while (mem->res.usage > 0) { 1679 ret = -EBUSY; 1680 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 1681 goto out; 1682 ret = -EINTR; 1683 if (signal_pending(current)) 1684 goto out; 1685 /* This is for making all *used* pages to be on LRU. */ 1686 lru_add_drain_all(); 1687 ret = 0; 1688 for_each_node_state(node, N_HIGH_MEMORY) { 1689 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 1690 enum lru_list l; 1691 for_each_lru(l) { 1692 ret = mem_cgroup_force_empty_list(mem, 1693 node, zid, l); 1694 if (ret) 1695 break; 1696 } 1697 } 1698 if (ret) 1699 break; 1700 } 1701 /* it seems parent cgroup doesn't have enough mem */ 1702 if (ret == -ENOMEM) 1703 goto try_to_free; 1704 cond_resched(); 1705 } 1706 ret = 0; 1707 out: 1708 css_put(&mem->css); 1709 return ret; 1710 1711 try_to_free: 1712 /* returns EBUSY if there is a task or if we come here twice. */ 1713 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 1714 ret = -EBUSY; 1715 goto out; 1716 } 1717 /* we call try-to-free pages for make this cgroup empty */ 1718 lru_add_drain_all(); 1719 /* try to free all pages in this cgroup */ 1720 shrink = 1; 1721 while (nr_retries && mem->res.usage > 0) { 1722 int progress; 1723 1724 if (signal_pending(current)) { 1725 ret = -EINTR; 1726 goto out; 1727 } 1728 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 1729 false, get_swappiness(mem)); 1730 if (!progress) { 1731 nr_retries--; 1732 /* maybe some writeback is necessary */ 1733 congestion_wait(WRITE, HZ/10); 1734 } 1735 1736 } 1737 lru_add_drain(); 1738 /* try move_account...there may be some *locked* pages. 
*/ 1739 if (mem->res.usage) 1740 goto move_account; 1741 ret = 0; 1742 goto out; 1743 } 1744 1745 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 1746 { 1747 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 1748 } 1749 1750 1751 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 1752 { 1753 return mem_cgroup_from_cont(cont)->use_hierarchy; 1754 } 1755 1756 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 1757 u64 val) 1758 { 1759 int retval = 0; 1760 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1761 struct cgroup *parent = cont->parent; 1762 struct mem_cgroup *parent_mem = NULL; 1763 1764 if (parent) 1765 parent_mem = mem_cgroup_from_cont(parent); 1766 1767 cgroup_lock(); 1768 /* 1769 * If parent's use_hiearchy is set, we can't make any modifications 1770 * in the child subtrees. If it is unset, then the change can 1771 * occur, provided the current cgroup has no children. 1772 * 1773 * For the root cgroup, parent_mem is NULL, we allow value to be 1774 * set if there are no children. 1775 */ 1776 if ((!parent_mem || !parent_mem->use_hierarchy) && 1777 (val == 1 || val == 0)) { 1778 if (list_empty(&cont->children)) 1779 mem->use_hierarchy = val; 1780 else 1781 retval = -EBUSY; 1782 } else 1783 retval = -EINVAL; 1784 cgroup_unlock(); 1785 1786 return retval; 1787 } 1788 1789 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 1790 { 1791 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 1792 u64 val = 0; 1793 int type, name; 1794 1795 type = MEMFILE_TYPE(cft->private); 1796 name = MEMFILE_ATTR(cft->private); 1797 switch (type) { 1798 case _MEM: 1799 val = res_counter_read_u64(&mem->res, name); 1800 break; 1801 case _MEMSWAP: 1802 if (do_swap_account) 1803 val = res_counter_read_u64(&mem->memsw, name); 1804 break; 1805 default: 1806 BUG(); 1807 break; 1808 } 1809 return val; 1810 } 1811 /* 1812 * The user of this function is... 1813 * RES_LIMIT. 1814 */ 1815 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 1816 const char *buffer) 1817 { 1818 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 1819 int type, name; 1820 unsigned long long val; 1821 int ret; 1822 1823 type = MEMFILE_TYPE(cft->private); 1824 name = MEMFILE_ATTR(cft->private); 1825 switch (name) { 1826 case RES_LIMIT: 1827 /* This function does all necessary parse...reuse it */ 1828 ret = res_counter_memparse_write_strategy(buffer, &val); 1829 if (ret) 1830 break; 1831 if (type == _MEM) 1832 ret = mem_cgroup_resize_limit(memcg, val); 1833 else 1834 ret = mem_cgroup_resize_memsw_limit(memcg, val); 1835 break; 1836 default: 1837 ret = -EINVAL; /* should be BUG() ? 
*/ 1838 break; 1839 } 1840 return ret; 1841 } 1842 1843 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 1844 unsigned long long *mem_limit, unsigned long long *memsw_limit) 1845 { 1846 struct cgroup *cgroup; 1847 unsigned long long min_limit, min_memsw_limit, tmp; 1848 1849 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1850 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1851 cgroup = memcg->css.cgroup; 1852 if (!memcg->use_hierarchy) 1853 goto out; 1854 1855 while (cgroup->parent) { 1856 cgroup = cgroup->parent; 1857 memcg = mem_cgroup_from_cont(cgroup); 1858 if (!memcg->use_hierarchy) 1859 break; 1860 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 1861 min_limit = min(min_limit, tmp); 1862 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1863 min_memsw_limit = min(min_memsw_limit, tmp); 1864 } 1865 out: 1866 *mem_limit = min_limit; 1867 *memsw_limit = min_memsw_limit; 1868 return; 1869 } 1870 1871 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 1872 { 1873 struct mem_cgroup *mem; 1874 int type, name; 1875 1876 mem = mem_cgroup_from_cont(cont); 1877 type = MEMFILE_TYPE(event); 1878 name = MEMFILE_ATTR(event); 1879 switch (name) { 1880 case RES_MAX_USAGE: 1881 if (type == _MEM) 1882 res_counter_reset_max(&mem->res); 1883 else 1884 res_counter_reset_max(&mem->memsw); 1885 break; 1886 case RES_FAILCNT: 1887 if (type == _MEM) 1888 res_counter_reset_failcnt(&mem->res); 1889 else 1890 res_counter_reset_failcnt(&mem->memsw); 1891 break; 1892 } 1893 return 0; 1894 } 1895 1896 static const struct mem_cgroup_stat_desc { 1897 const char *msg; 1898 u64 unit; 1899 } mem_cgroup_stat_desc[] = { 1900 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 1901 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 1902 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 1903 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 1904 }; 1905 1906 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 1907 struct cgroup_map_cb *cb) 1908 { 1909 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 1910 struct mem_cgroup_stat *stat = &mem_cont->stat; 1911 int i; 1912 1913 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 1914 s64 val; 1915 1916 val = mem_cgroup_read_stat(stat, i); 1917 val *= mem_cgroup_stat_desc[i].unit; 1918 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); 1919 } 1920 /* showing # of active pages */ 1921 { 1922 unsigned long active_anon, inactive_anon; 1923 unsigned long active_file, inactive_file; 1924 unsigned long unevictable; 1925 1926 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, 1927 LRU_INACTIVE_ANON); 1928 active_anon = mem_cgroup_get_all_zonestat(mem_cont, 1929 LRU_ACTIVE_ANON); 1930 inactive_file = mem_cgroup_get_all_zonestat(mem_cont, 1931 LRU_INACTIVE_FILE); 1932 active_file = mem_cgroup_get_all_zonestat(mem_cont, 1933 LRU_ACTIVE_FILE); 1934 unevictable = mem_cgroup_get_all_zonestat(mem_cont, 1935 LRU_UNEVICTABLE); 1936 1937 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); 1938 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); 1939 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); 1940 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); 1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); 1942 1943 } 1944 { 1945 unsigned long long limit, memsw_limit; 1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 1947 cb->fill(cb, "hierarchical_memory_limit", limit); 1948 if (do_swap_account) 1949 cb->fill(cb, "hierarchical_memsw_limit", 
memsw_limit); 1950 } 1951 1952 #ifdef CONFIG_DEBUG_VM 1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 1954 1955 { 1956 int nid, zid; 1957 struct mem_cgroup_per_zone *mz; 1958 unsigned long recent_rotated[2] = {0, 0}; 1959 unsigned long recent_scanned[2] = {0, 0}; 1960 1961 for_each_online_node(nid) 1962 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1963 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1964 1965 recent_rotated[0] += 1966 mz->reclaim_stat.recent_rotated[0]; 1967 recent_rotated[1] += 1968 mz->reclaim_stat.recent_rotated[1]; 1969 recent_scanned[0] += 1970 mz->reclaim_stat.recent_scanned[0]; 1971 recent_scanned[1] += 1972 mz->reclaim_stat.recent_scanned[1]; 1973 } 1974 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 1975 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 1976 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 1977 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 1978 } 1979 #endif 1980 1981 return 0; 1982 } 1983 1984 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 1985 { 1986 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 1987 1988 return get_swappiness(memcg); 1989 } 1990 1991 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 1992 u64 val) 1993 { 1994 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 1995 struct mem_cgroup *parent; 1996 1997 if (val > 100) 1998 return -EINVAL; 1999 2000 if (cgrp->parent == NULL) 2001 return -EINVAL; 2002 2003 parent = mem_cgroup_from_cont(cgrp->parent); 2004 2005 cgroup_lock(); 2006 2007 /* If under hierarchy, only empty-root can set this value */ 2008 if ((parent->use_hierarchy) || 2009 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 2010 cgroup_unlock(); 2011 return -EINVAL; 2012 } 2013 2014 spin_lock(&memcg->reclaim_param_lock); 2015 memcg->swappiness = val; 2016 spin_unlock(&memcg->reclaim_param_lock); 2017 2018 cgroup_unlock(); 2019 2020 return 0; 2021 } 2022 2023 2024 static struct cftype mem_cgroup_files[] = { 2025 { 2026 .name = "usage_in_bytes", 2027 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2028 .read_u64 = mem_cgroup_read, 2029 }, 2030 { 2031 .name = "max_usage_in_bytes", 2032 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2033 .trigger = mem_cgroup_reset, 2034 .read_u64 = mem_cgroup_read, 2035 }, 2036 { 2037 .name = "limit_in_bytes", 2038 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2039 .write_string = mem_cgroup_write, 2040 .read_u64 = mem_cgroup_read, 2041 }, 2042 { 2043 .name = "failcnt", 2044 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2045 .trigger = mem_cgroup_reset, 2046 .read_u64 = mem_cgroup_read, 2047 }, 2048 { 2049 .name = "stat", 2050 .read_map = mem_control_stat_show, 2051 }, 2052 { 2053 .name = "force_empty", 2054 .trigger = mem_cgroup_force_empty_write, 2055 }, 2056 { 2057 .name = "use_hierarchy", 2058 .write_u64 = mem_cgroup_hierarchy_write, 2059 .read_u64 = mem_cgroup_hierarchy_read, 2060 }, 2061 { 2062 .name = "swappiness", 2063 .read_u64 = mem_cgroup_swappiness_read, 2064 .write_u64 = mem_cgroup_swappiness_write, 2065 }, 2066 }; 2067 2068 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2069 static struct cftype memsw_cgroup_files[] = { 2070 { 2071 .name = "memsw.usage_in_bytes", 2072 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2073 .read_u64 = mem_cgroup_read, 2074 }, 2075 { 2076 .name = "memsw.max_usage_in_bytes", 2077 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2078 .trigger = mem_cgroup_reset, 2079 .read_u64 = mem_cgroup_read, 2080 }, 2081 { 2082 .name 
= "memsw.limit_in_bytes", 2083 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2084 .write_string = mem_cgroup_write, 2085 .read_u64 = mem_cgroup_read, 2086 }, 2087 { 2088 .name = "memsw.failcnt", 2089 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2090 .trigger = mem_cgroup_reset, 2091 .read_u64 = mem_cgroup_read, 2092 }, 2093 }; 2094 2095 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2096 { 2097 if (!do_swap_account) 2098 return 0; 2099 return cgroup_add_files(cont, ss, memsw_cgroup_files, 2100 ARRAY_SIZE(memsw_cgroup_files)); 2101 }; 2102 #else 2103 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2104 { 2105 return 0; 2106 } 2107 #endif 2108 2109 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2110 { 2111 struct mem_cgroup_per_node *pn; 2112 struct mem_cgroup_per_zone *mz; 2113 enum lru_list l; 2114 int zone, tmp = node; 2115 /* 2116 * This routine is called against possible nodes. 2117 * But it's BUG to call kmalloc() against offline node. 2118 * 2119 * TODO: this routine can waste much memory for nodes which will 2120 * never be onlined. It's better to use memory hotplug callback 2121 * function. 2122 */ 2123 if (!node_state(node, N_NORMAL_MEMORY)) 2124 tmp = -1; 2125 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 2126 if (!pn) 2127 return 1; 2128 2129 mem->info.nodeinfo[node] = pn; 2130 memset(pn, 0, sizeof(*pn)); 2131 2132 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2133 mz = &pn->zoneinfo[zone]; 2134 for_each_lru(l) 2135 INIT_LIST_HEAD(&mz->lists[l]); 2136 } 2137 return 0; 2138 } 2139 2140 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2141 { 2142 kfree(mem->info.nodeinfo[node]); 2143 } 2144 2145 static int mem_cgroup_size(void) 2146 { 2147 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); 2148 return sizeof(struct mem_cgroup) + cpustat_size; 2149 } 2150 2151 static struct mem_cgroup *mem_cgroup_alloc(void) 2152 { 2153 struct mem_cgroup *mem; 2154 int size = mem_cgroup_size(); 2155 2156 if (size < PAGE_SIZE) 2157 mem = kmalloc(size, GFP_KERNEL); 2158 else 2159 mem = vmalloc(size); 2160 2161 if (mem) 2162 memset(mem, 0, size); 2163 return mem; 2164 } 2165 2166 /* 2167 * At destroying mem_cgroup, references from swap_cgroup can remain. 2168 * (scanning all at force_empty is too costly...) 2169 * 2170 * Instead of clearing all references at force_empty, we remember 2171 * the number of reference from swap_cgroup and free mem_cgroup when 2172 * it goes down to 0. 2173 * 2174 * Removal of cgroup itself succeeds regardless of refs from swap. 2175 */ 2176 2177 static void __mem_cgroup_free(struct mem_cgroup *mem) 2178 { 2179 int node; 2180 2181 for_each_node_state(node, N_POSSIBLE) 2182 free_mem_cgroup_per_zone_info(mem, node); 2183 2184 if (mem_cgroup_size() < PAGE_SIZE) 2185 kfree(mem); 2186 else 2187 vfree(mem); 2188 } 2189 2190 static void mem_cgroup_get(struct mem_cgroup *mem) 2191 { 2192 atomic_inc(&mem->refcnt); 2193 } 2194 2195 static void mem_cgroup_put(struct mem_cgroup *mem) 2196 { 2197 if (atomic_dec_and_test(&mem->refcnt)) { 2198 struct mem_cgroup *parent = parent_mem_cgroup(mem); 2199 __mem_cgroup_free(mem); 2200 if (parent) 2201 mem_cgroup_put(parent); 2202 } 2203 } 2204 2205 /* 2206 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 

/*
 * At destroying mem_cgroup, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free the mem_cgroup
 * when it goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */

static void __mem_cgroup_free(struct mem_cgroup *mem)
{
	int node;

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	if (mem_cgroup_size() < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}

static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}

static void mem_cgroup_put(struct mem_cgroup *mem)
{
	if (atomic_dec_and_test(&mem->refcnt)) {
		struct mem_cgroup *parent = parent_mem_cgroup(mem);
		__mem_cgroup_free(mem);
		if (parent)
			mem_cgroup_put(parent);
	}
}

/*
 * Returns the parent mem_cgroup in the memcg hierarchy when hierarchical
 * accounting is enabled for this group; NULL otherwise.
 */
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
{
	if (!mem->res.parent)
		return NULL;
	return mem_cgroup_from_res_counter(mem->res.parent, res);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account)
		do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif

static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem, *parent;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(-ENOMEM);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;
	/* root ? */
	if (cont->parent == NULL) {
		enable_swap_cgroup();
		parent = NULL;
	} else {
		parent = mem_cgroup_from_cont(cont->parent);
		mem->use_hierarchy = parent->use_hierarchy;
	}

	if (parent && parent->use_hierarchy) {
		res_counter_init(&mem->res, &parent->res);
		res_counter_init(&mem->memsw, &parent->memsw);
		/*
		 * We increment the refcnt of the parent to ensure that we can
		 * safely access it on res_counter_charge/uncharge.
		 * This refcnt will be decremented when freeing this
		 * mem_cgroup (see mem_cgroup_put()).
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&mem->res, NULL);
		res_counter_init(&mem->memsw, NULL);
	}
	mem->last_scanned_child = NULL;
	spin_lock_init(&mem->reclaim_param_lock);

	if (parent)
		mem->swappiness = get_swappiness(parent);
	atomic_set(&mem->refcnt, 1);
	return &mem->css;
free_out:
	__mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
				   struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
			       struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	struct mem_cgroup *last_scanned_child = mem->last_scanned_child;

	if (last_scanned_child) {
		VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
		mem_cgroup_put(last_scanned_child);
	}
	mem_cgroup_put(mem);
}
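
/*
 * Illustrative sketch, not part of the original file: how the reference
 * counting above ties creation, removal and swap references together.
 *
 *	mem_cgroup_create()
 *		atomic_set(&mem->refcnt, 1);	initial reference
 *		mem_cgroup_get(parent);		a hierarchical child pins its parent
 *
 *	rmdir of the cgroup
 *		mem_cgroup_pre_destroy()	-> mem_cgroup_force_empty()
 *		mem_cgroup_destroy()		-> mem_cgroup_put(mem), dropping the
 *						   initial reference
 *
 * If swap entries still remember this group (see the comment above
 * __mem_cgroup_free()), refcnt stays above zero and the structure is only
 * freed by the final mem_cgroup_put(); freeing a child then also releases
 * the reference it took on its parent in mem_cgroup_create().
 */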
2317 */ 2318 mutex_unlock(&memcg_tasklist); 2319 } 2320 2321 struct cgroup_subsys mem_cgroup_subsys = { 2322 .name = "memory", 2323 .subsys_id = mem_cgroup_subsys_id, 2324 .create = mem_cgroup_create, 2325 .pre_destroy = mem_cgroup_pre_destroy, 2326 .destroy = mem_cgroup_destroy, 2327 .populate = mem_cgroup_populate, 2328 .attach = mem_cgroup_move_task, 2329 .early_init = 0, 2330 }; 2331 2332 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2333 2334 static int __init disable_swap_account(char *s) 2335 { 2336 really_do_swap_account = 0; 2337 return 1; 2338 } 2339 __setup("noswapaccount", disable_swap_account); 2340 #endif 2341