/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
static struct kmem_cache *page_cgroup_cache __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/*
 * Statistics for the memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};

/*
 * For accounting done with irqs disabled; there is no need to bump the
 * preempt count.
 */
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	int cpu = smp_processor_id();
	stat->cpustat[cpu].count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * Per-zone information in the memory controller.
 */
enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE,
	MEM_CGROUP_ZSTAT_INACTIVE,

	NR_MEM_CGROUP_ZSTAT,
};

struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per-cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
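/*
 * For orientation: the per-zone data for a given page is reached by indexing
 * first on the page's node and then on its zone, roughly
 *
 *	mz = mem->info.nodeinfo[page_to_nid(page)]->zoneinfo[page_zonenum(page)];
 *
 * The helpers mem_cgroup_zoneinfo() and page_cgroup_zoneinfo() below are the
 * real interface; the line above is only a sketch of the indexing.
 */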
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup which is at its low water mark; this is
 * a feature that will be implemented much later.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per-cgroup active and inactive lists, similar to the
	 * per-zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};
static struct mem_cgroup init_mem_cgroup;

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two-byte
 * aligned (based on comments from Nick Piggin). But since
 * bit_spin_lock doesn't actually set that lock bit in a non-debug
 * uniprocessor kernel, we should avoid setting it here too.
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
#else
#define PAGE_CGROUP_LOCK	0x0
#endif
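/*
 * A sketch of the encoding (see page_assign_page_cgroup() and
 * page_get_page_cgroup() below): while the lock bit is held, the pointer is
 * stored with the lock bit ORed in, and readers recover it by masking the
 * bit off again:
 *
 *	page->page_cgroup = (unsigned long)pc | PAGE_CGROUP_LOCK;
 *	pc = (struct page_cgroup *)(page->page_cgroup & ~PAGE_CGROUP_LOCK);
 *
 * On !SMP && !DEBUG_SPINLOCK builds PAGE_CGROUP_LOCK is 0, so both lines
 * degenerate into a plain store and load.
 */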
/*
 * A page_cgroup is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	int flags;
};
#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */

static int page_cgroup_nid(struct page_cgroup *pc)
{
	return page_to_nid(pc->page);
}

static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
	return page_zonenum(pc->page);
}

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
};

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;

	VM_BUG_ON(!irqs_disabled());
	if (flags & PAGE_CGROUP_FLAG_CACHE)
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(stat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(stat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum mem_cgroup_zstat_index idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	VM_BUG_ON(!page_cgroup_locked(page));
	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static int try_lock_page_cgroup(struct page *page)
{
	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

static void unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
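/*
 * The usual access pattern in this file, sketched here for reference, is:
 *
 *	lock_page_cgroup(page);
 *	pc = page_get_page_cgroup(page);
 *	if (pc) {
 *		... use or reassign pc ...
 *	}
 *	unlock_page_cgroup(page);
 *
 * Lock ordering: the page_cgroup bit lock nests outside mz->lru_lock.
 * mem_cgroup_move_lists() below, which runs with the zone's lru_lock already
 * held, therefore only try_lock_page_cgroup()s (see the comment there).
 */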
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
					struct page_cgroup *pc)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
	list_del(&pc->lru);
}

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
					struct page_cgroup *pc)
{
	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;

	if (!to) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		list_add(&pc->lru, &mz->inactive_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		list_add(&pc->lru, &mz->active_list);
	}
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	if (active) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->active_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->inactive_list);
	}
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page *page, bool active)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * We cannot lock_page_cgroup while holding the zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page. But we cannot
	 * safely get to the page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for a page left on the wrong list.
	 */
	if (!try_lock_page_cgroup(page))
		return;

	pc = page_get_page_cgroup(page);
	if (pc) {
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, active);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	}
	unlock_page_cgroup(page);
}

/*
 * Calculate the mapped_ratio under the memory controller. This will be used
 * in vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}
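/*
 * For illustration (numbers made up): with a usage of 400 pages the
 * denominator becomes 400 + 1 = 401, and with 100 of those pages charged as
 * RSS the result is (100 * 100) / 401 = 24, i.e. roughly a quarter of the
 * cgroup's usage is mapped. The "+ 1" only guards against dividing by zero
 * for an empty cgroup.
 */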
/*
 * This function is called from vmscan.c, in the page reclaim loop, where the
 * balance between the active and inactive lists is calculated. For memory
 * controller page reclaim we should use the mem_cgroup's imbalance rather
 * than the zone's global LRU imbalance.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long active, inactive;
	/* active and inactive are numbers of pages, so 'long' is ok. */
	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
	return (long) (active / (inactive + 1));
}

/*
 * prev_priority control... this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate the number of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */
long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
					struct zone *zone, int priority)
{
	long nr_active;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
	return (nr_active >> priority);
}

long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
					struct zone *zone, int priority)
{
	long nr_inactive;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
	return (nr_inactive >> priority);
}
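/*
 * Illustrative numbers: with DEF_PRIORITY == 12 and, say, 16384 pages on a
 * per-zone inactive list, the first reclaim pass looks at
 * 16384 >> 12 = 4 pages; each time the priority drops the scan window
 * doubles, until at priority 0 the whole list is eligible. This mirrors the
 * way vmscan.c scales its scan counts for global reclaim.
 */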
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	if (active)
		src = &mz->active_list;
	else
		src = &mz->inactive_list;

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		page = pc->page;

		if (unlikely(!PageLRU(page)))
			continue;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;

	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
	if (unlikely(pc == NULL))
		goto err;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!memcg)) {
		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			rcu_read_unlock();
			kmem_cache_free(page_cgroup_cache, pc);
			return 0;
		}
		/*
		 * For every charge from the cgroup, increment the
		 * reference count.
		 */
		css_get(&mem->css);
		rcu_read_unlock();
	} else {
		mem = memcg;
		css_get(&memcg->css);
	}

	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
	}

	pc->mem_cgroup = mem;
	pc->page = page;
	/*
	 * If a page is accounted as page cache, insert it into the inactive
	 * list. If anon, insert it into the active list.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
		pc->flags = PAGE_CGROUP_FLAG_CACHE;
	else
		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;

	lock_page_cgroup(page);
	if (unlikely(page_get_page_cgroup(page))) {
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		kmem_cache_free(page_cgroup_cache, pc);
		goto done;
	}
	page_assign_page_cgroup(page, pc);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	unlock_page_cgroup(page);
done:
	return 0;
out:
	css_put(&mem->css);
	kmem_cache_free(page_cgroup_cache, pc);
err:
	return -ENOMEM;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;

	/*
	 * If the page is already mapped, we don't have to account it.
	 * For page cache, page->mapping points to an address_space.
	 * But page->mapping may hold a stale anon_vma pointer; detect
	 * that with a PageAnon() check. A newly mapped anonymous page's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
		return 0;
	if (unlikely(!mm))
		mm = &init_mm;
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	if (mem_cgroup_subsys.disabled)
		return 0;

	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) pre-charge the
	 * page before calling it and then call add_to_page_cache() with
	 * GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case the page may already be charged before
	 * add_to_page_cache() runs (see shmem.c). Check for that here and
	 * avoid charging twice. (It works, but at a slightly higher cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		lock_page_cgroup(page);
		pc = page_get_page_cgroup(page);
		if (pc) {
			VM_BUG_ON(pc->page != page);
			VM_BUG_ON(!pc->mem_cgroup);
			unlock_page_cgroup(page);
			return 0;
		}
		unlock_page_cgroup(page);
	}

	if (unlikely(!mm))
		mm = &init_mm;

	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
}
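/*
 * A rough sketch of how the charge entry points are used by their callers
 * (the exact call sites live outside this file and may differ in detail):
 *
 *	// anonymous fault path, before mapping the new page
 *	if (mem_cgroup_charge(page, mm, GFP_KERNEL))
 *		goto oom;
 *	...
 *	// if a later step fails
 *	mem_cgroup_uncharge_page(page);
 *
 * add_to_page_cache() similarly calls mem_cgroup_cache_charge() before
 * inserting a page into the radix-tree and uncharges it on failure.
 */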
/*
 * Uncharge if !page_mapped(page).
 */
static void
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (mem_cgroup_subsys.disabled)
		return;

	/*
	 * Check if our page_cgroup is valid.
	 */
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (unlikely(!pc))
		goto unlock;

	VM_BUG_ON(pc->page != page);

	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
		|| page_mapped(page)))
		goto unlock;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);

	mem = pc->mem_cgroup;
	res_counter_uncharge(&mem->res, PAGE_SIZE);
	css_put(&mem->css);

	kmem_cache_free(page_cgroup_cache, pc);
	return;
unlock:
	unlock_page_cgroup(page);
}

void mem_cgroup_uncharge_page(struct page *page)
{
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}

/*
 * Before starting migration, account against the new page.
 */
int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
	int ret = 0;

	if (mem_cgroup_subsys.disabled)
		return 0;

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (pc) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	}
	unlock_page_cgroup(page);
	if (mem) {
		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
			ctype, mem);
		css_put(&mem->css);
	}
	return ret;
}

/* Remove the redundant charge if migration failed. */
void mem_cgroup_end_migration(struct page *newpage)
{
	/*
	 * On success, page->mapping is not NULL.
	 * Special rollback care is necessary when
	 * 1. migration fails (newpage->mapping is cleared in this case), or
	 * 2. the newpage was moved but not remapped again because the task
	 *    exited and the newpage is now obsolete. In this case the new
	 *    page may be swapcache, so we just call mem_cgroup_uncharge_page()
	 *    unconditionally to avoid a mess; the page_cgroup is only removed
	 *    if it is unnecessary. File cache pages are still on the
	 *    radix-tree, so we don't need to care about them.
	 */
	if (!newpage->mapping)
		__mem_cgroup_uncharge_common(newpage,
					 MEM_CGROUP_CHARGE_TYPE_FORCE);
	else if (PageAnon(newpage))
		mem_cgroup_uncharge_page(newpage);
}
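/*
 * The expected call sequence from the migration code, sketched (the real
 * caller is mm/migrate.c and may wrap these steps differently):
 *
 *	mem_cgroup_prepare_migration(page, newpage);	// charge newpage
 *	... copy the contents and move the mappings ...
 *	mem_cgroup_end_migration(newpage);		// drop the charge if
 *							// the move didn't stick
 *
 * The old page is uncharged through the normal uncharge paths when its last
 * mapping or page cache reference goes away.
 */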
/*
 * A call to try to shrink memory usage under the specified memory
 * controller. This is typically used to reclaim pages on behalf of shmem,
 * to reduce the side effects that page allocation from shmem has on the
 * mem_cgroup which uses it.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
{
	struct mem_cgroup *mem;
	int progress = 0;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)
		return 0;
	if (!mm)
		return 0;

	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		return 0;
	}
	css_get(&mem->css);
	rcu_read_unlock();

	do {
		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);

	css_put(&mem->css);
	if (!retry)
		return -ENOMEM;
	return 0;
}

int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
	int progress;
	int ret = 0;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!retry_count) {
			ret = -EBUSY;
			break;
		}
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
		if (!progress)
			retry_count--;
	}
	return ret;
}
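/*
 * Note on the loop above: res_counter_set_limit() refuses to shrink the
 * limit below the current usage, so we alternate between reclaiming from the
 * group and retrying the write. retry_count is only decremented when a
 * reclaim pass makes no progress; after MEM_CGROUP_RECLAIM_RETRIES such
 * fruitless passes the write fails with -EBUSY, and a pending signal ends it
 * early with -EINTR.
 */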
/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all. Note that it doesn't reclaim the pages themselves, it just removes
 * the page_cgroups.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    int active)
{
	struct page_cgroup *pc;
	struct page *page;
	int count = FORCE_UNCHARGE_BATCH;
	unsigned long flags;
	struct list_head *list;

	if (active)
		list = &mz->active_list;
	else
		list = &mz->inactive_list;

	spin_lock_irqsave(&mz->lru_lock, flags);
	while (!list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		get_page(page);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		/*
		 * Check whether this page is on the LRU. A !LRU page can be
		 * found if it is under page migration.
		 */
		if (PageLRU(page)) {
			__mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_FORCE);
			put_page(page);
			if (--count <= 0) {
				count = FORCE_UNCHARGE_BATCH;
				cond_resched();
			}
		} else {
			/* drop the temporary reference taken above */
			put_page(page);
			cond_resched();
		}
		spin_lock_irqsave(&mz->lru_lock, flags);
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);
}

/*
 * Make the mem_cgroup's charge 0 if there is no task in it.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	int node, zid;

	css_get(&mem->css);
	/*
	 * The page reclaim code (kswapd etc.) may move pages between
	 * active_list <-> inactive_list while we don't hold a lock.
	 * So, we have to loop here until all lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				/* drop all page_cgroup in active_list */
				mem_cgroup_force_empty_list(mem, mz, 1);
				/* drop all page_cgroup in inactive_list */
				mem_cgroup_force_empty_list(mem, mz, 0);
			}
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
				    cft->private);
}

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;
	int ret;

	switch (cft->private) {
	case RES_LIMIT:
		/* This function does all the necessary parsing... reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (!ret)
			ret = mem_cgroup_resize_limit(memcg, val);
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret;
}

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
	switch (event) {
	case RES_MAX_USAGE:
		res_counter_reset_max(&mem->res);
		break;
	case RES_FAILCNT:
		res_counter_reset_failcnt(&mem->res);
		break;
	}
	return 0;
}

static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = { "pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = { "pgpgout", 1, },
};

static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	}
	/* showing # of active pages */
	{
		unsigned long active, inactive;

		inactive = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_INACTIVE);
		active = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_ACTIVE);
		cb->fill(cb, "active", (active) * PAGE_SIZE);
		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
	}
	return 0;
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "force_empty",
		.trigger = mem_force_empty_write,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
};
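/*
 * The files above show up in each cgroup directory prefixed with the
 * subsystem name: memory.usage_in_bytes, memory.limit_in_bytes, memory.stat
 * and so on. A rough usage sketch, assuming the hierarchy is mounted at
 * /cgroups and a group "foo" exists:
 *
 *	# echo 4M > /cgroups/foo/memory.limit_in_bytes
 *	# cat /cgroups/foo/memory.usage_in_bytes
 *	# echo 1 > /cgroups/foo/memory.force_empty
 *
 * limit_in_bytes accepts memparse()-style suffixes because the write path
 * goes through res_counter_memparse_write_strategy().
 */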
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste a lot of memory for nodes which will
	 *       never be onlined. It would be better to use a memory hotplug
	 *       callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		INIT_LIST_HEAD(&mz->active_list);
		INIT_LIST_HEAD(&mz->inactive_list);
		spin_lock_init(&mz->lru_lock);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;

	if (sizeof(*mem) < PAGE_SIZE)
		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
	else
		mem = vmalloc(sizeof(*mem));

	if (mem)
		memset(mem, 0, sizeof(*mem));
	return mem;
}

static void mem_cgroup_free(struct mem_cgroup *mem)
{
	if (sizeof(*mem) < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}
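/*
 * Why the kmalloc()/vmalloc() split above: struct mem_cgroup embeds a
 * mem_cgroup_stat with one cacheline-aligned entry per possible CPU, so on
 * large NR_CPUS configurations sizeof(struct mem_cgroup) can exceed
 * PAGE_SIZE and a physically contiguous allocation may be hard to satisfy.
 * Since the compile-time size determines which allocator was used, the same
 * test picks kfree() or vfree() on the way out.
 */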
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
	} else {
		mem = mem_cgroup_alloc();
		if (!mem)
			return ERR_PTR(-ENOMEM);
	}

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader.
	 */
	if (!thread_group_leader(p))
		goto out;

out:
	mmput(mm);
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};