/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CGROUP_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,	/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,	/* # of pages charged as rss */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
};

/*
 * Called with irqs disabled, so there is no need to bump the preempt count
 * before touching the per-cpu counter.
 */
static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	int cpu = smp_processor_id();
	stat->cpustat[cpu].count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}

/*
 * per-zone information in memory controller.
 */

enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE,
	MEM_CGROUP_ZSTAT_INACTIVE,

	NR_MEM_CGROUP_ZSTAT,
};

struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t		lru_lock;
	struct list_head	active_list;
	struct list_head	inactive_list;
	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
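
/*
 * The LRU info above is kept per node and per zone, mirroring the layout of
 * the global LRU lists; MEM_CGROUP_ZSTAT() indexes the per-zone
 * active/inactive counters, which are updated under the corresponding
 * mem_cgroup_per_zone's lru_lock.
 */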

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */
	/*
	 * statistics.
	 */
	struct mem_cgroup_stat stat;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)

/*
 * A page_cgroup is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
					/* mapped and cached states     */
	int	flags;
};
#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
#define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */

static inline int page_cgroup_nid(struct page_cgroup *pc)
{
	return page_to_nid(pc->page);
}

static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
{
	return page_zonenum(pc->page);
}

enum {
	MEM_CGROUP_TYPE_UNSPEC = 0,
	MEM_CGROUP_TYPE_MAPPED,
	MEM_CGROUP_TYPE_CACHED,
	MEM_CGROUP_TYPE_ALL,
	MEM_CGROUP_TYPE_MAX,
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
};

/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable() around the per-cpu statistics update.
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	VM_BUG_ON(!irqs_disabled());

	if (flags & PAGE_CGROUP_FLAG_CACHE)
		__mem_cgroup_stat_add_safe(stat,
					MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
}

static inline struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	BUG_ON(!mem->info.nodeinfo[nid]);
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static inline struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum mem_cgroup_zstat_index idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup init_mem_cgroup;

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}

static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
					&page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int locked;

	/*
	 * While resetting the page_cgroup we might not hold the
	 * page_cgroup lock. free_hot_cold_page() is an example
	 * of such a scenario.
	 */
	if (pc)
		VM_BUG_ON(!page_cgroup_locked(page));
	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
	page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *)
		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void __always_inline lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
	VM_BUG_ON(!page_cgroup_locked(page));
}

static void __always_inline unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
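
/*
 * Typical usage of the helpers above (illustrative only):
 *
 *	lock_page_cgroup(page);
 *	pc = page_get_page_cgroup(page);
 *	... inspect or reassign the page's page_cgroup ...
 *	unlock_page_cgroup(page);
 *
 * page_assign_page_cgroup() expects the bit lock to be held, except when a
 * page is being freed and can no longer be reached concurrently.
 */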

/*
 * Tie a new page_cgroup to the struct page under lock_page_cgroup().
 * This can fail if the page has already been tied to a page_cgroup.
 * Returns 0 on success.
 */
static int page_cgroup_assign_new_page_cgroup(struct page *page,
						struct page_cgroup *pc)
{
	int ret = 0;

	lock_page_cgroup(page);
	if (!page_get_page_cgroup(page))
		page_assign_page_cgroup(page, pc);
	else	/* The page is already tied to another page_cgroup. */
		ret = 1;
	unlock_page_cgroup(page);
	return ret;
}

/*
 * Clear the page->page_cgroup member under lock_page_cgroup().
 * If the given "pc" value is different from the current page->page_cgroup,
 * page->page_cgroup is not cleared.
 * Returns the value of page->page_cgroup at the time the lock was taken,
 * so a caller can detect a failed clear by checking
 * clear_page_cgroup(page, pc) == pc.
 */
static struct page_cgroup *clear_page_cgroup(struct page *page,
						struct page_cgroup *pc)
{
	struct page_cgroup *ret;
	/* lock and clear */
	lock_page_cgroup(page);
	ret = page_get_page_cgroup(page);
	if (likely(ret == pc))
		page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);
	return ret;
}

static void __mem_cgroup_remove_list(struct page_cgroup *pc)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
	list_del_init(&pc->lru);
}

static void __mem_cgroup_add_list(struct page_cgroup *pc)
{
	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (!to) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		list_add(&pc->lru, &mz->inactive_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		list_add(&pc->lru, &mz->active_list);
	}
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
}

static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);

	if (from)
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
	else
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;

	if (active) {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->active_list);
	} else {
		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
		list_move(&pc->lru, &mz->inactive_list);
	}
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && vm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	if (!pc)
		return;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
}

/*
 * Calculate the mapped_ratio under the memory controller. This will be used
 * in vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}
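
/*
 * Worked example for the ratio above: with a usage of 999 pages, 400 of
 * them RSS, total becomes 1000 and the function reports 40 (i.e. 40% of
 * the charged memory is mapped).  The "+ 1" on total avoids a division by
 * zero for an empty cgroup.
 */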

/*
 * This function is called from vmscan.c. In the page reclaiming loop, the
 * balance between the active and inactive lists is calculated. For memory
 * controller page reclaiming, we should use the mem_cgroup's imbalance
 * rather than the zone's global lru imbalance.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long active, inactive;
	/* active and inactive are the number of pages. 'long' is ok. */
	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
	return (long) (active / (inactive + 1));
}

/*
 * prev_priority control...this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	return mem->prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	mem->prev_priority = priority;
}

/*
 * Calculate # of pages to be scanned in this priority/zone.
 * See also vmscan.c
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */
long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
				   struct zone *zone, int priority)
{
	long nr_active;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
	return (nr_active >> priority);
}

long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
				     struct zone *zone, int priority)
{
	long nr_inactive;
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);

	return (nr_inactive >> priority);
}
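
/*
 * Example for the two helpers above: with 8192 inactive pages in this zone
 * and priority 12 (the usual DEF_PRIORITY value), the caller is asked to
 * scan 8192 >> 12 = 2 pages; as priority decays towards 0 the scan window
 * grows until it covers the whole list.
 */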

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;

	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	if (active)
		src = &mz->active_list;
	else
		src = &mz->inactive_list;

	spin_lock(&mz->lru_lock);
	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)
			break;
		page = pc->page;
		VM_BUG_ON(!pc);

		if (unlikely(!PageLRU(page)))
			continue;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			continue;
		}

		scan++;
		list_move(&pc->lru, &pc_list);

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;

	/*
	 * Should page_cgroups go to their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
retry:
	if (page) {
		lock_page_cgroup(page);
		pc = page_get_page_cgroup(page);
		/*
		 * The page_cgroup exists and
		 * the page has already been accounted.
		 */
		if (pc) {
			if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
				/* this page is being uncharged ? */
				unlock_page_cgroup(page);
				cpu_relax();
				goto retry;
			} else {
				unlock_page_cgroup(page);
				goto done;
			}
		}
		unlock_page_cgroup(page);
	}

	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
	if (pc == NULL)
		goto err;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	rcu_read_lock();
	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment the reference count.
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	/*
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			goto out;

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;

		if (!nr_retries--) {
			mem_cgroup_out_of_memory(mem, gfp_mask);
			goto out;
		}
		congestion_wait(WRITE, HZ/10);
	}

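	/*
	 * The charge succeeded: initialise the new page_cgroup.  Pages start
	 * out on the active list; mem_cgroup_move_lists() may deactivate
	 * them later.
	 */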
	atomic_set(&pc->ref_cnt, 1);
	pc->mem_cgroup = mem;
	pc->page = page;
	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
		pc->flags |= PAGE_CGROUP_FLAG_CACHE;

	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
		/*
		 * Another charge has been added to this page already.
		 * We take lock_page_cgroup(page) again, read
		 * page->page_cgroup and increment its refcnt, so just
		 * retrying is OK.
		 */
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		kfree(pc);
		if (!page)
			goto done;
		goto retry;
	}

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	/* Update statistics vector */
	__mem_cgroup_add_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

done:
	return 0;
out:
	css_put(&mem->css);
	kfree(pc);
err:
	return -ENOMEM;
}

int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
			gfp_t gfp_mask)
{
	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

/*
 * See if the cached pages should be charged at all?
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask)
{
	int ret = 0;
	if (!mm)
		mm = &init_mm;

	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE);
	return ret;
}

/*
 * Uncharging is always a welcome operation, we never complain, simply
 * uncharge. This routine should be called with lock_page_cgroup held.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;
	struct page *page;
	unsigned long flags;

	/*
	 * Check if our page_cgroup is valid.
	 */
	if (!pc)
		return;

	if (atomic_dec_and_test(&pc->ref_cnt)) {
		page = pc->page;
		mz = page_cgroup_zoneinfo(pc);
		/*
		 * get page->page_cgroup and clear it under lock.
		 * force_empty can drop page->page_cgroup without checking
		 * the refcnt.
		 */
		unlock_page_cgroup(page);
		if (clear_page_cgroup(page, pc) == pc) {
			mem = pc->mem_cgroup;
			css_put(&mem->css);
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			spin_lock_irqsave(&mz->lru_lock, flags);
			__mem_cgroup_remove_list(pc);
			spin_unlock_irqrestore(&mz->lru_lock, flags);
			kfree(pc);
		}
		lock_page_cgroup(page);
	}
}

void mem_cgroup_uncharge_page(struct page *page)
{
	lock_page_cgroup(page);
	mem_cgroup_uncharge(page_get_page_cgroup(page));
	unlock_page_cgroup(page);
}
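
/*
 * Page migration support, in three steps: mem_cgroup_prepare_migration()
 * pins the page_cgroup with an extra reference, mem_cgroup_page_migration()
 * re-ties that page_cgroup to the new page, and mem_cgroup_end_migration()
 * drops the extra reference again, uncharging if it was the last one.
 */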

/*
 * Returns non-zero if a page (under migration) has a valid page_cgroup
 * member. The refcnt of the page_cgroup is incremented.
 */
int mem_cgroup_prepare_migration(struct page *page)
{
	struct page_cgroup *pc;
	int ret = 0;
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	if (pc && atomic_inc_not_zero(&pc->ref_cnt))
		ret = 1;
	unlock_page_cgroup(page);
	return ret;
}

void mem_cgroup_end_migration(struct page *page)
{
	struct page_cgroup *pc;

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	mem_cgroup_uncharge(pc);
	unlock_page_cgroup(page);
}

/*
 * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
 * And there is no race with the uncharge() routines because the page_cgroup
 * for *page* holds an extra reference taken by
 * mem_cgroup_prepare_migration().
 */
void mem_cgroup_page_migration(struct page *page, struct page *newpage)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	unsigned long flags;
	struct mem_cgroup_per_zone *mz;
retry:
	pc = page_get_page_cgroup(page);
	if (!pc)
		return;
	mem = pc->mem_cgroup;
	mz = page_cgroup_zoneinfo(pc);
	if (clear_page_cgroup(page, pc) != pc)
		goto retry;

	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	pc->page = newpage;
	lock_page_cgroup(newpage);
	page_assign_page_cgroup(newpage, pc);
	unlock_page_cgroup(newpage);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	return;
}

/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all. It ignores page_cgroup->ref_cnt.
 * *And* this routine doesn't reclaim the page itself, it just removes the
 * page_cgroup.
 */
#define FORCE_UNCHARGE_BATCH	(128)
static void
mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			    struct mem_cgroup_per_zone *mz,
			    int active)
{
	struct page_cgroup *pc;
	struct page *page;
	int count;
	unsigned long flags;
	struct list_head *list;

	if (active)
		list = &mz->active_list;
	else
		list = &mz->inactive_list;

	if (list_empty(list))
		return;
retry:
	count = FORCE_UNCHARGE_BATCH;
	spin_lock_irqsave(&mz->lru_lock, flags);

	while (--count && !list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		page = pc->page;
		/* Avoid race with charge */
		atomic_set(&pc->ref_cnt, 0);
		if (clear_page_cgroup(page, pc) == pc) {
			css_put(&mem->css);
			res_counter_uncharge(&mem->res, PAGE_SIZE);
			__mem_cgroup_remove_list(pc);
			kfree(pc);
		} else	/* being uncharged ? ...do relax */
			break;
	}
	spin_unlock_irqrestore(&mz->lru_lock, flags);
	if (!list_empty(list)) {
		cond_resched();
		goto retry;
	}
	return;
}

/*
 * Make the mem_cgroup's charge 0 if there is no task attached.
 * This enables deleting this mem_cgroup.
 */
int mem_cgroup_force_empty(struct mem_cgroup *mem)
{
	int ret = -EBUSY;
	int node, zid;
	css_get(&mem->css);
	/*
	 * page reclaim code (kswapd etc..) will move pages between
	 * active_list <-> inactive_list while we don't take a lock.
	 * So, we have to loop here until all the lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
			goto out;
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;
				mz = mem_cgroup_zoneinfo(mem, node, zid);
				/* drop all page_cgroup in active_list */
				mem_cgroup_force_empty_list(mem, mz, 1);
				/* drop all page_cgroup in inactive_list */
				mem_cgroup_force_empty_list(mem, mz, 0);
			}
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;
}

int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
	*tmp = memparse(buf, &buf);
	if (*buf != '\0')
		return -EINVAL;

	/*
	 * Round up the value to the closest page size.
	 */
	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
	return 0;
}

static ssize_t mem_cgroup_read(struct cgroup *cont,
			struct cftype *cft, struct file *file,
			char __user *userbuf, size_t nbytes, loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				NULL);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				mem_cgroup_write_strategy);
}

static ssize_t mem_force_empty_write(struct cgroup *cont,
				struct cftype *cft, struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	int ret;
	ret = mem_cgroup_force_empty(mem);
	if (!ret)
		ret = nbytes;
	return ret;
}
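
/*
 * Writing any value to the "force_empty" control file ends up in
 * mem_force_empty_write() above; it only succeeds while no tasks are
 * attached to the cgroup, and returns -EBUSY otherwise.
 */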

/*
 * Note: This should be removed if cgroup supports write-only file.
 */
static ssize_t mem_force_empty_read(struct cgroup *cont,
				struct cftype *cft,
				struct file *file, char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return -EINVAL;
}

static const struct mem_cgroup_stat_desc {
	const char *msg;
	u64 unit;
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
};

static int mem_control_stat_show(struct seq_file *m, void *arg)
{
	struct cgroup *cont = m->private;
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int i;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		s64 val;

		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
				(long long)val);
	}
	/* showing # of active pages */
	{
		unsigned long active, inactive;

		inactive = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_INACTIVE);
		active = mem_cgroup_get_all_zonestat(mem_cont,
						MEM_CGROUP_ZSTAT_ACTIVE);
		seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
		seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
	}
	return 0;
}

static const struct file_operations mem_control_stat_file_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int mem_control_stat_open(struct inode *unused, struct file *file)
{
	/* XXX __d_cont */
	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;

	file->f_op = &mem_control_stat_file_operations;
	return single_open(file, mem_control_stat_show, cont);
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
	{
		.name = "force_empty",
		.write = mem_force_empty_write,
		.read = mem_force_empty_read,
	},
	{
		.name = "stat",
		.open = mem_control_stat_open,
	},
};
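
/*
 * With the memory controller mounted, these entries show up in each cgroup
 * directory as memory.usage_in_bytes, memory.limit_in_bytes, memory.failcnt,
 * memory.force_empty and memory.stat.  limit_in_bytes accepts memparse()
 * suffixes (k, m, g) and is rounded up to a whole number of pages by
 * mem_cgroup_write_strategy().
 */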

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone;
	/*
	 * This routine is called against possible nodes.
	 * But it's a bug to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug
	 *       callback function.
	 */
	if (node_state(node, N_HIGH_MEMORY))
		pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
	else
		pn = kmalloc(sizeof(*pn), GFP_KERNEL);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		INIT_LIST_HEAD(&mz->active_list);
		INIT_LIST_HEAD(&mz->inactive_list);
		spin_lock_init(&mz->lru_lock);
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;
	int node;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return NULL;

	res_counter_init(&mem->res);

	memset(&mem->info, 0, sizeof(mem->info));

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	return &mem->css;
free_out:
	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		kfree(mem);
	return NULL;
}

static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int node;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader.
	 */
	if (p->tgid != p->pid)
		goto out;

	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
	return;
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
};