Lines Matching +full:no +full:- +full:ref +full:- +full:current +full:- +full:limit
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* memcontrol.c - Memory Controller
28 #include <linux/cgroup-defs.h>
39 #include <linux/page-flags.h>
40 #include <linux/backing-dev.h>
70 #include "memcontrol-v1.h"
104 return tsk_is_oom_victim(current) || fatal_signal_pending(current) || in task_is_dying()
105 (current->flags & PF_EXITING); in task_is_dying()
113 return &memcg->vmpressure; in memcg_to_vmpressure()
135 static void obj_cgroup_release(struct percpu_ref *ref) in obj_cgroup_release() argument
137 struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); in obj_cgroup_release()
144 * objcg->nr_charged_bytes can't have an arbitrary byte value. in obj_cgroup_release()
148 * 1) CPU0: objcg == stock->cached_objcg in obj_cgroup_release()
153 * objcg->nr_charged_bytes = PAGE_SIZE - 92 in obj_cgroup_release()
155 * 92 bytes are added to stock->nr_bytes in obj_cgroup_release()
157 * 92 bytes are added to objcg->nr_charged_bytes in obj_cgroup_release()
162 nr_bytes = atomic_read(&objcg->nr_charged_bytes); in obj_cgroup_release()
163 WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); in obj_cgroup_release()
170 list_del(&objcg->list); in obj_cgroup_release()
173 percpu_ref_exit(ref); in obj_cgroup_release()
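
The race sketched in the comment fragments above is why the leftover byte count seen at release time is page-aligned: the bytes returned when the object is freed and the remainder parked in objcg->nr_charged_bytes are two complementary pieces of the same charged page. A minimal userspace model of that arithmetic (a sketch, not file content; PAGE_SIZE is assumed to be 4096 and 92 is just the example size from the comment):

#include <assert.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the illustration */

int main(void)
{
	unsigned long obj_size = 92;				/* small allocation from the example */
	unsigned long objcg_bytes = PAGE_SIZE - obj_size;	/* remainder left when the stock was flushed */
	unsigned long stock_bytes = obj_size;			/* bytes returned when the object is freed */

	/* what the release path later reads as the combined leftover */
	unsigned long nr_bytes = objcg_bytes + stock_bytes;

	/* whole pages only, which is why the WARN_ON_ONCE above never fires here */
	assert((nr_bytes & (PAGE_SIZE - 1)) == 0);
	return 0;
}
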
186 ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, in obj_cgroup_alloc()
192 INIT_LIST_HEAD(&objcg->list); in obj_cgroup_alloc()
201 objcg = rcu_replace_pointer(memcg->objcg, NULL, true); in memcg_reparent_objcgs()
206 list_add(&objcg->list, &memcg->objcg_list); in memcg_reparent_objcgs()
208 list_for_each_entry(iter, &memcg->objcg_list, list) in memcg_reparent_objcgs()
209 WRITE_ONCE(iter->memcg, parent); in memcg_reparent_objcgs()
211 list_splice(&memcg->objcg_list, &parent->objcg_list); in memcg_reparent_objcgs()
215 percpu_ref_kill(&objcg->refcnt); in memcg_reparent_objcgs()
231 * mem_cgroup_css_from_folio - css of the memcg associated with a folio
248 return &memcg->css; in mem_cgroup_css_from_folio()
252 * page_cgroup_ino - return inode number of the memcg a page is charged to
273 while (memcg && !(memcg->css.flags & CSS_ONLINE)) in page_cgroup_ino()
276 ino = cgroup_ino(memcg->css.cgroup); in page_cgroup_ino()
371 /* Non-hierarchical (CPU aggregated) state */
392 x = READ_ONCE(pn->lruvec_stats->state[i]); in lruvec_page_state()
415 x = READ_ONCE(pn->lruvec_stats->state_local[i]); in lruvec_page_state_local()
511 /* Non-hierarchical (CPU aggregated) page state & events */
569 return atomic64_read(&vmstats->stats_updates) > in memcg_vmstats_needs_flush()
582 cgroup_rstat_updated(memcg->css.cgroup, cpu); in memcg_rstat_updated()
583 statc = this_cpu_ptr(memcg->vmstats_percpu); in memcg_rstat_updated()
584 for (; statc; statc = statc->parent) { in memcg_rstat_updated()
585 stats_updates = READ_ONCE(statc->stats_updates) + abs(val); in memcg_rstat_updated()
586 WRITE_ONCE(statc->stats_updates, stats_updates); in memcg_rstat_updated()
591 * If @memcg is already flush-able, increasing stats_updates is in memcg_rstat_updated()
594 if (!memcg_vmstats_needs_flush(statc->vmstats)) in memcg_rstat_updated()
596 &statc->vmstats->stats_updates); in memcg_rstat_updated()
597 WRITE_ONCE(statc->stats_updates, 0); in memcg_rstat_updated()
603 bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); in __mem_cgroup_flush_stats()
605 trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), in __mem_cgroup_flush_stats()
614 cgroup_rstat_flush(memcg->css.cgroup); in __mem_cgroup_flush_stats()
618 * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree
622 * minimum amount of work to be done even if there are no stat updates to flush.
648 * in latency-sensitive paths is as cheap as possible. in flush_memcg_stats_dwork()
662 x = READ_ONCE(memcg->vmstats->state[i]); in memcg_page_state()
674 * up non-zero sub-page updates to 1 page as zero page updates are ignored.
687 * __mod_memcg_state - update cgroup memory statistics
689 * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
703 __this_cpu_add(memcg->vmstats_percpu->state[i], val); in __mod_memcg_state()
718 x = READ_ONCE(memcg->vmstats->state_local[i]); in memcg_page_state_local()
738 memcg = pn->memcg; in __mod_memcg_lruvec_state()
742 * update their counter from in-interrupt context. For these two in __mod_memcg_lruvec_state()
760 __this_cpu_add(memcg->vmstats_percpu->state[i], val); in __mod_memcg_lruvec_state()
763 __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); in __mod_memcg_lruvec_state()
772 * __mod_lruvec_state - update lruvec memory statistics
779 * change of state at this level: per-node, per-cgroup, per-lruvec.
801 /* Untracked pages have no memcg, no lruvec. Update only the node */ in __lruvec_stat_mod_folio()
824 * Untracked pages have no memcg, no lruvec. Update only the in __mod_lruvec_kmem_state()
826 * when we free the slab object, we need to update the per-memcg in __mod_lruvec_kmem_state()
839 * __count_memcg_events - account VM events in a cgroup
856 __this_cpu_add(memcg->vmstats_percpu->events[i], count); in __count_memcg_events()
869 return READ_ONCE(memcg->vmstats->events[i]); in memcg_events()
879 return READ_ONCE(memcg->vmstats->events_local[i]); in memcg_events_local()
885 * mm_update_next_owner() may clear mm->owner to NULL in mem_cgroup_from_task()
901 return current->active_memcg; in active_memcg()
908 * Obtain a reference on mm->memcg and returns it if successful. If mm
911 * 2) current->mm->memcg, if available
927 * No need to css_get on root memcg as the reference in get_mem_cgroup_from_mm()
934 /* remote memcg must hold a ref */ in get_mem_cgroup_from_mm()
935 css_get(&memcg->css); in get_mem_cgroup_from_mm()
938 mm = current->mm; in get_mem_cgroup_from_mm()
945 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); in get_mem_cgroup_from_mm()
948 } while (!css_tryget(&memcg->css)); in get_mem_cgroup_from_mm()
955 * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
966 memcg = mem_cgroup_from_task(current); in get_mem_cgroup_from_current()
967 if (!css_tryget(&memcg->css)) { in get_mem_cgroup_from_current()
976 * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
987 if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) in get_mem_cgroup_from_folio()
994 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1000 * @root itself, or %NULL after a full round-trip.
1004 * to cancel a hierarchy walk before the round-trip is complete.
1031 int nid = reclaim->pgdat->node_id; in mem_cgroup_iter()
1033 iter = &root->nodeinfo[nid]->iter; in mem_cgroup_iter()
1034 gen = atomic_read(&iter->generation); in mem_cgroup_iter()
1037 * On start, join the current reclaim iteration cycle. in mem_cgroup_iter()
1041 reclaim->generation = gen; in mem_cgroup_iter()
1042 else if (reclaim->generation != gen) in mem_cgroup_iter()
1045 pos = READ_ONCE(iter->position); in mem_cgroup_iter()
1049 css = pos ? &pos->css : NULL; in mem_cgroup_iter()
1051 while ((css = css_next_descendant_pre(css, &root->css))) { in mem_cgroup_iter()
1057 if (css == &root->css || css_tryget(css)) in mem_cgroup_iter()
1069 if (cmpxchg(&iter->position, pos, next) != pos) { in mem_cgroup_iter()
1070 if (css && css != &root->css) in mem_cgroup_iter()
1076 atomic_inc(&iter->generation); in mem_cgroup_iter()
1081 * the hierarchy - make sure they see at least in mem_cgroup_iter()
1092 css_put(&prev->css); in mem_cgroup_iter()
1098 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1108 css_put(&prev->css); in mem_cgroup_iter_break()
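
Taken together, the doc-comment fragments above describe the iterator contract: pass the previous return value back in on each call, and use mem_cgroup_iter_break() to stop early so the held reference is dropped. A kernel-style sketch of the usual caller pattern (not taken from this file; do_something() and should_stop() are placeholder helpers):

static void walk_subtree(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(root, NULL, NULL);
	do {
		do_something(memcg);		/* placeholder per-memcg work */
		if (should_stop()) {		/* placeholder early-exit condition */
			/* drop the reference taken by the iterator and stop */
			mem_cgroup_iter_break(root, memcg);
			break;
		}
	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}
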
1119 mz = from->nodeinfo[nid]; in __invalidate_reclaim_iterators()
1120 iter = &mz->iter; in __invalidate_reclaim_iterators()
1121 cmpxchg(&iter->position, dead_memcg, NULL); in __invalidate_reclaim_iterators()
1136 * When cgroup1 non-hierarchy mode is used, in invalidate_reclaim_iterators()
1147 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1153 * descendants and calls @fn for each task. If @fn returns a non-zero
1172 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); in mem_cgroup_scan_tasks()
1205 * folio_lruvec_lock - Lock the lruvec for a folio.
1209 * - folio locked
1210 * - folio_test_lru false
1211 * - folio frozen (refcount of 0)
1219 spin_lock(&lruvec->lru_lock); in folio_lruvec_lock()
1226 * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1230 * - folio locked
1231 * - folio_test_lru false
1232 * - folio frozen (refcount of 0)
1241 spin_lock_irq(&lruvec->lru_lock); in folio_lruvec_lock_irq()
1248 * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1253 * - folio locked
1254 * - folio_test_lru false
1255 * - folio frozen (refcount of 0)
1265 spin_lock_irqsave(&lruvec->lru_lock, *flags); in folio_lruvec_lock_irqsave()
1272 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1292 lru_size = &mz->lru_zone_size[zid][lru]; in mem_cgroup_update_lru_size()
1310 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1320 unsigned long limit; in mem_cgroup_margin() local
1322 count = page_counter_read(&memcg->memory); in mem_cgroup_margin()
1323 limit = READ_ONCE(memcg->memory.max); in mem_cgroup_margin()
1324 if (count < limit) in mem_cgroup_margin()
1325 margin = limit - count; in mem_cgroup_margin()
1328 count = page_counter_read(&memcg->memsw); in mem_cgroup_margin()
1329 limit = READ_ONCE(memcg->memsw.max); in mem_cgroup_margin()
1330 if (count < limit) in mem_cgroup_margin()
1331 margin = min(margin, limit - count); in mem_cgroup_margin()
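
From the lines above, the margin is the room left under memory.max, further clamped by the memory+swap counter when that is in use. A small userspace model of the calculation (the page counts are made-up example values):

#include <stdio.h>

static unsigned long margin(unsigned long count, unsigned long limit)
{
	return count < limit ? limit - count : 0;
}

int main(void)
{
	unsigned long mem   = margin(900, 1000);	/* memory: 100 pages of headroom */
	unsigned long memsw = margin(980, 1000);	/* memory+swap: only 20 pages left */

	/* the effective chargeable space is the smaller of the two */
	printf("margin = %lu pages\n", mem < memsw ? mem : memsw);
	return 0;
}
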
1477 * 1) generic big picture -> specifics and details in memcg_stat_format()
1478 * 2) reflecting userspace activity -> reflecting kernel heuristics in memcg_stat_format()
1480 * Current memory state: in memcg_stat_format()
1537 * @memcg: The memory cgroup that went over limit
1549 pr_cont_cgroup_path(memcg->css.cgroup); in mem_cgroup_print_oom_context()
1562 * @memcg: The memory cgroup that went over limit
1572 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", in mem_cgroup_print_oom_meminfo()
1573 K((u64)page_counter_read(&memcg->memory)), in mem_cgroup_print_oom_meminfo()
1574 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt); in mem_cgroup_print_oom_meminfo()
1576 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n", in mem_cgroup_print_oom_meminfo()
1577 K((u64)page_counter_read(&memcg->swap)), in mem_cgroup_print_oom_meminfo()
1578 K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt); in mem_cgroup_print_oom_meminfo()
1581 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", in mem_cgroup_print_oom_meminfo()
1582 K((u64)page_counter_read(&memcg->memsw)), in mem_cgroup_print_oom_meminfo()
1583 K((u64)memcg->memsw.max), memcg->memsw.failcnt); in mem_cgroup_print_oom_meminfo()
1584 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", in mem_cgroup_print_oom_meminfo()
1585 K((u64)page_counter_read(&memcg->kmem)), in mem_cgroup_print_oom_meminfo()
1586 K((u64)memcg->kmem.max), memcg->kmem.failcnt); in mem_cgroup_print_oom_meminfo()
1591 pr_cont_cgroup_path(memcg->css.cgroup); in mem_cgroup_print_oom_meminfo()
1599 * Return the memory (and swap, if configured) limit for a memcg.
1603 unsigned long max = READ_ONCE(memcg->memory.max); in mem_cgroup_get_max()
1607 /* Calculate swap excess capacity from memsw limit */ in mem_cgroup_get_max()
1608 unsigned long swap = READ_ONCE(memcg->memsw.max) - max; in mem_cgroup_get_max()
1614 max += min(READ_ONCE(memcg->swap.max), in mem_cgroup_get_max()
1622 return page_counter_read(&memcg->memory); in mem_cgroup_size()
1678 * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1680 * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1683 * by killing all of its OOM-killable tasks.
1685 * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1716 * highest-level memory cgroup with oom.group set. in mem_cgroup_get_oom_group()
1719 if (READ_ONCE(memcg->oom_group)) in mem_cgroup_get_oom_group()
1727 css_get(&oom_group->css); in mem_cgroup_get_oom_group()
1737 pr_cont_cgroup_path(memcg->css.cgroup); in mem_cgroup_print_oom_group()
1770 * The charges will only happen if @memcg matches the current cpu's memcg
1789 stock_pages = READ_ONCE(stock->nr_pages); in consume_stock()
1790 if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) { in consume_stock()
1791 WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages); in consume_stock()
1805 unsigned int stock_pages = READ_ONCE(stock->nr_pages); in drain_stock()
1806 struct mem_cgroup *old = READ_ONCE(stock->cached); in drain_stock()
1812 page_counter_uncharge(&old->memory, stock_pages); in drain_stock()
1814 page_counter_uncharge(&old->memsw, stock_pages); in drain_stock()
1816 WRITE_ONCE(stock->nr_pages, 0); in drain_stock()
1819 css_put(&old->css); in drain_stock()
1820 WRITE_ONCE(stock->cached, NULL); in drain_stock()
1839 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); in drain_local_stock()
1855 if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ in __refill_stock()
1857 css_get(&memcg->css); in __refill_stock()
1858 WRITE_ONCE(stock->cached, memcg); in __refill_stock()
1860 stock_pages = READ_ONCE(stock->nr_pages) + nr_pages; in __refill_stock()
1861 WRITE_ONCE(stock->nr_pages, stock_pages); in __refill_stock()
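
The fast path shown above only succeeds when the per-CPU stock already caches the same memcg and holds enough pages; otherwise the caller charges the page counters and the leftover of the batch is used to refill the stock. A compact userspace model of that consume/refill behaviour (a sketch; the struct and the numbers are illustrative, not the kernel's types):

#include <assert.h>
#include <stdbool.h>

struct stock {			/* stand-in for one CPU's memcg_stock */
	int cached_memcg;	/* 0 means "nothing cached" */
	unsigned int nr_pages;
};

static bool consume(struct stock *s, int memcg, unsigned int nr_pages)
{
	if (memcg == s->cached_memcg && s->nr_pages >= nr_pages) {
		s->nr_pages -= nr_pages;	/* hit: serve from the cache */
		return true;
	}
	return false;				/* miss: caller charges the counters */
}

static void refill(struct stock *s, int memcg, unsigned int nr_pages)
{
	if (s->cached_memcg != memcg) {		/* reset if caching another memcg */
		s->cached_memcg = memcg;
		s->nr_pages = 0;
	}
	s->nr_pages += nr_pages;
}

int main(void)
{
	struct stock s = { 0, 0 };

	assert(!consume(&s, 1, 4));	/* cold stock: miss */
	refill(&s, 1, 32);		/* leftover from a batched charge */
	assert(consume(&s, 1, 4));	/* same memcg, enough pages: hit */
	assert(!consume(&s, 2, 4));	/* different memcg: miss */
	return 0;
}
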
1877 * Drains all per-CPU charge caches for given root_memcg resp. subtree
1888 * Notify other cpus that system-wide "drain" is running in drain_all_stock()
1891 * per-cpu data. CPU up doesn't touch memcg_stock at all. in drain_all_stock()
1901 memcg = READ_ONCE(stock->cached); in drain_all_stock()
1902 if (memcg && READ_ONCE(stock->nr_pages) && in drain_all_stock()
1910 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { in drain_all_stock()
1912 drain_local_stock(&stock->work); in drain_all_stock()
1914 schedule_work_on(cpu, &stock->work); in drain_all_stock()
1949 if (page_counter_read(&memcg->memory) <= in reclaim_high()
1950 READ_ONCE(memcg->memory.high)) in reclaim_high()
1987 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
1989 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
1994 * reasonable delay curve compared to precision-adjusted overage, not
1996 * limit penalises misbehaving cgroups by slowing them down exponentially. For
1999 * +-------+------------------------+
2001 * +-------+------------------------+
2023 * +-------+------------------------+
2041 overage = usage - high; in calculate_overage()
2051 overage = calculate_overage(page_counter_read(&memcg->memory), in mem_find_max_overage()
2052 READ_ONCE(memcg->memory.high)); in mem_find_max_overage()
2065 overage = calculate_overage(page_counter_read(&memcg->swap), in swap_find_max_overage()
2066 READ_ONCE(memcg->swap.high)); in swap_find_max_overage()
2093 * memcg in question makes it clear that it has no intention of stopping in calculate_high_delay()
2103 * N-sized allocations are throttled approximately the same as one in calculate_high_delay()
2104 * 4N-sized allocation. in calculate_high_delay()
2107 * larger the current charge batch is than that. in calculate_high_delay()
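
The fragments above describe how the penalty grows with the square of the precision-adjusted overage beyond memory.high and is then scaled against MEMCG_CHARGE_BATCH. Below is a userspace model of that curve; the shift values and HZ are assumptions (their definitions are not among these matches) and the usage/high figures are illustrative only:

#include <stdio.h>

#define PAGE_SHIFT	12
#define HZ		1000ULL		/* assume 1000 ticks/s, so jiffies == ms */
#define PRECISION_SHIFT	20		/* assumed MEMCG_DELAY_PRECISION_SHIFT */
#define SCALING_SHIFT	14		/* assumed MEMCG_DELAY_SCALING_SHIFT */

/* quadratic penalty in jiffies for usage/high given in MiB */
static unsigned long long penalty_ms(unsigned long long usage_mb,
				     unsigned long long high_mb)
{
	unsigned long long usage = usage_mb << (20 - PAGE_SHIFT);	/* pages */
	unsigned long long high = high_mb << (20 - PAGE_SHIFT);
	unsigned long long overage;

	if (usage <= high)
		return 0;
	/* fixed-point overage ratio, as in calculate_overage() */
	overage = ((usage - high) << PRECISION_SHIFT) / high;
	/* square it, convert to jiffies, then scale back down */
	return (overage * overage * HZ) >> (PRECISION_SHIFT + SCALING_SHIFT);
}

int main(void)
{
	/* small overages stay cheap, larger ones are penalised much harder */
	printf("101M over a 100M high: ~%llu ms\n", penalty_ms(101, 100));
	printf("110M over a 100M high: ~%llu ms\n", penalty_ms(110, 100));
	return 0;
}
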
2113 * Reclaims memory over the high limit. Called directly from
2122 unsigned int nr_pages = current->memcg_nr_pages_over_high; in mem_cgroup_handle_over_high()
2130 memcg = get_mem_cgroup_from_mm(current->mm); in mem_cgroup_handle_over_high()
2131 current->memcg_nr_pages_over_high = 0; in mem_cgroup_handle_over_high()
2136 * memory.high enforcement isn't as strict, and there is no in mem_cgroup_handle_over_high()
2189 if (nr_reclaimed || nr_retries--) { in mem_cgroup_handle_over_high()
2195 * Reclaim didn't manage to push usage below the limit, slow in mem_cgroup_handle_over_high()
2200 * need to account for any ill-begotten jiffies to pay them off later. in mem_cgroup_handle_over_high()
2207 css_put(&memcg->css); in mem_cgroup_handle_over_high()
2229 page_counter_try_charge(&memcg->memsw, batch, &counter)) { in try_charge_memcg()
2230 if (page_counter_try_charge(&memcg->memory, batch, &counter)) in try_charge_memcg()
2233 page_counter_uncharge(&memcg->memsw, batch); in try_charge_memcg()
2249 * under the limit over triggering OOM kills in these cases. in try_charge_memcg()
2251 if (unlikely(current->flags & PF_MEMALLOC)) in try_charge_memcg()
2254 if (unlikely(task_in_memcg_oom(current))) in try_charge_memcg()
2280 * Even though the limit is exceeded at this point, reclaim in try_charge_memcg()
2285 * unlikely to succeed so close to the limit, and we fall back in try_charge_memcg()
2291 if (nr_retries--) in try_charge_memcg()
2320 return -ENOMEM; in try_charge_memcg()
2331 * being freed very soon. Allow memory usage to go over the limit in try_charge_memcg()
2334 page_counter_charge(&memcg->memory, nr_pages); in try_charge_memcg()
2336 page_counter_charge(&memcg->memsw, nr_pages); in try_charge_memcg()
2342 refill_stock(memcg, batch - nr_pages); in try_charge_memcg()
2349 * not recorded as it most likely matches current's and won't in try_charge_memcg()
2350 * change in the meantime. As high limit is checked again before in try_charge_memcg()
2356 mem_high = page_counter_read(&memcg->memory) > in try_charge_memcg()
2357 READ_ONCE(memcg->memory.high); in try_charge_memcg()
2358 swap_high = page_counter_read(&memcg->swap) > in try_charge_memcg()
2359 READ_ONCE(memcg->swap.high); in try_charge_memcg()
2364 schedule_work(&memcg->high_work); in try_charge_memcg()
2376 * Target some best-effort fairness between the tasks, in try_charge_memcg()
2380 current->memcg_nr_pages_over_high += batch; in try_charge_memcg()
2381 set_notify_resume(current); in try_charge_memcg()
2393 if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && in try_charge_memcg()
2394 !(current->flags & PF_MEMALLOC) && in try_charge_memcg()
2406 * - the page lock in commit_charge()
2407 * - LRU isolation in commit_charge()
2408 * - exclusive reference in commit_charge()
2410 folio->memcg_data = (unsigned long)memcg; in commit_charge()
2431 * Slab objects are accounted individually, not per-page. in mem_cgroup_from_obj_folio()
2433 * slab->obj_exts. in mem_cgroup_from_obj_folio()
2445 off = obj_to_index(slab->slab_cache, slab, p); in mem_cgroup_from_obj_folio()
2455 * slab->obj_exts has not been freed yet in mem_cgroup_from_obj_folio()
2484 objcg = rcu_dereference(memcg->objcg); in __get_obj_cgroup_from_memcg()
2499 old = xchg(&current->objcg, NULL); in current_objcg_update()
2508 /* If new objcg is NULL, no reason for the second atomic update. */ in current_objcg_update()
2509 if (!current->mm || (current->flags & PF_KTHREAD)) in current_objcg_update()
2522 * Obtain the new objcg pointer. The current task can be in current_objcg_update()
2529 memcg = mem_cgroup_from_task(current); in current_objcg_update()
2538 } while (!try_cmpxchg(&current->objcg, &old, objcg)); in current_objcg_update()
2549 memcg = current->active_memcg; in current_obj_cgroup()
2553 objcg = READ_ONCE(current->objcg); in current_obj_cgroup()
2558 * to use the objcg by the current task. in current_obj_cgroup()
2578 objcg = rcu_dereference_check(memcg->objcg, 1); in current_obj_cgroup()
2622 mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); in obj_cgroup_uncharge_pages()
2623 memcg1_account_kmem(memcg, -nr_pages); in obj_cgroup_uncharge_pages()
2626 css_put(&memcg->css); in obj_cgroup_uncharge_pages()
2652 css_put(&memcg->css); in obj_cgroup_charge_pages()
2658 * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
2675 page->memcg_data = (unsigned long)objcg | in __memcg_kmem_charge_page()
2699 folio->memcg_data = 0; in __memcg_kmem_uncharge_page()
2719 if (READ_ONCE(stock->cached_objcg) != objcg) { in mod_objcg_state()
2722 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) in mod_objcg_state()
2723 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; in mod_objcg_state()
2724 WRITE_ONCE(stock->cached_objcg, objcg); in mod_objcg_state()
2725 stock->cached_pgdat = pgdat; in mod_objcg_state()
2726 } else if (stock->cached_pgdat != pgdat) { in mod_objcg_state()
2728 struct pglist_data *oldpg = stock->cached_pgdat; in mod_objcg_state()
2730 if (stock->nr_slab_reclaimable_b) { in mod_objcg_state()
2732 stock->nr_slab_reclaimable_b); in mod_objcg_state()
2733 stock->nr_slab_reclaimable_b = 0; in mod_objcg_state()
2735 if (stock->nr_slab_unreclaimable_b) { in mod_objcg_state()
2737 stock->nr_slab_unreclaimable_b); in mod_objcg_state()
2738 stock->nr_slab_unreclaimable_b = 0; in mod_objcg_state()
2740 stock->cached_pgdat = pgdat; in mod_objcg_state()
2743 bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b in mod_objcg_state()
2744 : &stock->nr_slab_unreclaimable_b; in mod_objcg_state()
2777 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { in consume_obj_stock()
2778 stock->nr_bytes -= nr_bytes; in consume_obj_stock()
2789 struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); in drain_obj_stock()
2794 if (stock->nr_bytes) { in drain_obj_stock()
2795 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; in drain_obj_stock()
2796 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); in drain_obj_stock()
2803 mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); in drain_obj_stock()
2804 memcg1_account_kmem(memcg, -nr_pages); in drain_obj_stock()
2807 css_put(&memcg->css); in drain_obj_stock()
2811 * The leftover is flushed to the centralized per-memcg value. in drain_obj_stock()
2813 * to a per-cpu stock (probably, on another CPU), see in drain_obj_stock()
2816 * How often it's flushed is a trade-off between the memory in drain_obj_stock()
2817 * limit enforcement accuracy and potential CPU contention, in drain_obj_stock()
2820 atomic_add(nr_bytes, &old->nr_charged_bytes); in drain_obj_stock()
2821 stock->nr_bytes = 0; in drain_obj_stock()
2825 * Flush the vmstat data in current stock in drain_obj_stock()
2827 if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { in drain_obj_stock()
2828 if (stock->nr_slab_reclaimable_b) { in drain_obj_stock()
2829 __mod_objcg_mlstate(old, stock->cached_pgdat, in drain_obj_stock()
2831 stock->nr_slab_reclaimable_b); in drain_obj_stock()
2832 stock->nr_slab_reclaimable_b = 0; in drain_obj_stock()
2834 if (stock->nr_slab_unreclaimable_b) { in drain_obj_stock()
2835 __mod_objcg_mlstate(old, stock->cached_pgdat, in drain_obj_stock()
2837 stock->nr_slab_unreclaimable_b); in drain_obj_stock()
2838 stock->nr_slab_unreclaimable_b = 0; in drain_obj_stock()
2840 stock->cached_pgdat = NULL; in drain_obj_stock()
2843 WRITE_ONCE(stock->cached_objcg, NULL); in drain_obj_stock()
2854 struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg); in obj_stock_flush_required()
2877 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ in refill_obj_stock()
2880 WRITE_ONCE(stock->cached_objcg, objcg); in refill_obj_stock()
2881 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) in refill_obj_stock()
2882 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; in refill_obj_stock()
2885 stock->nr_bytes += nr_bytes; in refill_obj_stock()
2887 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { in refill_obj_stock()
2888 nr_pages = stock->nr_bytes >> PAGE_SHIFT; in refill_obj_stock()
2889 stock->nr_bytes &= (PAGE_SIZE - 1); in refill_obj_stock()
2908 * In theory, objcg->nr_charged_bytes can have enough in obj_cgroup_charge()
2909 * pre-charged bytes to satisfy the allocation. However, in obj_cgroup_charge()
2910 * flushing objcg->nr_charged_bytes requires two atomic in obj_cgroup_charge()
2911 * operations, and objcg->nr_charged_bytes can't be big. in obj_cgroup_charge()
2912 * The shared objcg->nr_charged_bytes can also become a in obj_cgroup_charge()
2916 * objcg->nr_charged_bytes later on when objcg changes. in obj_cgroup_charge()
2918 * The stock's nr_bytes may contain enough pre-charged bytes in obj_cgroup_charge()
2920 * on the pre-charged bytes not being changed outside of in obj_cgroup_charge()
2922 * pre-charged bytes as well when charging pages. To avoid a in obj_cgroup_charge()
2925 * to temporarily allow the pre-charged bytes to exceed the page in obj_cgroup_charge()
2926 * size limit. The maximum reachable value of the pre-charged in obj_cgroup_charge()
2927 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data in obj_cgroup_charge()
2931 nr_bytes = size & (PAGE_SIZE - 1); in obj_cgroup_charge()
2938 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); in obj_cgroup_charge()
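
The fragments above show how a sub-page object charge is handled: whole pages are charged to the page counters, and the unused remainder of the last page is parked in the objcg stock as pre-charged bytes rather than handed back immediately. A tiny userspace model of that split (PAGE_SIZE and the 700-byte object are illustrative values):

#include <assert.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12

int main(void)
{
	unsigned long size = 700;				/* example object size */
	unsigned long nr_pages = size >> PAGE_SHIFT;		/* whole pages in the request */
	unsigned long nr_bytes = size & (PAGE_SIZE - 1);	/* sub-page remainder */

	if (nr_bytes)
		nr_pages += 1;					/* round the charge up to a full page */

	/* the unused part of that last page goes back into the stock */
	unsigned long refill = nr_bytes ? PAGE_SIZE - nr_bytes : 0;

	assert(nr_pages == 1 && refill == PAGE_SIZE - 700);
	assert(nr_pages * PAGE_SIZE == size + refill);		/* nothing is lost or double-charged */
	return 0;
}
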
2954 return s->size + sizeof(struct obj_cgroup *); in obj_full_size()
2966 * The obtained objcg pointer is safe to use within the current scope, in __memcg_slab_post_alloc_hook()
2967 * defined by current task or set_active_memcg() pair. in __memcg_slab_post_alloc_hook()
2991 css_put(&memcg->css); in __memcg_slab_post_alloc_hook()
3034 -obj_full_size(s)); in __memcg_slab_free_hook()
3053 folio_page(folio, i)->memcg_data = folio->memcg_data; in split_page_memcg()
3056 obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); in split_page_memcg()
3058 css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1); in split_page_memcg()
3073 val += total_swap_pages - get_nr_swap_pages(); in mem_cgroup_usage()
3076 val = page_counter_read(&memcg->memory); in mem_cgroup_usage()
3078 val = page_counter_read(&memcg->memsw); in mem_cgroup_usage()
3095 return -ENOMEM; in memcg_online_kmem()
3097 objcg->memcg = memcg; in memcg_online_kmem()
3098 rcu_assign_pointer(memcg->objcg, objcg); in memcg_online_kmem()
3100 memcg->orig_objcg = objcg; in memcg_online_kmem()
3104 memcg->kmemcg_id = memcg->id.id; in memcg_online_kmem()
3138 return wb_domain_init(&memcg->cgwb_domain, gfp); in memcg_wb_domain_init()
3143 wb_domain_exit(&memcg->cgwb_domain); in memcg_wb_domain_exit()
3148 wb_domain_size_changed(&memcg->cgwb_domain); in memcg_wb_domain_size_changed()
3153 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); in mem_cgroup_wb_domain()
3155 if (!memcg->css.parent) in mem_cgroup_wb_domain()
3158 return &memcg->cgwb_domain; in mem_cgroup_wb_domain()
3162 * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3170 * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
3173 * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
3183 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); in mem_cgroup_wb_stats()
3195 unsigned long ceiling = min(READ_ONCE(memcg->memory.max), in mem_cgroup_wb_stats()
3196 READ_ONCE(memcg->memory.high)); in mem_cgroup_wb_stats()
3197 unsigned long used = page_counter_read(&memcg->memory); in mem_cgroup_wb_stats()
3199 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); in mem_cgroup_wb_stats()
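
The headroom computed above is hierarchical: each level contributes "min(max, high) - used", and the walk keeps the minimum, so the most constrained ancestor wins. A small userspace model of that walk over made-up parent/child figures:

#include <stdio.h>

#define PAGE_COUNTER_MAX (~0UL)

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* per level: min(max, high) and usage, in pages; example values only */
	unsigned long ceiling[] = { 25600, 51200 };	/* child: 100M, parent: 200M */
	unsigned long used[]    = { 12800, 48640 };	/* child:  50M, parent: 190M */
	unsigned long headroom = PAGE_COUNTER_MAX;

	for (int i = 0; i < 2; i++)			/* walk from the wb's memcg upwards */
		headroom = min_ul(headroom,
				  ceiling[i] - min_ul(ceiling[i], used[i]));

	/* the parent is the tighter constraint: 10M despite the child's 50M */
	printf("headroom = %lu pages\n", headroom);
	return 0;
}
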
3208 * tracks ownership per-page while the latter per-inode. This was a
3209 * deliberate design decision because honoring per-page ownership in the
3211 * and deemed unnecessary given that write-sharing an inode across
3212 * different cgroups isn't a common use-case.
3214 * Combined with inode majority-writer ownership switching, this works well
3235 * page - a page whose memcg and writeback ownerships don't match - is
3241 * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
3255 int oldest = -1; in mem_cgroup_track_foreign_dirty_slowpath()
3266 frn = &memcg->cgwb_frn[i]; in mem_cgroup_track_foreign_dirty_slowpath()
3267 if (frn->bdi_id == wb->bdi->id && in mem_cgroup_track_foreign_dirty_slowpath()
3268 frn->memcg_id == wb->memcg_css->id) in mem_cgroup_track_foreign_dirty_slowpath()
3270 if (time_before64(frn->at, oldest_at) && in mem_cgroup_track_foreign_dirty_slowpath()
3271 atomic_read(&frn->done.cnt) == 1) { in mem_cgroup_track_foreign_dirty_slowpath()
3273 oldest_at = frn->at; in mem_cgroup_track_foreign_dirty_slowpath()
3279 * Re-using an existing one. Update timestamp lazily to in mem_cgroup_track_foreign_dirty_slowpath()
3281 * reasonably up-to-date and significantly shorter than in mem_cgroup_track_foreign_dirty_slowpath()
3289 if (time_before64(frn->at, now - update_intv)) in mem_cgroup_track_foreign_dirty_slowpath()
3290 frn->at = now; in mem_cgroup_track_foreign_dirty_slowpath()
3293 frn = &memcg->cgwb_frn[oldest]; in mem_cgroup_track_foreign_dirty_slowpath()
3294 frn->bdi_id = wb->bdi->id; in mem_cgroup_track_foreign_dirty_slowpath()
3295 frn->memcg_id = wb->memcg_css->id; in mem_cgroup_track_foreign_dirty_slowpath()
3296 frn->at = now; in mem_cgroup_track_foreign_dirty_slowpath()
3303 struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); in mem_cgroup_flush_foreign()
3309 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; in mem_cgroup_flush_foreign()
3313 * writeback on it has already started. No need to kick it in mem_cgroup_flush_foreign()
3317 if (time_after64(frn->at, now - intv) && in mem_cgroup_flush_foreign()
3318 atomic_read(&frn->done.cnt) == 1) { in mem_cgroup_flush_foreign()
3319 frn->at = 0; in mem_cgroup_flush_foreign()
3320 trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); in mem_cgroup_flush_foreign()
3321 cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, in mem_cgroup_flush_foreign()
3323 &frn->done); in mem_cgroup_flush_foreign()
3348 * Swap-out records and page cache shadow entries need to store memcg
3351 * memory-controlled cgroups to 64k.
3358 * even when there are much fewer than 64k cgroups - possibly none.
3360 * Maintain a private 16-bit ID space for memcg, and allow the ID to
3361 * be freed and recycled when it's no longer needed, which is usually
3369 #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
3374 if (memcg->id.id > 0) { in mem_cgroup_id_remove()
3375 xa_erase(&mem_cgroup_ids, memcg->id.id); in mem_cgroup_id_remove()
3376 memcg->id.id = 0; in mem_cgroup_id_remove()
3383 refcount_add(n, &memcg->id.ref); in mem_cgroup_id_get_many()
3388 if (refcount_sub_and_test(n, &memcg->id.ref)) { in mem_cgroup_id_put_many()
3392 css_put(&memcg->css); in mem_cgroup_id_put_many()
3402 * mem_cgroup_from_id - look up a memcg from a memcg id
3428 memcg = ERR_PTR(-ENOENT); in mem_cgroup_get_from_ino()
3444 pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), in alloc_mem_cgroup_per_node_info()
3446 if (!pn->lruvec_stats) in alloc_mem_cgroup_per_node_info()
3449 pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, in alloc_mem_cgroup_per_node_info()
3451 if (!pn->lruvec_stats_percpu) in alloc_mem_cgroup_per_node_info()
3454 lruvec_init(&pn->lruvec); in alloc_mem_cgroup_per_node_info()
3455 pn->memcg = memcg; in alloc_mem_cgroup_per_node_info()
3457 memcg->nodeinfo[node] = pn; in alloc_mem_cgroup_per_node_info()
3460 kfree(pn->lruvec_stats); in alloc_mem_cgroup_per_node_info()
3467 struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; in free_mem_cgroup_per_node_info()
3472 free_percpu(pn->lruvec_stats_percpu); in free_mem_cgroup_per_node_info()
3473 kfree(pn->lruvec_stats); in free_mem_cgroup_per_node_info()
3481 obj_cgroup_put(memcg->orig_objcg); in __mem_cgroup_free()
3486 kfree(memcg->vmstats); in __mem_cgroup_free()
3487 free_percpu(memcg->vmstats_percpu); in __mem_cgroup_free()
3508 return ERR_PTR(-ENOMEM); in mem_cgroup_alloc()
3510 error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, in mem_cgroup_alloc()
3514 error = -ENOMEM; in mem_cgroup_alloc()
3516 memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), in mem_cgroup_alloc()
3518 if (!memcg->vmstats) in mem_cgroup_alloc()
3521 memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, in mem_cgroup_alloc()
3523 if (!memcg->vmstats_percpu) in mem_cgroup_alloc()
3531 pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); in mem_cgroup_alloc()
3532 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); in mem_cgroup_alloc()
3533 statc->parent = parent ? pstatc : NULL; in mem_cgroup_alloc()
3534 statc->vmstats = memcg->vmstats; in mem_cgroup_alloc()
3544 INIT_WORK(&memcg->high_work, high_work_func); in mem_cgroup_alloc()
3545 vmpressure_init(&memcg->vmpressure); in mem_cgroup_alloc()
3546 INIT_LIST_HEAD(&memcg->memory_peaks); in mem_cgroup_alloc()
3547 INIT_LIST_HEAD(&memcg->swap_peaks); in mem_cgroup_alloc()
3548 spin_lock_init(&memcg->peaks_lock); in mem_cgroup_alloc()
3549 memcg->socket_pressure = jiffies; in mem_cgroup_alloc()
3551 memcg->kmemcg_id = -1; in mem_cgroup_alloc()
3552 INIT_LIST_HEAD(&memcg->objcg_list); in mem_cgroup_alloc()
3554 INIT_LIST_HEAD(&memcg->cgwb_list); in mem_cgroup_alloc()
3556 memcg->cgwb_frn[i].done = in mem_cgroup_alloc()
3560 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); in mem_cgroup_alloc()
3561 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); in mem_cgroup_alloc()
3562 memcg->deferred_split_queue.split_queue_len = 0; in mem_cgroup_alloc()
3584 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); in mem_cgroup_css_alloc()
3587 memcg->zswap_max = PAGE_COUNTER_MAX; in mem_cgroup_css_alloc()
3588 WRITE_ONCE(memcg->zswap_writeback, true); in mem_cgroup_css_alloc()
3590 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); in mem_cgroup_css_alloc()
3592 WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); in mem_cgroup_css_alloc()
3594 page_counter_init(&memcg->memory, &parent->memory, true); in mem_cgroup_css_alloc()
3595 page_counter_init(&memcg->swap, &parent->swap, false); in mem_cgroup_css_alloc()
3597 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); in mem_cgroup_css_alloc()
3598 page_counter_init(&memcg->kmem, &parent->kmem, false); in mem_cgroup_css_alloc()
3599 page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); in mem_cgroup_css_alloc()
3604 page_counter_init(&memcg->memory, NULL, true); in mem_cgroup_css_alloc()
3605 page_counter_init(&memcg->swap, NULL, false); in mem_cgroup_css_alloc()
3607 page_counter_init(&memcg->kmem, NULL, false); in mem_cgroup_css_alloc()
3608 page_counter_init(&memcg->tcpmem, NULL, false); in mem_cgroup_css_alloc()
3611 return &memcg->css; in mem_cgroup_css_alloc()
3620 return &memcg->css; in mem_cgroup_css_alloc()
3644 refcount_set(&memcg->id.ref, 1); in mem_cgroup_css_online()
3651 * css_tryget_online(). But right now there are no users that in mem_cgroup_css_online()
3657 xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); in mem_cgroup_css_online()
3664 return -ENOMEM; in mem_cgroup_css_online()
3673 page_counter_set_min(&memcg->memory, 0); in mem_cgroup_css_offline()
3674 page_counter_set_low(&memcg->memory, 0); in mem_cgroup_css_offline()
3703 wb_wait_for_completion(&memcg->cgwb_frn[i].done); in mem_cgroup_css_free()
3714 vmpressure_cleanup(&memcg->vmpressure); in mem_cgroup_css_free()
3715 cancel_work_sync(&memcg->high_work); in mem_cgroup_css_free()
3722 * mem_cgroup_css_reset - reset the states of a mem_cgroup
3731 * The current implementation only resets the essential configurations.
3738 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3739 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3741 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3742 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3744 page_counter_set_min(&memcg->memory, 0); in mem_cgroup_css_reset()
3745 page_counter_set_low(&memcg->memory, 0); in mem_cgroup_css_reset()
3746 page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3748 page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); in mem_cgroup_css_reset()
3755 /* pointer to the non-hierarchical (CPU aggregated) counters */
3774 for (i = 0; i < ac->size; i++) { in mem_cgroup_stat_aggregate()
3777 * below us. We're in a per-cpu loop here and this is in mem_cgroup_stat_aggregate()
3780 delta = ac->pending[i]; in mem_cgroup_stat_aggregate()
3782 ac->pending[i] = 0; in mem_cgroup_stat_aggregate()
3786 v = READ_ONCE(ac->cstat[i]); in mem_cgroup_stat_aggregate()
3787 if (v != ac->cstat_prev[i]) { in mem_cgroup_stat_aggregate()
3788 delta_cpu = v - ac->cstat_prev[i]; in mem_cgroup_stat_aggregate()
3790 ac->cstat_prev[i] = v; in mem_cgroup_stat_aggregate()
3795 ac->local[i] += delta_cpu; in mem_cgroup_stat_aggregate()
3798 ac->aggregate[i] += delta; in mem_cgroup_stat_aggregate()
3799 if (ac->ppending) in mem_cgroup_stat_aggregate()
3800 ac->ppending[i] += delta; in mem_cgroup_stat_aggregate()
3813 statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); in mem_cgroup_css_rstat_flush()
3816 .aggregate = memcg->vmstats->state, in mem_cgroup_css_rstat_flush()
3817 .local = memcg->vmstats->state_local, in mem_cgroup_css_rstat_flush()
3818 .pending = memcg->vmstats->state_pending, in mem_cgroup_css_rstat_flush()
3819 .ppending = parent ? parent->vmstats->state_pending : NULL, in mem_cgroup_css_rstat_flush()
3820 .cstat = statc->state, in mem_cgroup_css_rstat_flush()
3821 .cstat_prev = statc->state_prev, in mem_cgroup_css_rstat_flush()
3827 .aggregate = memcg->vmstats->events, in mem_cgroup_css_rstat_flush()
3828 .local = memcg->vmstats->events_local, in mem_cgroup_css_rstat_flush()
3829 .pending = memcg->vmstats->events_pending, in mem_cgroup_css_rstat_flush()
3830 .ppending = parent ? parent->vmstats->events_pending : NULL, in mem_cgroup_css_rstat_flush()
3831 .cstat = statc->events, in mem_cgroup_css_rstat_flush()
3832 .cstat_prev = statc->events_prev, in mem_cgroup_css_rstat_flush()
3838 struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; in mem_cgroup_css_rstat_flush()
3839 struct lruvec_stats *lstats = pn->lruvec_stats; in mem_cgroup_css_rstat_flush()
3844 plstats = parent->nodeinfo[nid]->lruvec_stats; in mem_cgroup_css_rstat_flush()
3846 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); in mem_cgroup_css_rstat_flush()
3849 .aggregate = lstats->state, in mem_cgroup_css_rstat_flush()
3850 .local = lstats->state_local, in mem_cgroup_css_rstat_flush()
3851 .pending = lstats->state_pending, in mem_cgroup_css_rstat_flush()
3852 .ppending = plstats ? plstats->state_pending : NULL, in mem_cgroup_css_rstat_flush()
3853 .cstat = lstatc->state, in mem_cgroup_css_rstat_flush()
3854 .cstat_prev = lstatc->state_prev, in mem_cgroup_css_rstat_flush()
3860 WRITE_ONCE(statc->stats_updates, 0); in mem_cgroup_css_rstat_flush()
3861 /* We are in a per-cpu loop here, only do the atomic write once */ in mem_cgroup_css_rstat_flush()
3862 if (atomic64_read(&memcg->vmstats->stats_updates)) in mem_cgroup_css_rstat_flush()
3863 atomic64_set(&memcg->vmstats->stats_updates, 0); in mem_cgroup_css_rstat_flush()
3869 * Set the update flag to cause task->objcg to be initialized lazily in mem_cgroup_fork()
3871 * because it's always performed on the current task, so does in mem_cgroup_fork()
3874 task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG; in mem_cgroup_fork()
3879 struct obj_cgroup *objcg = task->objcg; in mem_cgroup_exit()
3888 * because it's always performed on the current task, so does in mem_cgroup_exit()
3891 task->objcg = NULL; in mem_cgroup_exit()
3908 if (task->mm && READ_ONCE(task->mm->owner) == task) in mem_cgroup_lru_gen_attach()
3909 lru_gen_migrate_mm(task->mm); in mem_cgroup_lru_gen_attach()
3923 set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); in mem_cgroup_kmem_attach()
3948 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; in memory_current_read()
3951 #define OFP_PEAK_UNSET (((-1UL)))
3955 struct cgroup_of_peak *ofp = of_peak(sf->private); in peak_show()
3956 u64 fd_peak = READ_ONCE(ofp->value), peak; in peak_show()
3960 peak = pc->watermark; in peak_show()
3962 peak = max(fd_peak, READ_ONCE(pc->local_watermark)); in peak_show()
3972 return peak_show(sf, v, &memcg->memory); in memory_peak_show()
3979 ofp->value = OFP_PEAK_UNSET; in peak_open()
3988 if (ofp->value == OFP_PEAK_UNSET) { in peak_release()
3989 /* fast path (no writes on this fd) */ in peak_release()
3992 spin_lock(&memcg->peaks_lock); in peak_release()
3993 list_del(&ofp->list); in peak_release()
3994 spin_unlock(&memcg->peaks_lock); in peak_release()
4006 spin_lock(&memcg->peaks_lock); in peak_write()
4009 WRITE_ONCE(pc->local_watermark, usage); in peak_write()
4012 if (usage > peer_ctx->value) in peak_write()
4013 WRITE_ONCE(peer_ctx->value, usage); in peak_write()
4016 if (ofp->value == -1) in peak_write()
4017 list_add(&ofp->list, watchers); in peak_write()
4019 WRITE_ONCE(ofp->value, usage); in peak_write()
4020 spin_unlock(&memcg->peaks_lock); in peak_write()
4030 return peak_write(of, buf, nbytes, off, &memcg->memory, in memory_peak_write()
4031 &memcg->memory_peaks); in memory_peak_write()
4039 READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); in memory_min_show()
4054 page_counter_set_min(&memcg->memory, min); in memory_min_write()
4062 READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); in memory_low_show()
4077 page_counter_set_low(&memcg->memory, low); in memory_low_write()
4085 READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); in memory_high_show()
4102 page_counter_set_high(&memcg->memory, high); in memory_high_write()
4105 unsigned long nr_pages = page_counter_read(&memcg->memory); in memory_high_write()
4111 if (signal_pending(current)) in memory_high_write()
4120 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, in memory_high_write()
4123 if (!reclaimed && !nr_retries--) in memory_high_write()
4134 READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); in memory_max_show()
4151 xchg(&memcg->memory.max, max); in memory_max_write()
4154 unsigned long nr_pages = page_counter_read(&memcg->memory); in memory_max_write()
4159 if (signal_pending(current)) in memory_max_write()
4169 if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, in memory_max_write()
4171 nr_reclaims--; in memory_max_write()
4205 __memory_events_show(m, memcg->memory_events); in memory_events_show()
4213 __memory_events_show(m, memcg->memory_events_local); in memory_events_local_show()
4224 return -ENOMEM; in memory_stat_show()
4274 seq_printf(m, "%d\n", READ_ONCE(memcg->oom_group)); in memory_oom_group_show()
4287 return -EINVAL; in memory_oom_group_write()
4294 return -EINVAL; in memory_oom_group_write()
4296 WRITE_ONCE(memcg->oom_group, oom_group); in memory_oom_group_write()
4317 int swappiness = -1; in memory_reclaim()
4327 return -EINVAL; in memory_reclaim()
4337 return -EINVAL; in memory_reclaim()
4339 return -EINVAL; in memory_reclaim()
4342 return -EINVAL; in memory_reclaim()
4349 unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; in memory_reclaim()
4352 if (signal_pending(current)) in memory_reclaim()
4353 return -EINTR; in memory_reclaim()
4366 swappiness == -1 ? NULL : &swappiness); in memory_reclaim()
4368 if (!reclaimed && !nr_retries--) in memory_reclaim()
4369 return -EAGAIN; in memory_reclaim()
4379 .name = "current",
4470 * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
4471 * @root: the top ancestor of the sub-tree being checked
4475 * of a top-down tree iteration, not for isolated queries.
4489 page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection); in mem_cgroup_calculate_protection()
4501 css_get(&memcg->css); in charge_memcg()
4515 css_put(&memcg->css); in __mem_cgroup_charge()
4521 * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
4539 * system-level stats via lruvec_stat_mod_folio. Return 0, and skip in mem_cgroup_charge_hugetlb()
4547 ret = -ENOMEM; in mem_cgroup_charge_hugetlb()
4555 * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
4579 if (!memcg || !css_tryget_online(&memcg->css)) in mem_cgroup_swapin_charge_folio()
4585 css_put(&memcg->css); in mem_cgroup_swapin_charge_folio()
4590 * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
4609 * so this is a non-issue here. Memory and swap charge lifetimes in mem_cgroup_swapin_uncharge_swap()
4638 if (ug->nr_memory) { in uncharge_batch()
4639 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); in uncharge_batch()
4641 page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); in uncharge_batch()
4642 if (ug->nr_kmem) { in uncharge_batch()
4643 mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); in uncharge_batch()
4644 memcg1_account_kmem(ug->memcg, -ug->nr_kmem); in uncharge_batch()
4646 memcg1_oom_recover(ug->memcg); in uncharge_batch()
4649 memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); in uncharge_batch()
4652 css_put(&ug->memcg->css); in uncharge_batch()
4682 if (ug->memcg != memcg) { in uncharge_folio()
4683 if (ug->memcg) { in uncharge_folio()
4687 ug->memcg = memcg; in uncharge_folio()
4688 ug->nid = folio_nid(folio); in uncharge_folio()
4691 css_get(&memcg->css); in uncharge_folio()
4697 ug->nr_memory += nr_pages; in uncharge_folio()
4698 ug->nr_kmem += nr_pages; in uncharge_folio()
4700 folio->memcg_data = 0; in uncharge_folio()
4705 ug->nr_memory += nr_pages; in uncharge_folio()
4706 ug->pgpgout++; in uncharge_folio()
4709 folio->memcg_data = 0; in uncharge_folio()
4712 css_put(&memcg->css); in uncharge_folio()
4719 /* Don't touch folio->lru of any random page, pre-check: */ in __mem_cgroup_uncharge()
4734 for (i = 0; i < folios->nr; i++) in __mem_cgroup_uncharge_folios()
4735 uncharge_folio(folios->folios[i], &ug); in __mem_cgroup_uncharge_folios()
4741 * mem_cgroup_replace_folio - Charge a folio's replacement.
4748 * Both folios must be locked, @new->mapping must be set up.
4772 /* Force-charge the new page. The old one will be freed soon */ in mem_cgroup_replace_folio()
4774 page_counter_charge(&memcg->memory, nr_pages); in mem_cgroup_replace_folio()
4776 page_counter_charge(&memcg->memsw, nr_pages); in mem_cgroup_replace_folio()
4779 css_get(&memcg->css); in mem_cgroup_replace_folio()
4785 * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
4793 * Both folios must be locked, @new->mapping must be set up.
4818 /* Transfer the charge and the css ref */ in mem_cgroup_migrate()
4821 /* Warning should never happen, so don't worry about refcount non-0 */ in mem_cgroup_migrate()
4823 old->memcg_data = 0; in mem_cgroup_migrate()
4841 memcg = mem_cgroup_from_task(current); in mem_cgroup_sk_alloc()
4846 if (css_tryget(&memcg->css)) in mem_cgroup_sk_alloc()
4847 sk->sk_memcg = memcg; in mem_cgroup_sk_alloc()
4854 if (sk->sk_memcg) in mem_cgroup_sk_free()
4855 css_put(&sk->sk_memcg->css); in mem_cgroup_sk_free()
4859 * mem_cgroup_charge_skmem - charge socket memory
4865 * @memcg's configured limit, %false if it doesn't.
4882 * mem_cgroup_uncharge_skmem - uncharge socket memory
4893 mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); in mem_cgroup_uncharge_skmem()
4920 * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
4930 * used for per-memcg-per-cpu caching of per-node statistics. In order in mem_cgroup_init()
4940 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, in mem_cgroup_init()
4950 while (!refcount_inc_not_zero(&memcg->id.ref)) { in mem_cgroup_id_get_online()
4967 * mem_cgroup_swapout - transfer a memsw charge to swap
5002 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); in mem_cgroup_swapout()
5008 folio->memcg_data = 0; in mem_cgroup_swapout()
5011 page_counter_uncharge(&memcg->memory, nr_entries); in mem_cgroup_swapout()
5015 page_counter_charge(&swap_memcg->memsw, nr_entries); in mem_cgroup_swapout()
5016 page_counter_uncharge(&memcg->memsw, nr_entries); in mem_cgroup_swapout()
5020 css_put(&memcg->css); in mem_cgroup_swapout()
5024 * __mem_cgroup_try_charge_swap - try charging swap space for a folio
5030 * Returns 0 on success, -ENOMEM on failure.
5055 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { in __mem_cgroup_try_charge_swap()
5059 return -ENOMEM; in __mem_cgroup_try_charge_swap()
5064 mem_cgroup_id_get_many(memcg, nr_pages - 1); in __mem_cgroup_try_charge_swap()
5073 * __mem_cgroup_uncharge_swap - uncharge swap space
5088 page_counter_uncharge(&memcg->memsw, nr_pages); in __mem_cgroup_uncharge_swap()
5090 page_counter_uncharge(&memcg->swap, nr_pages); in __mem_cgroup_uncharge_swap()
5092 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); in __mem_cgroup_uncharge_swap()
5106 READ_ONCE(memcg->swap.max) - in mem_cgroup_get_nr_swap_pages()
5107 page_counter_read(&memcg->swap)); in mem_cgroup_get_nr_swap_pages()
5127 unsigned long usage = page_counter_read(&memcg->swap); in mem_cgroup_swap_full()
5129 if (usage * 2 >= READ_ONCE(memcg->swap.high) || in mem_cgroup_swap_full()
5130 usage * 2 >= READ_ONCE(memcg->swap.max)) in mem_cgroup_swap_full()
5144 "Please report your usecase to linux-mm@kvack.org if you " in setup_swap_account()
5155 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; in swap_current_read()
5162 return peak_show(sf, v, &memcg->swap); in swap_peak_show()
5170 return peak_write(of, buf, nbytes, off, &memcg->swap, in swap_peak_write()
5171 &memcg->swap_peaks); in swap_peak_write()
5177 READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); in swap_high_show()
5192 page_counter_set_high(&memcg->swap, high); in swap_high_write()
5200 READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); in swap_max_show()
5215 xchg(&memcg->swap.max, max); in swap_max_write()
5225 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); in swap_events_show()
5227 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); in swap_events_show()
5229 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); in swap_events_show()
5236 .name = "swap.current",
5271 * obj_cgroup_may_zswap - check if this cgroup can zswap
5274 * Check if the hierarchical zswap limit has been reached.
5278 * once compression has occurred, and this optimistic pre-check avoids
5279 * spending cycles on compression when there is already no room left
5293 unsigned long max = READ_ONCE(memcg->zswap_max); in obj_cgroup_may_zswap()
5316 * obj_cgroup_charge_zswap - charge compression backend memory
5330 VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC)); in obj_cgroup_charge_zswap()
5344 * obj_cgroup_uncharge_zswap - uncharge compression backend memory
5361 mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); in obj_cgroup_uncharge_zswap()
5362 mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); in obj_cgroup_uncharge_zswap()
5373 if (!READ_ONCE(memcg->zswap_writeback)) in mem_cgroup_zswap_writeback_enabled()
5391 READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); in zswap_max_show()
5406 xchg(&memcg->zswap_max, max); in zswap_max_write()
5415 seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback)); in zswap_writeback_show()
5430 return -EINVAL; in zswap_writeback_write()
5432 WRITE_ONCE(memcg->zswap_writeback, zswap_writeback); in zswap_writeback_write()
5438 .name = "zswap.current",