170ddf637SAnton Vorontsov /* 270ddf637SAnton Vorontsov * Linux VM pressure 370ddf637SAnton Vorontsov * 470ddf637SAnton Vorontsov * Copyright 2012 Linaro Ltd. 570ddf637SAnton Vorontsov * Anton Vorontsov <anton.vorontsov@linaro.org> 670ddf637SAnton Vorontsov * 770ddf637SAnton Vorontsov * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, 870ddf637SAnton Vorontsov * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. 970ddf637SAnton Vorontsov * 1070ddf637SAnton Vorontsov * This program is free software; you can redistribute it and/or modify it 1170ddf637SAnton Vorontsov * under the terms of the GNU General Public License version 2 as published 1270ddf637SAnton Vorontsov * by the Free Software Foundation. 1370ddf637SAnton Vorontsov */ 1470ddf637SAnton Vorontsov 1570ddf637SAnton Vorontsov #include <linux/cgroup.h> 1670ddf637SAnton Vorontsov #include <linux/fs.h> 1770ddf637SAnton Vorontsov #include <linux/log2.h> 1870ddf637SAnton Vorontsov #include <linux/sched.h> 1970ddf637SAnton Vorontsov #include <linux/mm.h> 2070ddf637SAnton Vorontsov #include <linux/vmstat.h> 2170ddf637SAnton Vorontsov #include <linux/eventfd.h> 221ff6bbfdSTejun Heo #include <linux/slab.h> 2370ddf637SAnton Vorontsov #include <linux/swap.h> 2470ddf637SAnton Vorontsov #include <linux/printk.h> 2570ddf637SAnton Vorontsov #include <linux/vmpressure.h> 2670ddf637SAnton Vorontsov 2770ddf637SAnton Vorontsov /* 2870ddf637SAnton Vorontsov * The window size (vmpressure_win) is the number of scanned pages before 2970ddf637SAnton Vorontsov * we try to analyze scanned/reclaimed ratio. So the window is used as a 3070ddf637SAnton Vorontsov * rate-limit tunable for the "low" level notification, and also for 3170ddf637SAnton Vorontsov * averaging the ratio for medium/critical levels. Using small window 3270ddf637SAnton Vorontsov * sizes can cause lot of false positives, but too big window size will 3370ddf637SAnton Vorontsov * delay the notifications. 3470ddf637SAnton Vorontsov * 3570ddf637SAnton Vorontsov * As the vmscan reclaimer logic works with chunks which are multiple of 3670ddf637SAnton Vorontsov * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. 3770ddf637SAnton Vorontsov * 3870ddf637SAnton Vorontsov * TODO: Make the window size depend on machine size, as we do for vmstat 3970ddf637SAnton Vorontsov * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). 4070ddf637SAnton Vorontsov */ 4170ddf637SAnton Vorontsov static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; 4270ddf637SAnton Vorontsov 4370ddf637SAnton Vorontsov /* 4470ddf637SAnton Vorontsov * These thresholds are used when we account memory pressure through 4570ddf637SAnton Vorontsov * scanned/reclaimed ratio. The current values were chosen empirically. In 4670ddf637SAnton Vorontsov * essence, they are percents: the higher the value, the more number 4770ddf637SAnton Vorontsov * unsuccessful reclaims there were. 4870ddf637SAnton Vorontsov */ 4970ddf637SAnton Vorontsov static const unsigned int vmpressure_level_med = 60; 5070ddf637SAnton Vorontsov static const unsigned int vmpressure_level_critical = 95; 5170ddf637SAnton Vorontsov 5270ddf637SAnton Vorontsov /* 5370ddf637SAnton Vorontsov * When there are too little pages left to scan, vmpressure() may miss the 5470ddf637SAnton Vorontsov * critical pressure as number of pages will be less than "window size". 5570ddf637SAnton Vorontsov * However, in that case the vmscan priority will raise fast as the 5670ddf637SAnton Vorontsov * reclaimer will try to scan LRUs more deeply. 5770ddf637SAnton Vorontsov * 5870ddf637SAnton Vorontsov * The vmscan logic considers these special priorities: 5970ddf637SAnton Vorontsov * 6070ddf637SAnton Vorontsov * prio == DEF_PRIORITY (12): reclaimer starts with that value 6170ddf637SAnton Vorontsov * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed 6270ddf637SAnton Vorontsov * prio == 0 : close to OOM, kernel scans every page in an lru 6370ddf637SAnton Vorontsov * 6470ddf637SAnton Vorontsov * Any value in this range is acceptable for this tunable (i.e. from 12 to 6570ddf637SAnton Vorontsov * 0). Current value for the vmpressure_level_critical_prio is chosen 6670ddf637SAnton Vorontsov * empirically, but the number, in essence, means that we consider 6770ddf637SAnton Vorontsov * critical level when scanning depth is ~10% of the lru size (vmscan 6870ddf637SAnton Vorontsov * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one 6970ddf637SAnton Vorontsov * eights). 7070ddf637SAnton Vorontsov */ 7170ddf637SAnton Vorontsov static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); 7270ddf637SAnton Vorontsov 7370ddf637SAnton Vorontsov static struct vmpressure *work_to_vmpressure(struct work_struct *work) 7470ddf637SAnton Vorontsov { 7570ddf637SAnton Vorontsov return container_of(work, struct vmpressure, work); 7670ddf637SAnton Vorontsov } 7770ddf637SAnton Vorontsov 7870ddf637SAnton Vorontsov static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 7970ddf637SAnton Vorontsov { 80182446d0STejun Heo struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); 81182446d0STejun Heo struct mem_cgroup *memcg = mem_cgroup_from_css(css); 8270ddf637SAnton Vorontsov 8370ddf637SAnton Vorontsov memcg = parent_mem_cgroup(memcg); 8470ddf637SAnton Vorontsov if (!memcg) 8570ddf637SAnton Vorontsov return NULL; 8670ddf637SAnton Vorontsov return memcg_to_vmpressure(memcg); 8770ddf637SAnton Vorontsov } 8870ddf637SAnton Vorontsov 8970ddf637SAnton Vorontsov enum vmpressure_levels { 9070ddf637SAnton Vorontsov VMPRESSURE_LOW = 0, 9170ddf637SAnton Vorontsov VMPRESSURE_MEDIUM, 9270ddf637SAnton Vorontsov VMPRESSURE_CRITICAL, 9370ddf637SAnton Vorontsov VMPRESSURE_NUM_LEVELS, 9470ddf637SAnton Vorontsov }; 9570ddf637SAnton Vorontsov 9670ddf637SAnton Vorontsov static const char * const vmpressure_str_levels[] = { 9770ddf637SAnton Vorontsov [VMPRESSURE_LOW] = "low", 9870ddf637SAnton Vorontsov [VMPRESSURE_MEDIUM] = "medium", 9970ddf637SAnton Vorontsov [VMPRESSURE_CRITICAL] = "critical", 10070ddf637SAnton Vorontsov }; 10170ddf637SAnton Vorontsov 10270ddf637SAnton Vorontsov static enum vmpressure_levels vmpressure_level(unsigned long pressure) 10370ddf637SAnton Vorontsov { 10470ddf637SAnton Vorontsov if (pressure >= vmpressure_level_critical) 10570ddf637SAnton Vorontsov return VMPRESSURE_CRITICAL; 10670ddf637SAnton Vorontsov else if (pressure >= vmpressure_level_med) 10770ddf637SAnton Vorontsov return VMPRESSURE_MEDIUM; 10870ddf637SAnton Vorontsov return VMPRESSURE_LOW; 10970ddf637SAnton Vorontsov } 11070ddf637SAnton Vorontsov 11170ddf637SAnton Vorontsov static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, 11270ddf637SAnton Vorontsov unsigned long reclaimed) 11370ddf637SAnton Vorontsov { 11470ddf637SAnton Vorontsov unsigned long scale = scanned + reclaimed; 11570ddf637SAnton Vorontsov unsigned long pressure; 11670ddf637SAnton Vorontsov 11770ddf637SAnton Vorontsov /* 11870ddf637SAnton Vorontsov * We calculate the ratio (in percents) of how many pages were 11970ddf637SAnton Vorontsov * scanned vs. reclaimed in a given time frame (window). Note that 12070ddf637SAnton Vorontsov * time is in VM reclaimer's "ticks", i.e. number of pages 12170ddf637SAnton Vorontsov * scanned. This makes it possible to set desired reaction time 12270ddf637SAnton Vorontsov * and serves as a ratelimit. 12370ddf637SAnton Vorontsov */ 12470ddf637SAnton Vorontsov pressure = scale - (reclaimed * scale / scanned); 12570ddf637SAnton Vorontsov pressure = pressure * 100 / scale; 12670ddf637SAnton Vorontsov 12770ddf637SAnton Vorontsov pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, 12870ddf637SAnton Vorontsov scanned, reclaimed); 12970ddf637SAnton Vorontsov 13070ddf637SAnton Vorontsov return vmpressure_level(pressure); 13170ddf637SAnton Vorontsov } 13270ddf637SAnton Vorontsov 13370ddf637SAnton Vorontsov struct vmpressure_event { 13470ddf637SAnton Vorontsov struct eventfd_ctx *efd; 13570ddf637SAnton Vorontsov enum vmpressure_levels level; 13670ddf637SAnton Vorontsov struct list_head node; 13770ddf637SAnton Vorontsov }; 13870ddf637SAnton Vorontsov 13970ddf637SAnton Vorontsov static bool vmpressure_event(struct vmpressure *vmpr, 140*8e8ae645SJohannes Weiner enum vmpressure_levels level) 14170ddf637SAnton Vorontsov { 14270ddf637SAnton Vorontsov struct vmpressure_event *ev; 14370ddf637SAnton Vorontsov bool signalled = false; 14470ddf637SAnton Vorontsov 14570ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 14670ddf637SAnton Vorontsov 14770ddf637SAnton Vorontsov list_for_each_entry(ev, &vmpr->events, node) { 14870ddf637SAnton Vorontsov if (level >= ev->level) { 14970ddf637SAnton Vorontsov eventfd_signal(ev->efd, 1); 15070ddf637SAnton Vorontsov signalled = true; 15170ddf637SAnton Vorontsov } 15270ddf637SAnton Vorontsov } 15370ddf637SAnton Vorontsov 15470ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 15570ddf637SAnton Vorontsov 15670ddf637SAnton Vorontsov return signalled; 15770ddf637SAnton Vorontsov } 15870ddf637SAnton Vorontsov 15970ddf637SAnton Vorontsov static void vmpressure_work_fn(struct work_struct *work) 16070ddf637SAnton Vorontsov { 16170ddf637SAnton Vorontsov struct vmpressure *vmpr = work_to_vmpressure(work); 16270ddf637SAnton Vorontsov unsigned long scanned; 16370ddf637SAnton Vorontsov unsigned long reclaimed; 164*8e8ae645SJohannes Weiner enum vmpressure_levels level; 16570ddf637SAnton Vorontsov 16691b57191SAndrew Morton spin_lock(&vmpr->sr_lock); 16770ddf637SAnton Vorontsov /* 16870ddf637SAnton Vorontsov * Several contexts might be calling vmpressure(), so it is 16970ddf637SAnton Vorontsov * possible that the work was rescheduled again before the old 17070ddf637SAnton Vorontsov * work context cleared the counters. In that case we will run 17170ddf637SAnton Vorontsov * just after the old work returns, but then scanned might be zero 17270ddf637SAnton Vorontsov * here. No need for any locks here since we don't care if 17370ddf637SAnton Vorontsov * vmpr->reclaimed is in sync. 17470ddf637SAnton Vorontsov */ 175*8e8ae645SJohannes Weiner scanned = vmpr->tree_scanned; 17691b57191SAndrew Morton if (!scanned) { 17791b57191SAndrew Morton spin_unlock(&vmpr->sr_lock); 17891b57191SAndrew Morton return; 17991b57191SAndrew Morton } 18091b57191SAndrew Morton 181*8e8ae645SJohannes Weiner reclaimed = vmpr->tree_reclaimed; 182*8e8ae645SJohannes Weiner vmpr->tree_scanned = 0; 183*8e8ae645SJohannes Weiner vmpr->tree_reclaimed = 0; 18422f2020fSMichal Hocko spin_unlock(&vmpr->sr_lock); 18570ddf637SAnton Vorontsov 186*8e8ae645SJohannes Weiner level = vmpressure_calc_level(scanned, reclaimed); 187*8e8ae645SJohannes Weiner 18870ddf637SAnton Vorontsov do { 189*8e8ae645SJohannes Weiner if (vmpressure_event(vmpr, level)) 19070ddf637SAnton Vorontsov break; 19170ddf637SAnton Vorontsov /* 19270ddf637SAnton Vorontsov * If not handled, propagate the event upward into the 19370ddf637SAnton Vorontsov * hierarchy. 19470ddf637SAnton Vorontsov */ 19570ddf637SAnton Vorontsov } while ((vmpr = vmpressure_parent(vmpr))); 19670ddf637SAnton Vorontsov } 19770ddf637SAnton Vorontsov 19870ddf637SAnton Vorontsov /** 19970ddf637SAnton Vorontsov * vmpressure() - Account memory pressure through scanned/reclaimed ratio 20070ddf637SAnton Vorontsov * @gfp: reclaimer's gfp mask 20170ddf637SAnton Vorontsov * @memcg: cgroup memory controller handle 202*8e8ae645SJohannes Weiner * @tree: legacy subtree mode 20370ddf637SAnton Vorontsov * @scanned: number of pages scanned 20470ddf637SAnton Vorontsov * @reclaimed: number of pages reclaimed 20570ddf637SAnton Vorontsov * 20670ddf637SAnton Vorontsov * This function should be called from the vmscan reclaim path to account 20770ddf637SAnton Vorontsov * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 20870ddf637SAnton Vorontsov * pressure index is then further refined and averaged over time. 20970ddf637SAnton Vorontsov * 210*8e8ae645SJohannes Weiner * If @tree is set, vmpressure is in traditional userspace reporting 211*8e8ae645SJohannes Weiner * mode: @memcg is considered the pressure root and userspace is 212*8e8ae645SJohannes Weiner * notified of the entire subtree's reclaim efficiency. 213*8e8ae645SJohannes Weiner * 214*8e8ae645SJohannes Weiner * If @tree is not set, reclaim efficiency is recorded for @memcg, and 215*8e8ae645SJohannes Weiner * only in-kernel users are notified. 216*8e8ae645SJohannes Weiner * 21770ddf637SAnton Vorontsov * This function does not return any value. 21870ddf637SAnton Vorontsov */ 219*8e8ae645SJohannes Weiner void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, 22070ddf637SAnton Vorontsov unsigned long scanned, unsigned long reclaimed) 22170ddf637SAnton Vorontsov { 22270ddf637SAnton Vorontsov struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 22370ddf637SAnton Vorontsov 22470ddf637SAnton Vorontsov /* 22570ddf637SAnton Vorontsov * Here we only want to account pressure that userland is able to 22670ddf637SAnton Vorontsov * help us with. For example, suppose that DMA zone is under 22770ddf637SAnton Vorontsov * pressure; if we notify userland about that kind of pressure, 22870ddf637SAnton Vorontsov * then it will be mostly a waste as it will trigger unnecessary 22970ddf637SAnton Vorontsov * freeing of memory by userland (since userland is more likely to 23070ddf637SAnton Vorontsov * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That 23170ddf637SAnton Vorontsov * is why we include only movable, highmem and FS/IO pages. 23270ddf637SAnton Vorontsov * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so 23370ddf637SAnton Vorontsov * we account it too. 23470ddf637SAnton Vorontsov */ 23570ddf637SAnton Vorontsov if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) 23670ddf637SAnton Vorontsov return; 23770ddf637SAnton Vorontsov 23870ddf637SAnton Vorontsov /* 23970ddf637SAnton Vorontsov * If we got here with no pages scanned, then that is an indicator 24070ddf637SAnton Vorontsov * that reclaimer was unable to find any shrinkable LRUs at the 24170ddf637SAnton Vorontsov * current scanning depth. But it does not mean that we should 24270ddf637SAnton Vorontsov * report the critical pressure, yet. If the scanning priority 24370ddf637SAnton Vorontsov * (scanning depth) goes too high (deep), we will be notified 24470ddf637SAnton Vorontsov * through vmpressure_prio(). But so far, keep calm. 24570ddf637SAnton Vorontsov */ 24670ddf637SAnton Vorontsov if (!scanned) 24770ddf637SAnton Vorontsov return; 24870ddf637SAnton Vorontsov 249*8e8ae645SJohannes Weiner if (tree) { 25022f2020fSMichal Hocko spin_lock(&vmpr->sr_lock); 251*8e8ae645SJohannes Weiner vmpr->tree_scanned += scanned; 252*8e8ae645SJohannes Weiner vmpr->tree_reclaimed += reclaimed; 25370ddf637SAnton Vorontsov scanned = vmpr->scanned; 25422f2020fSMichal Hocko spin_unlock(&vmpr->sr_lock); 25570ddf637SAnton Vorontsov 2568e0ed445SMichal Hocko if (scanned < vmpressure_win) 25770ddf637SAnton Vorontsov return; 25870ddf637SAnton Vorontsov schedule_work(&vmpr->work); 259*8e8ae645SJohannes Weiner } else { 260*8e8ae645SJohannes Weiner enum vmpressure_levels level; 261*8e8ae645SJohannes Weiner 262*8e8ae645SJohannes Weiner /* For now, no users for root-level efficiency */ 263*8e8ae645SJohannes Weiner if (memcg == root_mem_cgroup) 264*8e8ae645SJohannes Weiner return; 265*8e8ae645SJohannes Weiner 266*8e8ae645SJohannes Weiner spin_lock(&vmpr->sr_lock); 267*8e8ae645SJohannes Weiner scanned = vmpr->scanned += scanned; 268*8e8ae645SJohannes Weiner reclaimed = vmpr->reclaimed += reclaimed; 269*8e8ae645SJohannes Weiner if (scanned < vmpressure_win) { 270*8e8ae645SJohannes Weiner spin_unlock(&vmpr->sr_lock); 271*8e8ae645SJohannes Weiner return; 272*8e8ae645SJohannes Weiner } 273*8e8ae645SJohannes Weiner vmpr->scanned = vmpr->reclaimed = 0; 274*8e8ae645SJohannes Weiner spin_unlock(&vmpr->sr_lock); 275*8e8ae645SJohannes Weiner 276*8e8ae645SJohannes Weiner level = vmpressure_calc_level(scanned, reclaimed); 277*8e8ae645SJohannes Weiner 278*8e8ae645SJohannes Weiner if (level > VMPRESSURE_LOW) { 279*8e8ae645SJohannes Weiner /* 280*8e8ae645SJohannes Weiner * Let the socket buffer allocator know that 281*8e8ae645SJohannes Weiner * we are having trouble reclaiming LRU pages. 282*8e8ae645SJohannes Weiner * 283*8e8ae645SJohannes Weiner * For hysteresis keep the pressure state 284*8e8ae645SJohannes Weiner * asserted for a second in which subsequent 285*8e8ae645SJohannes Weiner * pressure events can occur. 286*8e8ae645SJohannes Weiner */ 287*8e8ae645SJohannes Weiner memcg->socket_pressure = jiffies + HZ; 288*8e8ae645SJohannes Weiner } 289*8e8ae645SJohannes Weiner } 29070ddf637SAnton Vorontsov } 29170ddf637SAnton Vorontsov 29270ddf637SAnton Vorontsov /** 29370ddf637SAnton Vorontsov * vmpressure_prio() - Account memory pressure through reclaimer priority level 29470ddf637SAnton Vorontsov * @gfp: reclaimer's gfp mask 29570ddf637SAnton Vorontsov * @memcg: cgroup memory controller handle 29670ddf637SAnton Vorontsov * @prio: reclaimer's priority 29770ddf637SAnton Vorontsov * 29870ddf637SAnton Vorontsov * This function should be called from the reclaim path every time when 29970ddf637SAnton Vorontsov * the vmscan's reclaiming priority (scanning depth) changes. 30070ddf637SAnton Vorontsov * 30170ddf637SAnton Vorontsov * This function does not return any value. 30270ddf637SAnton Vorontsov */ 30370ddf637SAnton Vorontsov void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) 30470ddf637SAnton Vorontsov { 30570ddf637SAnton Vorontsov /* 30670ddf637SAnton Vorontsov * We only use prio for accounting critical level. For more info 30770ddf637SAnton Vorontsov * see comment for vmpressure_level_critical_prio variable above. 30870ddf637SAnton Vorontsov */ 30970ddf637SAnton Vorontsov if (prio > vmpressure_level_critical_prio) 31070ddf637SAnton Vorontsov return; 31170ddf637SAnton Vorontsov 31270ddf637SAnton Vorontsov /* 31370ddf637SAnton Vorontsov * OK, the prio is below the threshold, updating vmpressure 31470ddf637SAnton Vorontsov * information before shrinker dives into long shrinking of long 31570ddf637SAnton Vorontsov * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 31670ddf637SAnton Vorontsov * to the vmpressure() basically means that we signal 'critical' 31770ddf637SAnton Vorontsov * level. 31870ddf637SAnton Vorontsov */ 319*8e8ae645SJohannes Weiner vmpressure(gfp, memcg, true, vmpressure_win, 0); 32070ddf637SAnton Vorontsov } 32170ddf637SAnton Vorontsov 32270ddf637SAnton Vorontsov /** 32370ddf637SAnton Vorontsov * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 32459b6f873STejun Heo * @memcg: memcg that is interested in vmpressure notifications 32570ddf637SAnton Vorontsov * @eventfd: eventfd context to link notifications with 32670ddf637SAnton Vorontsov * @args: event arguments (used to set up a pressure level threshold) 32770ddf637SAnton Vorontsov * 32870ddf637SAnton Vorontsov * This function associates eventfd context with the vmpressure 32970ddf637SAnton Vorontsov * infrastructure, so that the notifications will be delivered to the 33070ddf637SAnton Vorontsov * @eventfd. The @args parameter is a string that denotes pressure level 33170ddf637SAnton Vorontsov * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or 33270ddf637SAnton Vorontsov * "critical"). 33370ddf637SAnton Vorontsov * 334347c4a87STejun Heo * To be used as memcg event method. 33570ddf637SAnton Vorontsov */ 33659b6f873STejun Heo int vmpressure_register_event(struct mem_cgroup *memcg, 337347c4a87STejun Heo struct eventfd_ctx *eventfd, const char *args) 33870ddf637SAnton Vorontsov { 33959b6f873STejun Heo struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 34070ddf637SAnton Vorontsov struct vmpressure_event *ev; 34170ddf637SAnton Vorontsov int level; 34270ddf637SAnton Vorontsov 34370ddf637SAnton Vorontsov for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { 34470ddf637SAnton Vorontsov if (!strcmp(vmpressure_str_levels[level], args)) 34570ddf637SAnton Vorontsov break; 34670ddf637SAnton Vorontsov } 34770ddf637SAnton Vorontsov 34870ddf637SAnton Vorontsov if (level >= VMPRESSURE_NUM_LEVELS) 34970ddf637SAnton Vorontsov return -EINVAL; 35070ddf637SAnton Vorontsov 35170ddf637SAnton Vorontsov ev = kzalloc(sizeof(*ev), GFP_KERNEL); 35270ddf637SAnton Vorontsov if (!ev) 35370ddf637SAnton Vorontsov return -ENOMEM; 35470ddf637SAnton Vorontsov 35570ddf637SAnton Vorontsov ev->efd = eventfd; 35670ddf637SAnton Vorontsov ev->level = level; 35770ddf637SAnton Vorontsov 35870ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 35970ddf637SAnton Vorontsov list_add(&ev->node, &vmpr->events); 36070ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 36170ddf637SAnton Vorontsov 36270ddf637SAnton Vorontsov return 0; 36370ddf637SAnton Vorontsov } 36470ddf637SAnton Vorontsov 36570ddf637SAnton Vorontsov /** 36670ddf637SAnton Vorontsov * vmpressure_unregister_event() - Unbind eventfd from vmpressure 36759b6f873STejun Heo * @memcg: memcg handle 36870ddf637SAnton Vorontsov * @eventfd: eventfd context that was used to link vmpressure with the @cg 36970ddf637SAnton Vorontsov * 37070ddf637SAnton Vorontsov * This function does internal manipulations to detach the @eventfd from 37170ddf637SAnton Vorontsov * the vmpressure notifications, and then frees internal resources 37270ddf637SAnton Vorontsov * associated with the @eventfd (but the @eventfd itself is not freed). 37370ddf637SAnton Vorontsov * 374347c4a87STejun Heo * To be used as memcg event method. 37570ddf637SAnton Vorontsov */ 37659b6f873STejun Heo void vmpressure_unregister_event(struct mem_cgroup *memcg, 37770ddf637SAnton Vorontsov struct eventfd_ctx *eventfd) 37870ddf637SAnton Vorontsov { 37959b6f873STejun Heo struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 38070ddf637SAnton Vorontsov struct vmpressure_event *ev; 38170ddf637SAnton Vorontsov 38270ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 38370ddf637SAnton Vorontsov list_for_each_entry(ev, &vmpr->events, node) { 38470ddf637SAnton Vorontsov if (ev->efd != eventfd) 38570ddf637SAnton Vorontsov continue; 38670ddf637SAnton Vorontsov list_del(&ev->node); 38770ddf637SAnton Vorontsov kfree(ev); 38870ddf637SAnton Vorontsov break; 38970ddf637SAnton Vorontsov } 39070ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 39170ddf637SAnton Vorontsov } 39270ddf637SAnton Vorontsov 39370ddf637SAnton Vorontsov /** 39470ddf637SAnton Vorontsov * vmpressure_init() - Initialize vmpressure control structure 39570ddf637SAnton Vorontsov * @vmpr: Structure to be initialized 39670ddf637SAnton Vorontsov * 39770ddf637SAnton Vorontsov * This function should be called on every allocated vmpressure structure 39870ddf637SAnton Vorontsov * before any usage. 39970ddf637SAnton Vorontsov */ 40070ddf637SAnton Vorontsov void vmpressure_init(struct vmpressure *vmpr) 40170ddf637SAnton Vorontsov { 40222f2020fSMichal Hocko spin_lock_init(&vmpr->sr_lock); 40370ddf637SAnton Vorontsov mutex_init(&vmpr->events_lock); 40470ddf637SAnton Vorontsov INIT_LIST_HEAD(&vmpr->events); 40570ddf637SAnton Vorontsov INIT_WORK(&vmpr->work, vmpressure_work_fn); 40670ddf637SAnton Vorontsov } 40733cb876eSMichal Hocko 40833cb876eSMichal Hocko /** 40933cb876eSMichal Hocko * vmpressure_cleanup() - shuts down vmpressure control structure 41033cb876eSMichal Hocko * @vmpr: Structure to be cleaned up 41133cb876eSMichal Hocko * 41233cb876eSMichal Hocko * This function should be called before the structure in which it is 41333cb876eSMichal Hocko * embedded is cleaned up. 41433cb876eSMichal Hocko */ 41533cb876eSMichal Hocko void vmpressure_cleanup(struct vmpressure *vmpr) 41633cb876eSMichal Hocko { 41733cb876eSMichal Hocko /* 41833cb876eSMichal Hocko * Make sure there is no pending work before eventfd infrastructure 41933cb876eSMichal Hocko * goes away. 42033cb876eSMichal Hocko */ 42133cb876eSMichal Hocko flush_work(&vmpr->work); 42233cb876eSMichal Hocko } 423