1*d2912cb1SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 270ddf637SAnton Vorontsov /* 370ddf637SAnton Vorontsov * Linux VM pressure 470ddf637SAnton Vorontsov * 570ddf637SAnton Vorontsov * Copyright 2012 Linaro Ltd. 670ddf637SAnton Vorontsov * Anton Vorontsov <anton.vorontsov@linaro.org> 770ddf637SAnton Vorontsov * 870ddf637SAnton Vorontsov * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, 970ddf637SAnton Vorontsov * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. 1070ddf637SAnton Vorontsov */ 1170ddf637SAnton Vorontsov 1270ddf637SAnton Vorontsov #include <linux/cgroup.h> 1370ddf637SAnton Vorontsov #include <linux/fs.h> 1470ddf637SAnton Vorontsov #include <linux/log2.h> 1570ddf637SAnton Vorontsov #include <linux/sched.h> 1670ddf637SAnton Vorontsov #include <linux/mm.h> 1770ddf637SAnton Vorontsov #include <linux/vmstat.h> 1870ddf637SAnton Vorontsov #include <linux/eventfd.h> 191ff6bbfdSTejun Heo #include <linux/slab.h> 2070ddf637SAnton Vorontsov #include <linux/swap.h> 2170ddf637SAnton Vorontsov #include <linux/printk.h> 2270ddf637SAnton Vorontsov #include <linux/vmpressure.h> 2370ddf637SAnton Vorontsov 2470ddf637SAnton Vorontsov /* 2570ddf637SAnton Vorontsov * The window size (vmpressure_win) is the number of scanned pages before 2670ddf637SAnton Vorontsov * we try to analyze scanned/reclaimed ratio. So the window is used as a 2770ddf637SAnton Vorontsov * rate-limit tunable for the "low" level notification, and also for 2870ddf637SAnton Vorontsov * averaging the ratio for medium/critical levels. Using small window 2970ddf637SAnton Vorontsov * sizes can cause lot of false positives, but too big window size will 3070ddf637SAnton Vorontsov * delay the notifications. 3170ddf637SAnton Vorontsov * 3270ddf637SAnton Vorontsov * As the vmscan reclaimer logic works with chunks which are multiple of 3370ddf637SAnton Vorontsov * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. 3470ddf637SAnton Vorontsov * 3570ddf637SAnton Vorontsov * TODO: Make the window size depend on machine size, as we do for vmstat 3670ddf637SAnton Vorontsov * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). 3770ddf637SAnton Vorontsov */ 3870ddf637SAnton Vorontsov static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; 3970ddf637SAnton Vorontsov 4070ddf637SAnton Vorontsov /* 4170ddf637SAnton Vorontsov * These thresholds are used when we account memory pressure through 4270ddf637SAnton Vorontsov * scanned/reclaimed ratio. The current values were chosen empirically. In 4370ddf637SAnton Vorontsov * essence, they are percents: the higher the value, the more number 4470ddf637SAnton Vorontsov * unsuccessful reclaims there were. 4570ddf637SAnton Vorontsov */ 4670ddf637SAnton Vorontsov static const unsigned int vmpressure_level_med = 60; 4770ddf637SAnton Vorontsov static const unsigned int vmpressure_level_critical = 95; 4870ddf637SAnton Vorontsov 4970ddf637SAnton Vorontsov /* 5070ddf637SAnton Vorontsov * When there are too little pages left to scan, vmpressure() may miss the 5170ddf637SAnton Vorontsov * critical pressure as number of pages will be less than "window size". 5270ddf637SAnton Vorontsov * However, in that case the vmscan priority will raise fast as the 5370ddf637SAnton Vorontsov * reclaimer will try to scan LRUs more deeply. 5470ddf637SAnton Vorontsov * 5570ddf637SAnton Vorontsov * The vmscan logic considers these special priorities: 5670ddf637SAnton Vorontsov * 5770ddf637SAnton Vorontsov * prio == DEF_PRIORITY (12): reclaimer starts with that value 5870ddf637SAnton Vorontsov * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed 5970ddf637SAnton Vorontsov * prio == 0 : close to OOM, kernel scans every page in an lru 6070ddf637SAnton Vorontsov * 6170ddf637SAnton Vorontsov * Any value in this range is acceptable for this tunable (i.e. from 12 to 6270ddf637SAnton Vorontsov * 0). Current value for the vmpressure_level_critical_prio is chosen 6370ddf637SAnton Vorontsov * empirically, but the number, in essence, means that we consider 6470ddf637SAnton Vorontsov * critical level when scanning depth is ~10% of the lru size (vmscan 6570ddf637SAnton Vorontsov * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one 6670ddf637SAnton Vorontsov * eights). 6770ddf637SAnton Vorontsov */ 6870ddf637SAnton Vorontsov static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); 6970ddf637SAnton Vorontsov 7070ddf637SAnton Vorontsov static struct vmpressure *work_to_vmpressure(struct work_struct *work) 7170ddf637SAnton Vorontsov { 7270ddf637SAnton Vorontsov return container_of(work, struct vmpressure, work); 7370ddf637SAnton Vorontsov } 7470ddf637SAnton Vorontsov 7570ddf637SAnton Vorontsov static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 7670ddf637SAnton Vorontsov { 77182446d0STejun Heo struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); 78182446d0STejun Heo struct mem_cgroup *memcg = mem_cgroup_from_css(css); 7970ddf637SAnton Vorontsov 8070ddf637SAnton Vorontsov memcg = parent_mem_cgroup(memcg); 8170ddf637SAnton Vorontsov if (!memcg) 8270ddf637SAnton Vorontsov return NULL; 8370ddf637SAnton Vorontsov return memcg_to_vmpressure(memcg); 8470ddf637SAnton Vorontsov } 8570ddf637SAnton Vorontsov 8670ddf637SAnton Vorontsov enum vmpressure_levels { 8770ddf637SAnton Vorontsov VMPRESSURE_LOW = 0, 8870ddf637SAnton Vorontsov VMPRESSURE_MEDIUM, 8970ddf637SAnton Vorontsov VMPRESSURE_CRITICAL, 9070ddf637SAnton Vorontsov VMPRESSURE_NUM_LEVELS, 9170ddf637SAnton Vorontsov }; 9270ddf637SAnton Vorontsov 93b6bb9811SDavid Rientjes enum vmpressure_modes { 94b6bb9811SDavid Rientjes VMPRESSURE_NO_PASSTHROUGH = 0, 95b6bb9811SDavid Rientjes VMPRESSURE_HIERARCHY, 96b6bb9811SDavid Rientjes VMPRESSURE_LOCAL, 97b6bb9811SDavid Rientjes VMPRESSURE_NUM_MODES, 98b6bb9811SDavid Rientjes }; 99b6bb9811SDavid Rientjes 10070ddf637SAnton Vorontsov static const char * const vmpressure_str_levels[] = { 10170ddf637SAnton Vorontsov [VMPRESSURE_LOW] = "low", 10270ddf637SAnton Vorontsov [VMPRESSURE_MEDIUM] = "medium", 10370ddf637SAnton Vorontsov [VMPRESSURE_CRITICAL] = "critical", 10470ddf637SAnton Vorontsov }; 10570ddf637SAnton Vorontsov 106b6bb9811SDavid Rientjes static const char * const vmpressure_str_modes[] = { 107b6bb9811SDavid Rientjes [VMPRESSURE_NO_PASSTHROUGH] = "default", 108b6bb9811SDavid Rientjes [VMPRESSURE_HIERARCHY] = "hierarchy", 109b6bb9811SDavid Rientjes [VMPRESSURE_LOCAL] = "local", 110b6bb9811SDavid Rientjes }; 111b6bb9811SDavid Rientjes 11270ddf637SAnton Vorontsov static enum vmpressure_levels vmpressure_level(unsigned long pressure) 11370ddf637SAnton Vorontsov { 11470ddf637SAnton Vorontsov if (pressure >= vmpressure_level_critical) 11570ddf637SAnton Vorontsov return VMPRESSURE_CRITICAL; 11670ddf637SAnton Vorontsov else if (pressure >= vmpressure_level_med) 11770ddf637SAnton Vorontsov return VMPRESSURE_MEDIUM; 11870ddf637SAnton Vorontsov return VMPRESSURE_LOW; 11970ddf637SAnton Vorontsov } 12070ddf637SAnton Vorontsov 12170ddf637SAnton Vorontsov static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, 12270ddf637SAnton Vorontsov unsigned long reclaimed) 12370ddf637SAnton Vorontsov { 12470ddf637SAnton Vorontsov unsigned long scale = scanned + reclaimed; 125e1587a49SVinayak Menon unsigned long pressure = 0; 12670ddf637SAnton Vorontsov 12770ddf637SAnton Vorontsov /* 128d7143e31Szhongjiang * reclaimed can be greater than scanned for things such as reclaimed 129d7143e31Szhongjiang * slab pages. shrink_node() just adds reclaimed pages without a 130d7143e31Szhongjiang * related increment to scanned pages. 131e1587a49SVinayak Menon */ 132e1587a49SVinayak Menon if (reclaimed >= scanned) 133e1587a49SVinayak Menon goto out; 134e1587a49SVinayak Menon /* 13570ddf637SAnton Vorontsov * We calculate the ratio (in percents) of how many pages were 13670ddf637SAnton Vorontsov * scanned vs. reclaimed in a given time frame (window). Note that 13770ddf637SAnton Vorontsov * time is in VM reclaimer's "ticks", i.e. number of pages 13870ddf637SAnton Vorontsov * scanned. This makes it possible to set desired reaction time 13970ddf637SAnton Vorontsov * and serves as a ratelimit. 14070ddf637SAnton Vorontsov */ 14170ddf637SAnton Vorontsov pressure = scale - (reclaimed * scale / scanned); 14270ddf637SAnton Vorontsov pressure = pressure * 100 / scale; 14370ddf637SAnton Vorontsov 144e1587a49SVinayak Menon out: 14570ddf637SAnton Vorontsov pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, 14670ddf637SAnton Vorontsov scanned, reclaimed); 14770ddf637SAnton Vorontsov 14870ddf637SAnton Vorontsov return vmpressure_level(pressure); 14970ddf637SAnton Vorontsov } 15070ddf637SAnton Vorontsov 15170ddf637SAnton Vorontsov struct vmpressure_event { 15270ddf637SAnton Vorontsov struct eventfd_ctx *efd; 15370ddf637SAnton Vorontsov enum vmpressure_levels level; 154b6bb9811SDavid Rientjes enum vmpressure_modes mode; 15570ddf637SAnton Vorontsov struct list_head node; 15670ddf637SAnton Vorontsov }; 15770ddf637SAnton Vorontsov 15870ddf637SAnton Vorontsov static bool vmpressure_event(struct vmpressure *vmpr, 159b6bb9811SDavid Rientjes const enum vmpressure_levels level, 160b6bb9811SDavid Rientjes bool ancestor, bool signalled) 16170ddf637SAnton Vorontsov { 16270ddf637SAnton Vorontsov struct vmpressure_event *ev; 163b6bb9811SDavid Rientjes bool ret = false; 16470ddf637SAnton Vorontsov 16570ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 16670ddf637SAnton Vorontsov list_for_each_entry(ev, &vmpr->events, node) { 167b6bb9811SDavid Rientjes if (ancestor && ev->mode == VMPRESSURE_LOCAL) 168b6bb9811SDavid Rientjes continue; 169b6bb9811SDavid Rientjes if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) 170b6bb9811SDavid Rientjes continue; 171b6bb9811SDavid Rientjes if (level < ev->level) 172b6bb9811SDavid Rientjes continue; 17370ddf637SAnton Vorontsov eventfd_signal(ev->efd, 1); 174b6bb9811SDavid Rientjes ret = true; 17570ddf637SAnton Vorontsov } 17670ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 17770ddf637SAnton Vorontsov 178b6bb9811SDavid Rientjes return ret; 17970ddf637SAnton Vorontsov } 18070ddf637SAnton Vorontsov 18170ddf637SAnton Vorontsov static void vmpressure_work_fn(struct work_struct *work) 18270ddf637SAnton Vorontsov { 18370ddf637SAnton Vorontsov struct vmpressure *vmpr = work_to_vmpressure(work); 18470ddf637SAnton Vorontsov unsigned long scanned; 18570ddf637SAnton Vorontsov unsigned long reclaimed; 1868e8ae645SJohannes Weiner enum vmpressure_levels level; 187b6bb9811SDavid Rientjes bool ancestor = false; 188b6bb9811SDavid Rientjes bool signalled = false; 18970ddf637SAnton Vorontsov 19091b57191SAndrew Morton spin_lock(&vmpr->sr_lock); 19170ddf637SAnton Vorontsov /* 19270ddf637SAnton Vorontsov * Several contexts might be calling vmpressure(), so it is 19370ddf637SAnton Vorontsov * possible that the work was rescheduled again before the old 19470ddf637SAnton Vorontsov * work context cleared the counters. In that case we will run 19570ddf637SAnton Vorontsov * just after the old work returns, but then scanned might be zero 19670ddf637SAnton Vorontsov * here. No need for any locks here since we don't care if 19770ddf637SAnton Vorontsov * vmpr->reclaimed is in sync. 19870ddf637SAnton Vorontsov */ 1998e8ae645SJohannes Weiner scanned = vmpr->tree_scanned; 20091b57191SAndrew Morton if (!scanned) { 20191b57191SAndrew Morton spin_unlock(&vmpr->sr_lock); 20291b57191SAndrew Morton return; 20391b57191SAndrew Morton } 20491b57191SAndrew Morton 2058e8ae645SJohannes Weiner reclaimed = vmpr->tree_reclaimed; 2068e8ae645SJohannes Weiner vmpr->tree_scanned = 0; 2078e8ae645SJohannes Weiner vmpr->tree_reclaimed = 0; 20822f2020fSMichal Hocko spin_unlock(&vmpr->sr_lock); 20970ddf637SAnton Vorontsov 2108e8ae645SJohannes Weiner level = vmpressure_calc_level(scanned, reclaimed); 2118e8ae645SJohannes Weiner 21270ddf637SAnton Vorontsov do { 213b6bb9811SDavid Rientjes if (vmpressure_event(vmpr, level, ancestor, signalled)) 214b6bb9811SDavid Rientjes signalled = true; 215b6bb9811SDavid Rientjes ancestor = true; 21670ddf637SAnton Vorontsov } while ((vmpr = vmpressure_parent(vmpr))); 21770ddf637SAnton Vorontsov } 21870ddf637SAnton Vorontsov 21970ddf637SAnton Vorontsov /** 22070ddf637SAnton Vorontsov * vmpressure() - Account memory pressure through scanned/reclaimed ratio 22170ddf637SAnton Vorontsov * @gfp: reclaimer's gfp mask 22270ddf637SAnton Vorontsov * @memcg: cgroup memory controller handle 2238e8ae645SJohannes Weiner * @tree: legacy subtree mode 22470ddf637SAnton Vorontsov * @scanned: number of pages scanned 22570ddf637SAnton Vorontsov * @reclaimed: number of pages reclaimed 22670ddf637SAnton Vorontsov * 22770ddf637SAnton Vorontsov * This function should be called from the vmscan reclaim path to account 22870ddf637SAnton Vorontsov * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 22970ddf637SAnton Vorontsov * pressure index is then further refined and averaged over time. 23070ddf637SAnton Vorontsov * 2318e8ae645SJohannes Weiner * If @tree is set, vmpressure is in traditional userspace reporting 2328e8ae645SJohannes Weiner * mode: @memcg is considered the pressure root and userspace is 2338e8ae645SJohannes Weiner * notified of the entire subtree's reclaim efficiency. 2348e8ae645SJohannes Weiner * 2358e8ae645SJohannes Weiner * If @tree is not set, reclaim efficiency is recorded for @memcg, and 2368e8ae645SJohannes Weiner * only in-kernel users are notified. 2378e8ae645SJohannes Weiner * 23870ddf637SAnton Vorontsov * This function does not return any value. 23970ddf637SAnton Vorontsov */ 2408e8ae645SJohannes Weiner void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, 24170ddf637SAnton Vorontsov unsigned long scanned, unsigned long reclaimed) 24270ddf637SAnton Vorontsov { 24370ddf637SAnton Vorontsov struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 24470ddf637SAnton Vorontsov 24570ddf637SAnton Vorontsov /* 24670ddf637SAnton Vorontsov * Here we only want to account pressure that userland is able to 24770ddf637SAnton Vorontsov * help us with. For example, suppose that DMA zone is under 24870ddf637SAnton Vorontsov * pressure; if we notify userland about that kind of pressure, 24970ddf637SAnton Vorontsov * then it will be mostly a waste as it will trigger unnecessary 25070ddf637SAnton Vorontsov * freeing of memory by userland (since userland is more likely to 25170ddf637SAnton Vorontsov * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That 25270ddf637SAnton Vorontsov * is why we include only movable, highmem and FS/IO pages. 25370ddf637SAnton Vorontsov * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so 25470ddf637SAnton Vorontsov * we account it too. 25570ddf637SAnton Vorontsov */ 25670ddf637SAnton Vorontsov if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) 25770ddf637SAnton Vorontsov return; 25870ddf637SAnton Vorontsov 25970ddf637SAnton Vorontsov /* 26070ddf637SAnton Vorontsov * If we got here with no pages scanned, then that is an indicator 26170ddf637SAnton Vorontsov * that reclaimer was unable to find any shrinkable LRUs at the 26270ddf637SAnton Vorontsov * current scanning depth. But it does not mean that we should 26370ddf637SAnton Vorontsov * report the critical pressure, yet. If the scanning priority 26470ddf637SAnton Vorontsov * (scanning depth) goes too high (deep), we will be notified 26570ddf637SAnton Vorontsov * through vmpressure_prio(). But so far, keep calm. 26670ddf637SAnton Vorontsov */ 26770ddf637SAnton Vorontsov if (!scanned) 26870ddf637SAnton Vorontsov return; 26970ddf637SAnton Vorontsov 2708e8ae645SJohannes Weiner if (tree) { 27122f2020fSMichal Hocko spin_lock(&vmpr->sr_lock); 2723c1da7beSVladimir Davydov scanned = vmpr->tree_scanned += scanned; 2738e8ae645SJohannes Weiner vmpr->tree_reclaimed += reclaimed; 27422f2020fSMichal Hocko spin_unlock(&vmpr->sr_lock); 27570ddf637SAnton Vorontsov 2768e0ed445SMichal Hocko if (scanned < vmpressure_win) 27770ddf637SAnton Vorontsov return; 27870ddf637SAnton Vorontsov schedule_work(&vmpr->work); 2798e8ae645SJohannes Weiner } else { 2808e8ae645SJohannes Weiner enum vmpressure_levels level; 2818e8ae645SJohannes Weiner 2828e8ae645SJohannes Weiner /* For now, no users for root-level efficiency */ 283686739f6SHugh Dickins if (!memcg || memcg == root_mem_cgroup) 2848e8ae645SJohannes Weiner return; 2858e8ae645SJohannes Weiner 2868e8ae645SJohannes Weiner spin_lock(&vmpr->sr_lock); 2878e8ae645SJohannes Weiner scanned = vmpr->scanned += scanned; 2888e8ae645SJohannes Weiner reclaimed = vmpr->reclaimed += reclaimed; 2898e8ae645SJohannes Weiner if (scanned < vmpressure_win) { 2908e8ae645SJohannes Weiner spin_unlock(&vmpr->sr_lock); 2918e8ae645SJohannes Weiner return; 2928e8ae645SJohannes Weiner } 2938e8ae645SJohannes Weiner vmpr->scanned = vmpr->reclaimed = 0; 2948e8ae645SJohannes Weiner spin_unlock(&vmpr->sr_lock); 2958e8ae645SJohannes Weiner 2968e8ae645SJohannes Weiner level = vmpressure_calc_level(scanned, reclaimed); 2978e8ae645SJohannes Weiner 2988e8ae645SJohannes Weiner if (level > VMPRESSURE_LOW) { 2998e8ae645SJohannes Weiner /* 3008e8ae645SJohannes Weiner * Let the socket buffer allocator know that 3018e8ae645SJohannes Weiner * we are having trouble reclaiming LRU pages. 3028e8ae645SJohannes Weiner * 3038e8ae645SJohannes Weiner * For hysteresis keep the pressure state 3048e8ae645SJohannes Weiner * asserted for a second in which subsequent 3058e8ae645SJohannes Weiner * pressure events can occur. 3068e8ae645SJohannes Weiner */ 3078e8ae645SJohannes Weiner memcg->socket_pressure = jiffies + HZ; 3088e8ae645SJohannes Weiner } 3098e8ae645SJohannes Weiner } 31070ddf637SAnton Vorontsov } 31170ddf637SAnton Vorontsov 31270ddf637SAnton Vorontsov /** 31370ddf637SAnton Vorontsov * vmpressure_prio() - Account memory pressure through reclaimer priority level 31470ddf637SAnton Vorontsov * @gfp: reclaimer's gfp mask 31570ddf637SAnton Vorontsov * @memcg: cgroup memory controller handle 31670ddf637SAnton Vorontsov * @prio: reclaimer's priority 31770ddf637SAnton Vorontsov * 31870ddf637SAnton Vorontsov * This function should be called from the reclaim path every time when 31970ddf637SAnton Vorontsov * the vmscan's reclaiming priority (scanning depth) changes. 32070ddf637SAnton Vorontsov * 32170ddf637SAnton Vorontsov * This function does not return any value. 32270ddf637SAnton Vorontsov */ 32370ddf637SAnton Vorontsov void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) 32470ddf637SAnton Vorontsov { 32570ddf637SAnton Vorontsov /* 32670ddf637SAnton Vorontsov * We only use prio for accounting critical level. For more info 32770ddf637SAnton Vorontsov * see comment for vmpressure_level_critical_prio variable above. 32870ddf637SAnton Vorontsov */ 32970ddf637SAnton Vorontsov if (prio > vmpressure_level_critical_prio) 33070ddf637SAnton Vorontsov return; 33170ddf637SAnton Vorontsov 33270ddf637SAnton Vorontsov /* 33370ddf637SAnton Vorontsov * OK, the prio is below the threshold, updating vmpressure 33470ddf637SAnton Vorontsov * information before shrinker dives into long shrinking of long 33570ddf637SAnton Vorontsov * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 33670ddf637SAnton Vorontsov * to the vmpressure() basically means that we signal 'critical' 33770ddf637SAnton Vorontsov * level. 33870ddf637SAnton Vorontsov */ 3398e8ae645SJohannes Weiner vmpressure(gfp, memcg, true, vmpressure_win, 0); 34070ddf637SAnton Vorontsov } 34170ddf637SAnton Vorontsov 342b6bb9811SDavid Rientjes #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) 343b6bb9811SDavid Rientjes 34470ddf637SAnton Vorontsov /** 34570ddf637SAnton Vorontsov * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 34659b6f873STejun Heo * @memcg: memcg that is interested in vmpressure notifications 34770ddf637SAnton Vorontsov * @eventfd: eventfd context to link notifications with 348b6bb9811SDavid Rientjes * @args: event arguments (pressure level threshold, optional mode) 34970ddf637SAnton Vorontsov * 35070ddf637SAnton Vorontsov * This function associates eventfd context with the vmpressure 35170ddf637SAnton Vorontsov * infrastructure, so that the notifications will be delivered to the 352b6bb9811SDavid Rientjes * @eventfd. The @args parameter is a comma-delimited string that denotes a 353b6bb9811SDavid Rientjes * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", 354b6bb9811SDavid Rientjes * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. 355b6bb9811SDavid Rientjes * "hierarchy" or "local"). 35670ddf637SAnton Vorontsov * 357347c4a87STejun Heo * To be used as memcg event method. 35870ddf637SAnton Vorontsov */ 35959b6f873STejun Heo int vmpressure_register_event(struct mem_cgroup *memcg, 360347c4a87STejun Heo struct eventfd_ctx *eventfd, const char *args) 36170ddf637SAnton Vorontsov { 36259b6f873STejun Heo struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 36370ddf637SAnton Vorontsov struct vmpressure_event *ev; 364b6bb9811SDavid Rientjes enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; 365b6bb9811SDavid Rientjes enum vmpressure_levels level = -1; 366b6bb9811SDavid Rientjes char *spec, *spec_orig; 367b6bb9811SDavid Rientjes char *token; 368b6bb9811SDavid Rientjes int ret = 0; 36970ddf637SAnton Vorontsov 370d62ff365SAndy Shevchenko spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); 371b6bb9811SDavid Rientjes if (!spec) { 372b6bb9811SDavid Rientjes ret = -ENOMEM; 373b6bb9811SDavid Rientjes goto out; 374b6bb9811SDavid Rientjes } 375b6bb9811SDavid Rientjes 376b6bb9811SDavid Rientjes /* Find required level */ 377b6bb9811SDavid Rientjes token = strsep(&spec, ","); 3783cadfa2bSAndy Shevchenko level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); 3793cadfa2bSAndy Shevchenko if (level < 0) { 3803cadfa2bSAndy Shevchenko ret = level; 381b6bb9811SDavid Rientjes goto out; 38270ddf637SAnton Vorontsov } 38370ddf637SAnton Vorontsov 384b6bb9811SDavid Rientjes /* Find optional mode */ 385b6bb9811SDavid Rientjes token = strsep(&spec, ","); 386b6bb9811SDavid Rientjes if (token) { 3873cadfa2bSAndy Shevchenko mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); 3883cadfa2bSAndy Shevchenko if (mode < 0) { 3893cadfa2bSAndy Shevchenko ret = mode; 390b6bb9811SDavid Rientjes goto out; 391b6bb9811SDavid Rientjes } 392b6bb9811SDavid Rientjes } 39370ddf637SAnton Vorontsov 39470ddf637SAnton Vorontsov ev = kzalloc(sizeof(*ev), GFP_KERNEL); 395b6bb9811SDavid Rientjes if (!ev) { 396b6bb9811SDavid Rientjes ret = -ENOMEM; 397b6bb9811SDavid Rientjes goto out; 398b6bb9811SDavid Rientjes } 39970ddf637SAnton Vorontsov 40070ddf637SAnton Vorontsov ev->efd = eventfd; 40170ddf637SAnton Vorontsov ev->level = level; 402b6bb9811SDavid Rientjes ev->mode = mode; 40370ddf637SAnton Vorontsov 40470ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 40570ddf637SAnton Vorontsov list_add(&ev->node, &vmpr->events); 40670ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 407b6bb9811SDavid Rientjes out: 408b6bb9811SDavid Rientjes kfree(spec_orig); 409b6bb9811SDavid Rientjes return ret; 41070ddf637SAnton Vorontsov } 41170ddf637SAnton Vorontsov 41270ddf637SAnton Vorontsov /** 41370ddf637SAnton Vorontsov * vmpressure_unregister_event() - Unbind eventfd from vmpressure 41459b6f873STejun Heo * @memcg: memcg handle 41570ddf637SAnton Vorontsov * @eventfd: eventfd context that was used to link vmpressure with the @cg 41670ddf637SAnton Vorontsov * 41770ddf637SAnton Vorontsov * This function does internal manipulations to detach the @eventfd from 41870ddf637SAnton Vorontsov * the vmpressure notifications, and then frees internal resources 41970ddf637SAnton Vorontsov * associated with the @eventfd (but the @eventfd itself is not freed). 42070ddf637SAnton Vorontsov * 421347c4a87STejun Heo * To be used as memcg event method. 42270ddf637SAnton Vorontsov */ 42359b6f873STejun Heo void vmpressure_unregister_event(struct mem_cgroup *memcg, 42470ddf637SAnton Vorontsov struct eventfd_ctx *eventfd) 42570ddf637SAnton Vorontsov { 42659b6f873STejun Heo struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 42770ddf637SAnton Vorontsov struct vmpressure_event *ev; 42870ddf637SAnton Vorontsov 42970ddf637SAnton Vorontsov mutex_lock(&vmpr->events_lock); 43070ddf637SAnton Vorontsov list_for_each_entry(ev, &vmpr->events, node) { 43170ddf637SAnton Vorontsov if (ev->efd != eventfd) 43270ddf637SAnton Vorontsov continue; 43370ddf637SAnton Vorontsov list_del(&ev->node); 43470ddf637SAnton Vorontsov kfree(ev); 43570ddf637SAnton Vorontsov break; 43670ddf637SAnton Vorontsov } 43770ddf637SAnton Vorontsov mutex_unlock(&vmpr->events_lock); 43870ddf637SAnton Vorontsov } 43970ddf637SAnton Vorontsov 44070ddf637SAnton Vorontsov /** 44170ddf637SAnton Vorontsov * vmpressure_init() - Initialize vmpressure control structure 44270ddf637SAnton Vorontsov * @vmpr: Structure to be initialized 44370ddf637SAnton Vorontsov * 44470ddf637SAnton Vorontsov * This function should be called on every allocated vmpressure structure 44570ddf637SAnton Vorontsov * before any usage. 44670ddf637SAnton Vorontsov */ 44770ddf637SAnton Vorontsov void vmpressure_init(struct vmpressure *vmpr) 44870ddf637SAnton Vorontsov { 44922f2020fSMichal Hocko spin_lock_init(&vmpr->sr_lock); 45070ddf637SAnton Vorontsov mutex_init(&vmpr->events_lock); 45170ddf637SAnton Vorontsov INIT_LIST_HEAD(&vmpr->events); 45270ddf637SAnton Vorontsov INIT_WORK(&vmpr->work, vmpressure_work_fn); 45370ddf637SAnton Vorontsov } 45433cb876eSMichal Hocko 45533cb876eSMichal Hocko /** 45633cb876eSMichal Hocko * vmpressure_cleanup() - shuts down vmpressure control structure 45733cb876eSMichal Hocko * @vmpr: Structure to be cleaned up 45833cb876eSMichal Hocko * 45933cb876eSMichal Hocko * This function should be called before the structure in which it is 46033cb876eSMichal Hocko * embedded is cleaned up. 46133cb876eSMichal Hocko */ 46233cb876eSMichal Hocko void vmpressure_cleanup(struct vmpressure *vmpr) 46333cb876eSMichal Hocko { 46433cb876eSMichal Hocko /* 46533cb876eSMichal Hocko * Make sure there is no pending work before eventfd infrastructure 46633cb876eSMichal Hocko * goes away. 46733cb876eSMichal Hocko */ 46833cb876eSMichal Hocko flush_work(&vmpr->work); 46933cb876eSMichal Hocko } 470