// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
 * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
 * Partially based on the rdma and misc controllers, which bear the following copyrights:
 *
 * Copyright 2020 Google LLC
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/cgroup.h>
#include <linux/cgroup_dmem.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
#include <linux/refcount.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct dmem_cgroup_region {
	/**
	 * @ref: Reference count keeping the region alive.
	 * Keeps the region alive after a successful RCU lookup.
	 */
	struct kref ref;

	/** @rcu: RCU head for freeing */
	struct rcu_head rcu;

	/**
	 * @region_node: Linked into &dmem_cgroup_regions list.
	 * Protected by RCU and global spinlock.
	 */
	struct list_head region_node;

	/**
	 * @pools: List of pools linked to this region.
	 * Protected by the global spinlock only.
	 */
	struct list_head pools;

	/** @size: Size of region, in bytes */
	u64 size;

	/** @name: Name describing the region, set by dmem_cgroup_register_region() */
	char *name;

	/**
	 * @unregistered: Whether the region has been unregistered by its caller.
	 * No new pools may be added to the region afterwards.
	 */
	bool unregistered;
};

struct dmemcg_state {
	struct cgroup_subsys_state css;

	struct list_head pools;
};

struct dmem_cgroup_pool_state {
	struct dmem_cgroup_region *region;
	struct dmemcg_state *cs;

	/* css node, RCU protected against region teardown */
	struct list_head	css_node;

	/* dev node, no RCU protection required */
	struct list_head	region_node;

	struct rcu_head rcu;

	struct page_counter cnt;
	struct dmem_cgroup_pool_state *parent;

	refcount_t ref;
	bool inited;
};

/*
 * Three operations require locking protection:
 * - Registering and unregistering a region to/from the global list, which
 *   requires the global lock.
 * - Adding a dmem_cgroup_pool_state to a CSS, and removing it when the CSS
 *   is freed.
 * - Adding a dmem_cgroup_pool_state to a region list.
 *
 * Since RCU provides enough protection for the most common operations,
 * more granular locking does not seem worthwhile. Most protection is
 * offered by RCU together with the lockless page_counter.
 */
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);

static void dmemcg_free_region(struct kref *ref);
static void dmemcg_pool_free_rcu(struct rcu_head *rcu);
static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	return container_of(css, struct dmemcg_state, css);
}

static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}

static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
{
	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}

static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool)
{
	refcount_inc(&pool->ref);
}

static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool)
{
	return refcount_inc_not_zero(&pool->ref);
}

static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool)
{
	if (!refcount_dec_and_test(&pool->ref))
		return;

	call_rcu(&pool->rcu, dmemcg_pool_free_rcu);
}

static void dmemcg_pool_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu);

	if (pool->parent)
		dmemcg_pool_put(pool->parent);
	kref_put(&pool->region->ref, dmemcg_free_region);
	kfree(pool);
}

static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	list_del(&pool->region_node);
	dmemcg_pool_put(pool);
}

static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}

static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}

static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}

static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}

static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}

static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}

static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}

static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}

static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}

static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		/*
		 * The pool is dead and all references are 0,
		 * no need for RCU protection with list_del_rcu or freeing.
		 */
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}

static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);

	if (!dmemcs)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dmemcs->pools);
	return &dmemcs->css;
}

static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}

static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
{
	if (!pool->cnt.parent)
		return NULL;

	return container_of(pool->cnt.parent, typeof(*pool), cnt);
}

static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *found_pool;

	climit = &limit_pool->cnt;

	rcu_read_lock();

	css_for_each_descendant_pre(css, &limit_pool->cs->css) {
		dmemcg_iter = container_of(css, struct dmemcg_state, css);
		found_pool = NULL;

		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool->region == limit_pool->region) {
				found_pool = pool;
				break;
			}
		}
		if (!found_pool)
			continue;

		page_counter_calculate_protection(
			climit, &found_pool->cnt, true);

		if (found_pool == test_pool)
			break;
	}
	rcu_read_unlock();
}

/**
 * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
 * @limit_pool: The pool for which we hit limits
 * @test_pool: The pool to test for eviction
 * @ignore_low: Whether the low watermark should be ignored
 * @ret_hit_low: Set to true when eviction was blocked by the low watermark
 * and a retry with @ignore_low set to true makes sense
 *
 * This function returns true if we can evict from @test_pool, false if not.
 * When returning false and @ignore_low is false, @ret_hit_low may
 * be set to true to indicate this function can be retried with @ignore_low
 * set to true.
 *
 * Return: true if eviction from @test_pool is worthwhile, false otherwise.
 */
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	struct dmem_cgroup_pool_state *pool = test_pool;
	struct page_counter *ctest;
	u64 used, min, low;

	/* Can always evict from the current pool, despite limits */
	if (limit_pool == test_pool)
		return true;

	if (limit_pool) {
		if (!parent_dmemcs(limit_pool->cs))
			return true;

		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
			{}

		if (!pool)
			return false;
	} else {
		/*
		 * If there is no cgroup limiting memory usage, use the root
		 * cgroup instead for limit calculations.
		 */
		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
			{}
	}

	ctest = &test_pool->cnt;

	dmem_cgroup_calculate_protection(limit_pool, test_pool);

	used = page_counter_read(ctest);
	min = READ_ONCE(ctest->emin);

	if (used <= min)
		return false;

	if (!ignore_low) {
		low = READ_ONCE(ctest->elow);
		if (used > low)
			return true;

		*ret_hit_low = true;
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
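
/*
 * Illustrative sketch only (not part of this file's API): a driver that hit
 * its dmem limit would typically call the helper above from its eviction
 * loop to decide which buffers may be reclaimed, retrying with @ignore_low
 * once the low watermark blocked the first pass. All driver-side names
 * (my_dev, my_bo, my_evict_one) are hypothetical.
 *
 *	struct dmem_cgroup_pool_state *limit_pool, *pool;
 *	bool hit_low = false, ignore_low = false;
 *
 *	// limit_pool was returned by a failed dmem_cgroup_try_charge()
 * retry:
 *	list_for_each_entry(my_bo, &my_dev->lru, lru_node) {
 *		pool = my_bo->pool;	// pool this buffer was charged to
 *		if (!dmem_cgroup_state_evict_valuable(limit_pool, pool,
 *						      ignore_low, &hit_low))
 *			continue;
 *		my_evict_one(my_bo);
 *	}
 *	if (hit_low && !ignore_low) {
 *		ignore_low = true;	// second pass may dip below "low"
 *		goto retry;
 *	}
 */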

static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		  struct dmem_cgroup_pool_state **allocpool)
{
	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
	struct dmem_cgroup_pool_state *pool, *ppool = NULL;

	if (!*allocpool) {
		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
		if (!pool)
			return ERR_PTR(-ENOMEM);
	} else {
		pool = *allocpool;
		*allocpool = NULL;
	}

	pool->region = region;
	pool->cs = dmemcs;

	if (parent)
		ppool = find_cg_pool_locked(parent, region);

	page_counter_init(&pool->cnt,
			  ppool ? &ppool->cnt : NULL, true);
	reset_all_resource_limits(pool);
	refcount_set(&pool->ref, 1);
	kref_get(&region->ref);
	if (ppool && !pool->parent) {
		pool->parent = ppool;
		dmemcg_pool_get(ppool);
	}

	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
	list_add_tail(&pool->region_node, &region->pools);

	if (!parent)
		pool->inited = true;
	else
		pool->inited = ppool ? ppool->inited : false;
	return pool;
}

static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/*
	 * Recursively create the pools. A pool may not be fully initialized
	 * while recursing; initialization is done as a separate step below.
	 */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		/* The loop above created ppool if it did not already exist. */
		ppool = find_cg_pool_locked(pp, region);

		/* Fix up parent links, mark as inited. */
		pool->cnt.parent = &ppool->cnt;
		if (ppool && !pool->parent) {
			pool->parent = ppool;
			dmemcg_pool_get(ppool);
		}
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}

static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}

static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);

	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
}

/**
 * dmem_cgroup_unregister_region() - Unregister a previously registered region.
 * @region: The region to unregister.
 *
 * This function undoes dmem_cgroup_register_region.
 */
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *next;

	if (!region)
		return;

	spin_lock(&dmemcg_lock);

	/* Remove from global region list */
	list_del_rcu(&region->region_node);

	list_for_each_entry_safe(pool, next, &region->pools, region_node) {
		list_del_rcu(&pool->css_node);
		list_del(&pool->region_node);
		dmemcg_pool_put(pool);
	}

	/*
	 * Ensure any RCU based lookups fail. Additionally,
	 * no new pools should be added to the dead region
	 * by get_cg_pool_unlocked.
	 */
	region->unregistered = true;
	spin_unlock(&dmemcg_lock);

	kref_put(&region->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);

/**
 * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
 * @size: Size of the region to register, in bytes.
 * @fmt: Printf-style format used to build the region name.
 *
 * This function registers a region with the dmem cgroup under the
 * given name. After calling this function, the region can be
 * used for allocations.
 *
 * Return: NULL if @size is zero, a pointer to the new region on success,
 * or an ERR_PTR() on failure.
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *ret;
	char *region_name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!region_name)
		return ERR_PTR(-ENOMEM);

	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret) {
		kfree(region_name);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&ret->pools);
	ret->name = region_name;
	ret->size = size;
	kref_init(&ret->ref);

	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
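
/*
 * Illustrative sketch only (not part of this file): a device driver would
 * typically register one region per memory resource at probe time and
 * unregister it on removal. The driver-side names (my_dev, vram_size) and
 * the name format used here are hypothetical.
 *
 *	my_dev->region = dmem_cgroup_register_region(vram_size,
 *						     "drm/%s/vram0",
 *						     dev_name(my_dev->dev));
 *	if (IS_ERR(my_dev->region))
 *		return PTR_ERR(my_dev->region);
 *
 *	...
 *
 *	// On device removal:
 *	dmem_cgroup_unregister_region(my_dev->region);
 */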

static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}

/**
 * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
 * @pool: &dmem_cgroup_pool_state
 *
 * Called to drop a reference to the limiting pool returned by
 * dmem_cgroup_try_charge().
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool) {
		css_put(&pool->cs->css);
		dmemcg_pool_put(pool);
	}
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);

static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* Fast path: try a lockless RCU lookup first. */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	if (pool && !dmemcg_pool_tryget(pool))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		if (!IS_ERR(pool))
			dmemcg_pool_get(pool);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			if (WARN_ON(allocpool))
				continue;

			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	kfree(allocpool);
	return pool;
}

/**
 * dmem_cgroup_uncharge() - Uncharge a pool.
 * @pool: Pool to uncharge.
 * @size: Size to uncharge.
 *
 * Undoes the effects of dmem_cgroup_try_charge().
 * Must be called with the pool returned by dmem_cgroup_try_charge()
 * and the same @size that was charged.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	css_put(&pool->cs->css);
	dmemcg_pool_put(pool);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);

/**
 * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
 * @region: dmem region to charge
 * @size: Size (in bytes) to charge.
 * @ret_pool: On successful allocation, the pool that is charged.
 * @ret_limit_pool: On a failed allocation, the limiting pool.
 *
 * This function charges @size bytes to @region.
 *
 * If the function succeeds, @ret_pool is set, which must be passed to
 * dmem_cgroup_uncharge() when undoing the allocation.
 *
 * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
 * will be set to the pool for which the limit is hit. This can be used for
 * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
 * must be freed with dmem_cgroup_pool_state_put().
 *
 * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			  struct dmem_cgroup_pool_state **ret_pool,
			  struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/*
	 * Hold on to the css, as the cgroup can be removed but resource
	 * accounting happens on the css.
	 */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
			dmemcg_pool_get(*ret_limit_pool);
		}
		dmemcg_pool_put(pool);
		ret = -EAGAIN;
		goto err;
	}

	/* On success, the reference from get_current_dmemcs() is transferred to *ret_pool */
	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
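
/*
 * Illustrative sketch only (not part of this file): the expected
 * charge/uncharge pairing in a driver allocation path. The names my_dev
 * and my_bo are hypothetical.
 *
 *	struct dmem_cgroup_pool_state *pool, *limit_pool = NULL;
 *	int ret;
 *
 *	ret = dmem_cgroup_try_charge(my_dev->region, my_bo->size,
 *				     &pool, &limit_pool);
 *	if (ret == -EAGAIN) {
 *		// Over the limit: evict based on limit_pool, then retry.
 *		dmem_cgroup_pool_state_put(limit_pool);
 *	} else if (ret) {
 *		return ret;
 *	}
 *	my_bo->pool = pool;
 *	...
 *	// On free:
 *	dmem_cgroup_uncharge(my_bo->pool, my_bo->size);
 */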

static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
{
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		seq_puts(sf, region->name);
		seq_printf(sf, " %llu\n", region->size);
	}
	rcu_read_unlock();
	return 0;
}

static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
{
	char *end;

	if (!strcmp(options, "max")) {
		*new_limit = PAGE_COUNTER_MAX;
		return 0;
	}

	*new_limit = memparse(options, &end);
	if (*end != '\0')
		return -EINVAL;

	return 0;
}

static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off,
				 void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* eat empty lines */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		if (!options || !*options)
			return -EINVAL;

		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* And commit */
		apply(pool, new_limit);
		dmemcg_pool_put(pool);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}

	return err ?: nbytes;
}

static int dmemcg_limit_show(struct seq_file *sf, void *v,
			    u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}

static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}

static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}

static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}

static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}

static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}

static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}

static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}

static struct cftype files[] = {
	{
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ } /* Zero entry terminates. */
};
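
/*
 * Userspace interface sketch (illustrative; the cgroup-v2 documentation is
 * the authoritative reference). Limits are written as "<region name> <value>"
 * lines, where <value> is a memparse()-style size or "max". The region name
 * below is hypothetical.
 *
 *	# cat dmem.capacity
 *	drm/0000:03:00.0/vram0 8589934592
 *	# echo "drm/0000:03:00.0/vram0 1G" > dmem.max
 *	# echo "drm/0000:03:00.0/vram0 max" > dmem.max
 */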

struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};
887