xref: /linux/kernel/cgroup/dmem.c (revision 67da125e30ab17b5b8874eb32882e81cdec17ec8)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
4  * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
5  * Partially based on the rdma and misc controllers, which bear the following copyrights:
6  *
7  * Copyright 2020 Google LLC
8  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9  */
10 
11 #include <linux/cgroup.h>
12 #include <linux/cgroup_dmem.h>
13 #include <linux/list.h>
14 #include <linux/mutex.h>
15 #include <linux/page_counter.h>
16 #include <linux/parser.h>
17 #include <linux/rculist.h>
18 #include <linux/slab.h>
19 
20 struct dmem_cgroup_region {
21 	/**
22 	 * @ref: References keeping the region alive.
23 	 * Keeps the region reference alive after a successful RCU lookup.
24 	 */
25 	struct kref ref;
26 
27 	/** @rcu: RCU head for freeing */
28 	struct rcu_head rcu;
29 
30 	/**
31 	 * @region_node: Linked into &dmem_cgroup_regions list.
32 	 * Protected by RCU and global spinlock.
33 	 */
34 	struct list_head region_node;
35 
36 	/**
37 	 * @pools: List of pools linked to this region.
38 	 * Protected by global spinlock only
39 	 */
40 	struct list_head pools;
41 
42 	/** @size: Size of region, in bytes */
43 	u64 size;
44 
45 	/** @name: Name describing the region, set by dmem_cgroup_register_region() */
46 	char *name;
47 
48 	/**
49 	 * @unregistered: Whether the region is unregistered by its caller.
50 	 * No new pools should be added to the region afterwards.
51 	 */
52 	bool unregistered;
53 };
54 
55 struct dmemcg_state {
56 	struct cgroup_subsys_state css;
57 
58 	struct list_head pools;
59 };
60 
61 struct dmem_cgroup_pool_state {
62 	struct dmem_cgroup_region *region;
63 	struct dmemcg_state *cs;
64 
65 	/* css node, RCU protected against region teardown */
66 	struct list_head	css_node;
67 
68 	/* dev node, no RCU protection required */
69 	struct list_head	region_node;
70 
71 	struct rcu_head rcu;
72 
73 	struct page_counter cnt;
74 
75 	bool inited;
76 };
77 
78 /*
79  * Three operations require locking protection:
80  * - Registering and unregistering a region to/from the global list; requires the global lock.
81  * - Adding a dmem_cgroup_pool_state to a CSS, and removing it when the CSS is freed.
82  * - Adding a dmem_cgroup_pool_state to a region's pool list.
83  *
84  * Since RCU provides enough protection for the most common operations, more
85  * granular locking does not seem worthwhile. Most protection is offered by
86  * RCU and the lockless operation of the page_counter.
87  */
88 static DEFINE_SPINLOCK(dmemcg_lock);
89 static LIST_HEAD(dmem_cgroup_regions);
90 
91 static inline struct dmemcg_state *
92 css_to_dmemcs(struct cgroup_subsys_state *css)
93 {
94 	return container_of(css, struct dmemcg_state, css);
95 }
96 
97 static inline struct dmemcg_state *get_current_dmemcs(void)
98 {
99 	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
100 }
101 
102 static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
103 {
104 	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
105 }
106 
107 static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
108 {
109 	list_del(&pool->region_node);
110 	kfree(pool);
111 }
112 
113 static void
114 set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
115 {
116 	page_counter_set_min(&pool->cnt, val);
117 }
118 
119 static void
120 set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
121 {
122 	page_counter_set_low(&pool->cnt, val);
123 }
124 
125 static void
126 set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
127 {
128 	page_counter_set_max(&pool->cnt, val);
129 }
130 
131 static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
132 {
133 	return pool ? READ_ONCE(pool->cnt.low) : 0;
134 }
135 
136 static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
137 {
138 	return pool ? READ_ONCE(pool->cnt.min) : 0;
139 }
140 
141 static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
142 {
143 	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
144 }
145 
146 static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
147 {
148 	return pool ? page_counter_read(&pool->cnt) : 0;
149 }
150 
151 static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
152 {
153 	set_resource_min(rpool, 0);
154 	set_resource_low(rpool, 0);
155 	set_resource_max(rpool, PAGE_COUNTER_MAX);
156 }
157 
158 static void dmemcs_offline(struct cgroup_subsys_state *css)
159 {
160 	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
161 	struct dmem_cgroup_pool_state *pool;
162 
163 	rcu_read_lock();
164 	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
165 		reset_all_resource_limits(pool);
166 	rcu_read_unlock();
167 }
168 
169 static void dmemcs_free(struct cgroup_subsys_state *css)
170 {
171 	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
172 	struct dmem_cgroup_pool_state *pool, *next;
173 
174 	spin_lock(&dmemcg_lock);
175 	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
176 		/*
177 		 * The pool is dead and all references are 0, so there is no need
178 		 * for RCU protection (list_del_rcu) when removing or freeing it.
179 		 */
180 		list_del(&pool->css_node);
181 		free_cg_pool(pool);
182 	}
183 	spin_unlock(&dmemcg_lock);
184 
185 	kfree(dmemcs);
186 }
187 
188 static struct cgroup_subsys_state *
189 dmemcs_alloc(struct cgroup_subsys_state *parent_css)
190 {
191 	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
192 	if (!dmemcs)
193 		return ERR_PTR(-ENOMEM);
194 
195 	INIT_LIST_HEAD(&dmemcs->pools);
196 	return &dmemcs->css;
197 }
198 
199 static struct dmem_cgroup_pool_state *
200 find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
201 {
202 	struct dmem_cgroup_pool_state *pool;
203 
204 	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
205 		if (pool->region == region)
206 			return pool;
207 
208 	return NULL;
209 }
210 
211 static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
212 {
213 	if (!pool->cnt.parent)
214 		return NULL;
215 
216 	return container_of(pool->cnt.parent, typeof(*pool), cnt);
217 }
218 
219 static void
220 dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
221 				 struct dmem_cgroup_pool_state *test_pool)
222 {
223 	struct page_counter *climit;
224 	struct cgroup_subsys_state *css;
225 	struct dmemcg_state *dmemcg_iter;
226 	struct dmem_cgroup_pool_state *pool, *found_pool;
227 
228 	climit = &limit_pool->cnt;
229 
230 	rcu_read_lock();
231 
232 	css_for_each_descendant_pre(css, &limit_pool->cs->css) {
233 		dmemcg_iter = container_of(css, struct dmemcg_state, css);
234 		found_pool = NULL;
235 
236 		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
237 			if (pool->region == limit_pool->region) {
238 				found_pool = pool;
239 				break;
240 			}
241 		}
242 		if (!found_pool)
243 			continue;
244 
245 		page_counter_calculate_protection(
246 			climit, &found_pool->cnt, true);
247 
248 		if (found_pool == test_pool)
249 			break;
250 	}
251 	rcu_read_unlock();
252 }
253 
254 /**
255  * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
256  * @limit_pool: The pool for which we hit limits
257  * @test_pool: The pool to test eviction from
258  * @ignore_low: Whether to ignore the low watermark protection
259  * @ret_hit_low: Set to true when a retry with @ignore_low set to true may succeed
260  *
261  * This function returns true if we can evict from @test_pool, false if not.
262  * When returning false and @ignore_low is false, @ret_hit_low may
263  * be set to true to indicate this function can be retried with @ignore_low
264  * set to true.
265  *
266  * Return: %true if eviction from @test_pool is allowed, %false otherwise.
267  */
268 bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
269 				      struct dmem_cgroup_pool_state *test_pool,
270 				      bool ignore_low, bool *ret_hit_low)
271 {
272 	struct dmem_cgroup_pool_state *pool = test_pool;
273 	struct page_counter *ctest;
274 	u64 used, min, low;
275 
276 	/* Can always evict from current pool, despite limits */
277 	if (limit_pool == test_pool)
278 		return true;
279 
280 	if (limit_pool) {
281 		if (!parent_dmemcs(limit_pool->cs))
282 			return true;
283 
284 		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
285 			{}
286 
287 		if (!pool)
288 			return false;
289 	} else {
290 		/*
291 		 * If there is no cgroup limiting memory usage, use the root
292 		 * cgroup instead for limit calculations.
293 		 */
294 		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
295 			{}
296 	}
297 
298 	ctest = &test_pool->cnt;
299 
300 	dmem_cgroup_calculate_protection(limit_pool, test_pool);
301 
302 	used = page_counter_read(ctest);
303 	min = READ_ONCE(ctest->emin);
304 
305 	if (used <= min)
306 		return false;
307 
308 	if (!ignore_low) {
309 		low = READ_ONCE(ctest->elow);
310 		if (used > low)
311 			return true;
312 
313 		*ret_hit_low = true;
314 		return false;
315 	}
316 	return true;
317 }
318 EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
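/*
 * Editor's sketch (not part of the original file): the two-pass eviction walk
 * described in the kernel-doc above. The buffer structure, the LRU list and
 * example_evict_until_ok() are hypothetical driver-side constructs; only the
 * dmem_cgroup_state_evict_valuable() call reflects this file's API.
 */
struct example_buffer {
	struct list_head lru_node;
	struct dmem_cgroup_pool_state *pool;	/* pool the buffer was charged to */
};

static void example_evict_until_ok(struct list_head *lru,
				   struct dmem_cgroup_pool_state *limit_pool)
{
	struct example_buffer *buf;
	bool ignore_low = false, hit_low = false;

retry:
	list_for_each_entry(buf, lru, lru_node) {
		/* Skip buffers whose cgroups are still within their protection. */
		if (!dmem_cgroup_state_evict_valuable(limit_pool, buf->pool,
						      ignore_low, &hit_low))
			continue;

		/* A real driver would evict or shrink the buffer here. */
	}

	/* Second pass, now allowed to dig into the low watermark protection. */
	if (!ignore_low && hit_low) {
		ignore_low = true;
		goto retry;
	}
}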
319 
320 static struct dmem_cgroup_pool_state *
321 alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
322 		  struct dmem_cgroup_pool_state **allocpool)
323 {
324 	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
325 	struct dmem_cgroup_pool_state *pool, *ppool = NULL;
326 
327 	if (!*allocpool) {
328 		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
329 		if (!pool)
330 			return ERR_PTR(-ENOMEM);
331 	} else {
332 		pool = *allocpool;
333 		*allocpool = NULL;
334 	}
335 
336 	pool->region = region;
337 	pool->cs = dmemcs;
338 
339 	if (parent)
340 		ppool = find_cg_pool_locked(parent, region);
341 
342 	page_counter_init(&pool->cnt,
343 			  ppool ? &ppool->cnt : NULL, true);
344 	reset_all_resource_limits(pool);
345 
346 	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
347 	list_add_tail(&pool->region_node, &region->pools);
348 
349 	if (!parent)
350 		pool->inited = true;
351 	else
352 		pool->inited = ppool ? ppool->inited : false;
353 	return pool;
354 }
355 
356 static struct dmem_cgroup_pool_state *
357 get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
358 		   struct dmem_cgroup_pool_state **allocpool)
359 {
360 	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
361 	struct dmemcg_state *p, *pp;
362 
363 	/*
364 	 * Recursively create the pools; they may not be initialized yet while
365 	 * recursing up the hierarchy, that is done as a separate step below.
366 	 */
367 	for (p = dmemcs; p; p = parent_dmemcs(p)) {
368 		pool = find_cg_pool_locked(p, region);
369 		if (!pool)
370 			pool = alloc_pool_single(p, region, allocpool);
371 
372 		if (IS_ERR(pool))
373 			return pool;
374 
375 		if (p == dmemcs && pool->inited)
376 			return pool;
377 
378 		if (pool->inited)
379 			break;
380 	}
381 
382 	retpool = pool = find_cg_pool_locked(dmemcs, region);
383 	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
384 		if (pool->inited)
385 			break;
386 
387 		/* ppool was created if it didn't exist by above loop. */
388 		/* The loop above created ppool if it did not already exist. */
389 
390 		/* Fix up parent links, mark as inited. */
391 		pool->cnt.parent = &ppool->cnt;
392 		pool->inited = true;
393 
394 		pool = ppool;
395 	}
396 
397 	return retpool;
398 }
399 
400 static void dmemcg_free_rcu(struct rcu_head *rcu)
401 {
402 	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
403 	struct dmem_cgroup_pool_state *pool, *next;
404 
405 	list_for_each_entry_safe(pool, next, &region->pools, region_node)
406 		free_cg_pool(pool);
407 	kfree(region->name);
408 	kfree(region);
409 }
410 
411 static void dmemcg_free_region(struct kref *ref)
412 {
413 	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);
414 
415 	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
416 }
417 
418 /**
419  * dmem_cgroup_unregister_region() - Unregister a previously registered region.
420  * @region: The region to unregister.
421  *
422  * This function undoes dmem_cgroup_register_region().
423  */
424 void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
425 {
426 	struct list_head *entry;
427 
428 	if (!region)
429 		return;
430 
431 	spin_lock(&dmemcg_lock);
432 
433 	/* Remove from global region list */
434 	list_del_rcu(&region->region_node);
435 
436 	list_for_each_rcu(entry, &region->pools) {
437 		struct dmem_cgroup_pool_state *pool =
438 			container_of(entry, typeof(*pool), region_node);
439 
440 		list_del_rcu(&pool->css_node);
441 	}
442 
443 	/*
444 	 * Ensure any RCU based lookups fail. Additionally,
445 	 * no new pools should be added to the dead region
446 	 * by get_cg_pool_unlocked.
447 	 */
448 	region->unregistered = true;
449 	spin_unlock(&dmemcg_lock);
450 
451 	kref_put(&region->ref, dmemcg_free_region);
452 }
453 EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
454 
455 /**
456  * dmem_cgroup_register_region() - Register a region with the dmem cgroup.
457  * @size: Size of the region to register, in bytes.
458  * @fmt: printf() style format string for the region name, followed by its arguments.
459  *
460  * This function registers a region with the dmem cgroup controller under the
461  * given name. After calling this function, the region can be used for
462  * charging allocations.
463  *
464  * Return: %NULL if @size is zero, a valid region pointer on success, or an ERR_PTR() on failure.
465  */
466 struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
467 {
468 	struct dmem_cgroup_region *ret;
469 	char *region_name;
470 	va_list ap;
471 
472 	if (!size)
473 		return NULL;
474 
475 	va_start(ap, fmt);
476 	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
477 	va_end(ap);
478 	if (!region_name)
479 		return ERR_PTR(-ENOMEM);
480 
481 	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
482 	if (!ret) {
483 		kfree(region_name);
484 		return ERR_PTR(-ENOMEM);
485 	}
486 
487 	INIT_LIST_HEAD(&ret->pools);
488 	ret->name = region_name;
489 	ret->size = size;
490 	kref_init(&ret->ref);
491 
492 	spin_lock(&dmemcg_lock);
493 	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
494 	spin_unlock(&dmemcg_lock);
495 
496 	return ret;
497 }
498 EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
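/*
 * Editor's sketch (not part of the original file): how a driver could pair
 * dmem_cgroup_register_region() with dmem_cgroup_unregister_region(). The
 * "example/vram0" name and the 256 MiB size are made up for illustration.
 */
static struct dmem_cgroup_region *example_region;

static int example_region_init(void)
{
	struct dmem_cgroup_region *region;

	region = dmem_cgroup_register_region(256ULL << 20, "example/%s", "vram0");
	if (IS_ERR(region))
		return PTR_ERR(region);

	/* A NULL return (only possible for a zero size) means nothing is accounted. */
	example_region = region;
	return 0;
}

static void example_region_fini(void)
{
	/* Safe to call with NULL; undoes the registration above. */
	dmem_cgroup_unregister_region(example_region);
	example_region = NULL;
}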
499 
500 static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
501 {
502 	struct dmem_cgroup_region *region;
503 
504 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
505 		if (!strcmp(name, region->name) &&
506 		    kref_get_unless_zero(&region->ref))
507 			return region;
508 
509 	return NULL;
510 }
511 
512 /**
513  * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
514  * @pool: &dmem_cgroup_pool_state
515  *
516  * Called to drop a reference to the limiting pool returned by
517  * dmem_cgroup_try_charge().
518  */
519 void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
520 {
521 	if (pool)
522 		css_put(&pool->cs->css);
523 }
524 EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
525 
526 static struct dmem_cgroup_pool_state *
527 get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
528 {
529 	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;
530 
531 	/* Fast path: try a lockless RCU lookup first. */
532 	rcu_read_lock();
533 	pool = find_cg_pool_locked(cg, region);
534 	if (pool && !READ_ONCE(pool->inited))
535 		pool = NULL;
536 	rcu_read_unlock();
537 
538 	while (!pool) {
539 		spin_lock(&dmemcg_lock);
540 		if (!region->unregistered)
541 			pool = get_cg_pool_locked(cg, region, &allocpool);
542 		else
543 			pool = ERR_PTR(-ENODEV);
544 		spin_unlock(&dmemcg_lock);
545 
546 		if (pool == ERR_PTR(-ENOMEM)) {
547 			pool = NULL;
548 			if (WARN_ON(allocpool))
549 				continue;
550 
551 			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
552 			if (allocpool) {
553 				pool = NULL;
554 				continue;
555 			}
556 		}
557 	}
558 
559 	kfree(allocpool);
560 	return pool;
561 }
562 
563 /**
564  * dmem_cgroup_uncharge() - Uncharge a pool.
565  * @pool: Pool to uncharge.
566  * @size: Size to uncharge.
567  *
568  * Undoes the effects of dmem_cgroup_try_charge().
569  * Must be called with the pool returned by dmem_cgroup_try_charge()
570  * as the @pool argument, and with the same @size.
571  */
572 void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
573 {
574 	if (!pool)
575 		return;
576 
577 	page_counter_uncharge(&pool->cnt, size);
578 	css_put(&pool->cs->css);
579 }
580 EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
581 
582 /**
583  * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
584  * @region: dmem region to charge
585  * @size: Size (in bytes) to charge.
586  * @ret_pool: On successful allocation, the pool that was charged.
587  * @ret_limit_pool: On a failed allocation, the limiting pool.
588  *
589  * This function charges @region for an allocation of @size bytes.
590  *
591  * If the function succeeds, @ret_pool is set, which must be passed to
592  * dmem_cgroup_uncharge() when undoing the allocation.
593  *
594  * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
595  * will be set to the pool for which the limit was hit. This can be passed to
596  * dmem_cgroup_state_evict_valuable() when choosing what to evict. The reference
597  * must be dropped with dmem_cgroup_pool_state_put().
598  *
599  * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
600  */
601 int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
602 			  struct dmem_cgroup_pool_state **ret_pool,
603 			  struct dmem_cgroup_pool_state **ret_limit_pool)
604 {
605 	struct dmemcg_state *cg;
606 	struct dmem_cgroup_pool_state *pool;
607 	struct page_counter *fail;
608 	int ret;
609 
610 	*ret_pool = NULL;
611 	if (ret_limit_pool)
612 		*ret_limit_pool = NULL;
613 
614 	/*
615 	 * hold on to css, as cgroup can be removed but resource
616 	 * accounting happens on css.
617 	 */
618 	cg = get_current_dmemcs();
619 
620 	pool = get_cg_pool_unlocked(cg, region);
621 	if (IS_ERR(pool)) {
622 		ret = PTR_ERR(pool);
623 		goto err;
624 	}
625 
626 	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
627 		if (ret_limit_pool) {
628 			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
629 			css_get(&(*ret_limit_pool)->cs->css);
630 		}
631 		ret = -EAGAIN;
632 		goto err;
633 	}
634 
635 	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
636 	*ret_pool = pool;
637 	return 0;
638 
639 err:
640 	css_put(&cg->css);
641 	return ret;
642 }
643 EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
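/*
 * Editor's sketch (not part of the original file): the charge/uncharge pattern
 * described above. example_charge()/example_uncharge() are hypothetical; the
 * dmem_cgroup_* calls and the -EAGAIN handling follow this file's API.
 */
static int example_charge(struct dmem_cgroup_region *region, u64 size,
			  struct dmem_cgroup_pool_state **ret_pool)
{
	struct dmem_cgroup_pool_state *limit_pool = NULL;
	int ret;

	ret = dmem_cgroup_try_charge(region, size, ret_pool, &limit_pool);
	if (ret == -EAGAIN) {
		/*
		 * The limit was hit in limit_pool. A driver would typically
		 * evict buffers for which dmem_cgroup_state_evict_valuable()
		 * returns true and then retry the charge. Drop the reference
		 * once it is no longer needed.
		 */
		dmem_cgroup_pool_state_put(limit_pool);
	}

	return ret;
}

static void example_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	/* Pairs with a successful example_charge() of the same size. */
	dmem_cgroup_uncharge(pool, size);
}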
644 
645 static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
646 {
647 	struct dmem_cgroup_region *region;
648 
649 	rcu_read_lock();
650 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
651 		seq_puts(sf, region->name);
652 		seq_printf(sf, " %llu\n", region->size);
653 	}
654 	rcu_read_unlock();
655 	return 0;
656 }
657 
658 static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
659 			      u64 *new_limit)
660 {
661 	char *end;
662 
663 	if (!strcmp(options, "max")) {
664 		*new_limit = PAGE_COUNTER_MAX;
665 		return 0;
666 	}
667 
668 	*new_limit = memparse(options, &end);
669 	if (*end != '\0')
670 		return -EINVAL;
671 
672 	return 0;
673 }
674 
675 static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
676 				 char *buf, size_t nbytes, loff_t off,
677 				 void (*apply)(struct dmem_cgroup_pool_state *, u64))
678 {
679 	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
680 	int err = 0;
681 
682 	while (buf && !err) {
683 		struct dmem_cgroup_pool_state *pool = NULL;
684 		char *options, *region_name;
685 		struct dmem_cgroup_region *region;
686 		u64 new_limit;
687 
688 		options = buf;
689 		buf = strchr(buf, '\n');
690 		if (buf)
691 			*buf++ = '\0';
692 
693 		options = strstrip(options);
694 
695 		/* eat empty lines */
696 		if (!options[0])
697 			continue;
698 
699 		region_name = strsep(&options, " \t");
700 		if (!region_name[0])
701 			continue;
702 
703 		rcu_read_lock();
704 		region = dmemcg_get_region_by_name(region_name);
705 		rcu_read_unlock();
706 
707 		if (!region)
708 			return -EINVAL;
709 
710 		err = dmemcg_parse_limit(options, region, &new_limit);
711 		if (err < 0)
712 			goto out_put;
713 
714 		pool = get_cg_pool_unlocked(dmemcs, region);
715 		if (IS_ERR(pool)) {
716 			err = PTR_ERR(pool);
717 			goto out_put;
718 		}
719 
720 		/* And commit */
721 		apply(pool, new_limit);
722 
723 out_put:
724 		kref_put(&region->ref, dmemcg_free_region);
725 	}
726 
727 
728 	return err ?: nbytes;
729 }
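/*
 * Editor's note (derived from dmemcg_parse_limit() and dmemcg_limit_write()
 * above): each line written to dmem.min, dmem.low or dmem.max has the form
 * "<region name> <limit>", where <limit> is either "max" or a memparse()
 * string such as "8388608", "8192K" or "1G". With a hypothetical region
 * named "example/vram0":
 *
 *   echo "example/vram0 512M" > dmem.max
 *   echo "example/vram0 max" > dmem.max
 */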
730 
731 static int dmemcg_limit_show(struct seq_file *sf, void *v,
732 			    u64 (*fn)(struct dmem_cgroup_pool_state *))
733 {
734 	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
735 	struct dmem_cgroup_region *region;
736 
737 	rcu_read_lock();
738 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
739 		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
740 		u64 val;
741 
742 		seq_puts(sf, region->name);
743 
744 		val = fn(pool);
745 		if (val < PAGE_COUNTER_MAX)
746 			seq_printf(sf, " %lld\n", val);
747 		else
748 			seq_puts(sf, " max\n");
749 	}
750 	rcu_read_unlock();
751 
752 	return 0;
753 }
754 
755 static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
756 {
757 	return dmemcg_limit_show(sf, v, get_resource_current);
758 }
759 
760 static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
761 {
762 	return dmemcg_limit_show(sf, v, get_resource_min);
763 }
764 
765 static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
766 				      char *buf, size_t nbytes, loff_t off)
767 {
768 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
769 }
770 
771 static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
772 {
773 	return dmemcg_limit_show(sf, v, get_resource_low);
774 }
775 
776 static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
777 				      char *buf, size_t nbytes, loff_t off)
778 {
779 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
780 }
781 
782 static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
783 {
784 	return dmemcg_limit_show(sf, v, get_resource_max);
785 }
786 
787 static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
788 				      char *buf, size_t nbytes, loff_t off)
789 {
790 	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
791 }
792 
793 static struct cftype files[] = {
794 	{
795 		.name = "capacity",
796 		.seq_show = dmem_cgroup_region_capacity_show,
797 		.flags = CFTYPE_ONLY_ON_ROOT,
798 	},
799 	{
800 		.name = "current",
801 		.seq_show = dmem_cgroup_region_current_show,
802 	},
803 	{
804 		.name = "min",
805 		.write = dmem_cgroup_region_min_write,
806 		.seq_show = dmem_cgroup_region_min_show,
807 		.flags = CFTYPE_NOT_ON_ROOT,
808 	},
809 	{
810 		.name = "low",
811 		.write = dmem_cgroup_region_low_write,
812 		.seq_show = dmem_cgroup_region_low_show,
813 		.flags = CFTYPE_NOT_ON_ROOT,
814 	},
815 	{
816 		.name = "max",
817 		.write = dmem_cgroup_region_max_write,
818 		.seq_show = dmem_cgroup_region_max_show,
819 		.flags = CFTYPE_NOT_ON_ROOT,
820 	},
821 	{ } /* Zero entry terminates. */
822 };
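/*
 * Editor's note: with the dmem controller enabled, the cftypes above appear in
 * each cgroup directory as dmem.capacity (root only), dmem.current, dmem.min,
 * dmem.low and dmem.max (the last three not on the root cgroup).
 * dmem.capacity prints one "<region name> <size>" line per registered region;
 * the other files print "<region name> <value>" lines, with "max" shown when
 * the value is unlimited.
 */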
823 
824 struct cgroup_subsys dmem_cgrp_subsys = {
825 	.css_alloc	= dmemcs_alloc,
826 	.css_free	= dmemcs_free,
827 	.css_offline	= dmemcs_offline,
828 	.legacy_cftypes	= files,
829 	.dfl_cftypes	= files,
830 };
831