xref: /linux/kernel/cgroup/dmem.c (revision 2b624a2c18656ea32e0849e7bc0018ba3c97ca64)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
4  * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
5  * Partially based on the rdma and misc controllers, which bear the following copyrights:
6  *
7  * Copyright 2020 Google LLC
8  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9  */
10 
11 #include <linux/cgroup.h>
12 #include <linux/cgroup_dmem.h>
13 #include <linux/list.h>
14 #include <linux/mutex.h>
15 #include <linux/page_counter.h>
16 #include <linux/parser.h>
17 #include <linux/slab.h>
18 
struct dmem_cgroup_region {
	/**
	 * @ref: References keeping the region alive.
	 * Keeps the region reference alive after a successful RCU lookup.
	 */
	struct kref ref;

	/** @rcu: RCU head for deferred freeing of the region. */
	struct rcu_head rcu;

	/**
	 * @region_node: Linked into &dmem_cgroup_regions list.
	 * Protected by RCU and global spinlock.
	 */
	struct list_head region_node;

	/**
	 * @pools: List of pools linked to this region.
	 * Protected by global spinlock only.
	 */
	struct list_head pools;

	/** @size: Size of region, in bytes. */
	u64 size;

	/** @name: Name describing the node, set by dmem_cgroup_register_region(). */
	char *name;

	/**
	 * @unregistered: Whether the region is unregistered by its caller.
	 * No new pools should be added to the region afterwards.
	 */
	bool unregistered;
};
53 
/* Per-cgroup state of the dmem controller, embedding the generic css. */
struct dmemcg_state {
	struct cgroup_subsys_state css;

	/* List of &dmem_cgroup_pool_state, one per region charged by this cgroup. */
	struct list_head pools;
};
59 
/*
 * Charge-tracking state for one (cgroup, region) pair. Linked into both
 * the owning css (@css_node) and the region (@region_node).
 */
struct dmem_cgroup_pool_state {
	/* Region this pool charges against. */
	struct dmem_cgroup_region *region;
	/* Owning cgroup's dmem state. */
	struct dmemcg_state *cs;

	/* css node, RCU protected against region teardown */
	struct list_head	css_node;

	/* dev node, no RCU protection required */
	struct list_head	region_node;

	/* For deferred freeing of the pool. */
	struct rcu_head rcu;

	/* Usage/limit counter; its parent link mirrors the cgroup hierarchy. */
	struct page_counter cnt;

	/* True once @cnt's parent link is valid; see get_cg_pool_locked(). */
	bool inited;
};
76 
/*
 * 3 operations require locking protection:
 * - Registering and unregistering region to/from list, requires global lock.
 * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
 * - Adding a dmem_cgroup_pool_state to a region list.
 *
 * Since for the most common operations RCU provides enough protection, I
 * do not think more granular locking makes sense. Most protection is offered
 * by RCU and the lockless operation of page_counter.
 */
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);
89 
/* Convert a generic css pointer to its containing dmemcg_state. */
static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	return container_of(css, struct dmemcg_state, css);
}
95 
/*
 * Get the dmem state of the current task's cgroup.
 * task_get_css() takes a css reference; the caller must drop it with
 * css_put() (see the error path of dmem_cgroup_try_charge()).
 */
static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}
100 
101 static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
102 {
103 	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
104 }
105 
/* Unlink @pool from its region's pool list and free it. */
static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	list_del(&pool->region_node);
	kfree(pool);
}
111 
/* Set the min protection watermark on @pool's counter. */
static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}
117 
/* Set the low protection watermark on @pool's counter. */
static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}
123 
/* Set the hard limit on @pool's counter. */
static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}
129 
/* Return the low watermark, or 0 when no pool exists for the region. */
static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}
134 
/* Return the min watermark, or 0 when no pool exists for the region. */
static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}
139 
/* Return the max limit; PAGE_COUNTER_MAX ("no limit") when no pool exists. */
static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}
144 
/* Return the current usage, or 0 when no pool exists for the region. */
static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}
149 
/* Restore defaults: no protection (min/low = 0) and no hard limit. */
static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}
156 
/*
 * css_offline callback: reset every pool of the dying cgroup to default
 * limits. The pools themselves are freed later, in dmemcs_free().
 */
static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}
167 
/*
 * css_free callback: unlink and free all remaining pools of this cgroup,
 * then free the per-cgroup state itself.
 */
static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		/*
		 * The pool is dead and all references are 0,
		 * no need for RCU protection with list_del_rcu or freeing.
		 */
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}
186 
187 static struct cgroup_subsys_state *
188 dmemcs_alloc(struct cgroup_subsys_state *parent_css)
189 {
190 	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
191 	if (!dmemcs)
192 		return ERR_PTR(-ENOMEM);
193 
194 	INIT_LIST_HEAD(&dmemcs->pools);
195 	return &dmemcs->css;
196 }
197 
/*
 * Find the pool of @dmemcs that charges @region, or NULL if none exists.
 * Caller must hold dmemcg_lock or the RCU read lock; the lockdep condition
 * passed to the list iterator accepts either.
 */
static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}
209 
210 static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
211 {
212 	if (!pool->cnt.parent)
213 		return NULL;
214 
215 	return container_of(pool->cnt.parent, typeof(*pool), cnt);
216 }
217 
/*
 * Walk the css hierarchy from @limit_pool towards @test_pool and have
 * page_counter_calculate_protection() compute the effective protection
 * values (emin/elow) of each pool visited. Called before those values are
 * read in dmem_cgroup_state_evict_valuable().
 */
static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css, *next_css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *parent_pool;
	bool found_descendant;

	climit = &limit_pool->cnt;

	rcu_read_lock();
	parent_pool = pool = limit_pool;
	css = &limit_pool->cs->css;

	/*
	 * This logic is roughly equivalent to css_foreach_descendant_pre,
	 * except we also track the parent pool to find out which pool we need
	 * to calculate protection values for.
	 *
	 * We can stop the traversal once we find test_pool among the
	 * descendants since we don't really care about any others.
	 */
	while (pool != test_pool) {
		next_css = css_next_child(NULL, css);
		if (next_css) {
			parent_pool = pool;
		} else {
			/* No child: climb until a css with a next sibling is found. */
			while (css != &limit_pool->cs->css) {
				next_css = css_next_child(css, css->parent);
				if (next_css)
					break;
				css = css->parent;
				parent_pool = pool_parent(parent_pool);
			}
			/*
			 * We can only hit this when test_pool is not a
			 * descendant of limit_pool.
			 */
			if (WARN_ON_ONCE(css == &limit_pool->cs->css))
				break;
		}
		css = next_css;

		found_descendant = false;
		dmemcg_iter = container_of(css, struct dmemcg_state, css);

		/* Find this css's pool whose parent is the tracked parent pool. */
		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool_parent(pool) == parent_pool) {
				found_descendant = true;
				break;
			}
		}
		/* This css has no pool for the region in question; skip it. */
		if (!found_descendant)
			continue;

		page_counter_calculate_protection(
			climit, &pool->cnt, true);
	}
	rcu_read_unlock();
}
280 
281 /**
282  * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
283  * @dev: &dmem_cgroup_region
284  * @index: The index number of the region being tested.
285  * @limit_pool: The pool for which we hit limits
286  * @test_pool: The pool for which to test
287  * @ignore_low: Whether we have to respect low watermarks.
288  * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
289  *
290  * This function returns true if we can evict from @test_pool, false if not.
291  * When returning false and @ignore_low is false, @ret_hit_low may
292  * be set to true to indicate this function can be retried with @ignore_low
293  * set to true.
294  *
295  * Return: bool
296  */
297 bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
298 				      struct dmem_cgroup_pool_state *test_pool,
299 				      bool ignore_low, bool *ret_hit_low)
300 {
301 	struct dmem_cgroup_pool_state *pool = test_pool;
302 	struct page_counter *climit, *ctest;
303 	u64 used, min, low;
304 
305 	/* Can always evict from current pool, despite limits */
306 	if (limit_pool == test_pool)
307 		return true;
308 
309 	if (limit_pool) {
310 		if (!parent_dmemcs(limit_pool->cs))
311 			return true;
312 
313 		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
314 			{}
315 
316 		if (!pool)
317 			return false;
318 	} else {
319 		/*
320 		 * If there is no cgroup limiting memory usage, use the root
321 		 * cgroup instead for limit calculations.
322 		 */
323 		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
324 			{}
325 	}
326 
327 	climit = &limit_pool->cnt;
328 	ctest = &test_pool->cnt;
329 
330 	dmem_cgroup_calculate_protection(limit_pool, test_pool);
331 
332 	used = page_counter_read(ctest);
333 	min = READ_ONCE(ctest->emin);
334 
335 	if (used <= min)
336 		return false;
337 
338 	if (!ignore_low) {
339 		low = READ_ONCE(ctest->elow);
340 		if (used > low)
341 			return true;
342 
343 		*ret_hit_low = true;
344 		return false;
345 	}
346 	return true;
347 }
348 EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
349 
/*
 * Allocate a pool for (@dmemcs, @region) and link it into both the css and
 * the region lists, consuming *@allocpool if a preallocation was passed in.
 * Called with dmemcg_lock held, hence the GFP_NOWAIT allocation.
 * A root pool starts inited; a child pool inherits @inited from its parent
 * pool (parent links of uninited pools are fixed up in get_cg_pool_locked()).
 */
static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		  struct dmem_cgroup_pool_state **allocpool)
{
	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
	struct dmem_cgroup_pool_state *pool, *ppool = NULL;

	if (!*allocpool) {
		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
		if (!pool)
			return ERR_PTR(-ENOMEM);
	} else {
		pool = *allocpool;
		*allocpool = NULL;
	}

	pool->region = region;
	pool->cs = dmemcs;

	if (parent)
		ppool = find_cg_pool_locked(parent, region);

	/* The parent pool may not exist yet; the link is patched up later. */
	page_counter_init(&pool->cnt,
			  ppool ? &ppool->cnt : NULL, true);
	reset_all_resource_limits(pool);

	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
	list_add_tail(&pool->region_node, &region->pools);

	if (!parent)
		pool->inited = true;
	else
		pool->inited = ppool ? ppool->inited : false;
	return pool;
}
385 
/*
 * Return the pool for (@dmemcs, @region), creating it and any missing
 * ancestor pools. Caller holds dmemcg_lock.
 *
 * The first loop allocates pools from @dmemcs up to the nearest inited
 * ancestor; the second walks up again, fixing each new pool's page_counter
 * parent link and marking it inited. *@allocpool, if set, is consumed by
 * the first allocation; a GFP_NOWAIT failure yields ERR_PTR(-ENOMEM) so
 * the caller can preallocate and retry (see get_cg_pool_unlocked()).
 */
static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/*
	 * Recursively create pool, we may not initialize yet on
	 * recursion, this is done as a separate step.
	 */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		/* ppool was created if it didn't exist by above loop. */
		ppool = find_cg_pool_locked(pp, region);

		/* Fix up parent links, mark as inited. */
		pool->cnt.parent = &ppool->cnt;
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}
429 
/*
 * RCU callback: after the grace period, free all pools still linked to the
 * dead region, then the region's name and the region itself.
 */
static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}
440 
/*
 * kref release: defer the actual freeing past an RCU grace period so that
 * concurrent RCU lookups of the region remain safe.
 */
static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);

	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
}
447 
/**
 * dmem_cgroup_unregister_region() - Unregister a previously registered region.
 * @region: The region to unregister, may be NULL.
 *
 * This function undoes dmem_cgroup_register_region(). The region is removed
 * from the global list and its pools are unlinked from their cgroups; the
 * memory is freed after an RCU grace period once the last reference drops.
 */
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
	struct list_head *entry;

	if (!region)
		return;

	spin_lock(&dmemcg_lock);

	/* Remove from global region list */
	list_del_rcu(&region->region_node);

	/* Unlink every pool from its cgroup; freeing happens in dmemcg_free_rcu(). */
	list_for_each_rcu(entry, &region->pools) {
		struct dmem_cgroup_pool_state *pool =
			container_of(entry, typeof(*pool), region_node);

		list_del_rcu(&pool->css_node);
	}

	/*
	 * Ensure any RCU based lookups fail. Additionally,
	 * no new pools should be added to the dead region
	 * by get_cg_pool_unlocked.
	 */
	region->unregistered = true;
	spin_unlock(&dmemcg_lock);

	kref_put(&region->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
484 
/**
 * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
 * @size: Size of region to register, in bytes.
 * @fmt: printf()-style format for the region name, followed by its arguments.
 *
 * This function registers a node in the dmem cgroup with the
 * name given. After calling this function, the region can be
 * used for allocations.
 *
 * Return: NULL if @size is zero, a valid region pointer on success,
 * or an ERR_PTR() on failure.
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *ret;
	char *region_name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!region_name)
		return ERR_PTR(-ENOMEM);

	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret) {
		kfree(region_name);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&ret->pools);
	ret->name = region_name;
	ret->size = size;
	kref_init(&ret->ref);

	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
529 
/*
 * Find a registered region by name and take a reference on it.
 * Caller must hold dmemcg_lock or the RCU read lock. Returns NULL when no
 * region matches or its refcount already dropped to zero.
 */
static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}
541 
/**
 * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
 * @pool: &dmem_cgroup_pool_state, may be NULL.
 *
 * Called to drop a reference to the limiting pool returned by
 * dmem_cgroup_try_charge().
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool)
		css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
555 
/*
 * Look up, or lazily create, the pool for (@cg, @region).
 *
 * The fastpath is a lockless RCU lookup; on a miss (or a pool whose parent
 * links are not initialized yet) the hierarchy is built under dmemcg_lock.
 * Because allocations under the lock use GFP_NOWAIT, an -ENOMEM result is
 * retried after preallocating a pool with GFP_KERNEL outside the lock.
 *
 * Return: a valid pool, or ERR_PTR(-ENODEV) when @region was unregistered
 * concurrently.
 */
static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* fastpath lookup? */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			if (WARN_ON(allocpool))
				continue;

			/*
			 * NOTE(review): if this GFP_KERNEL allocation also
			 * fails, the loop retries with GFP_NOWAIT only and
			 * can spin under sustained OOM — confirm intended.
			 */
			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	/* Free a preallocation that ended up unused. */
	kfree(allocpool);
	return pool;
}
592 
/**
 * dmem_cgroup_uncharge() - Uncharge a pool.
 * @pool: Pool to uncharge, may be NULL.
 * @size: Size to uncharge.
 *
 * Undoes the effects of dmem_cgroup_try_charge().
 * Must be called with the pool returned by that function as argument,
 * and the same @size.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	/* Drop the css reference transferred to the pool at charge time. */
	css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
611 
/**
 * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
 * @region: Region to charge.
 * @size: Size (in bytes) to charge.
 * @ret_pool: On successful allocation, the pool that is charged.
 * @ret_limit_pool: On a failed allocation, the limiting pool.
 *
 * This function charges the current task's cgroup for @region with a
 * size of @size bytes.
 *
 * If the function succeeds, @ret_pool is set, which must be passed to
 * dmem_cgroup_uncharge() when undoing the allocation.
 *
 * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
 * will be set to the pool for which the limit is hit. This can be used for
 * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
 * must be freed with dmem_cgroup_pool_state_put().
 *
 * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			  struct dmem_cgroup_pool_state **ret_pool,
			  struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		/* @fail is the ancestor counter whose limit was hit. */
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
		}
		ret = -EAGAIN;
		goto err;
	}

	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
675 
676 static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
677 {
678 	struct dmem_cgroup_region *region;
679 
680 	rcu_read_lock();
681 	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
682 		seq_puts(sf, region->name);
683 		seq_printf(sf, " %llu\n", region->size);
684 	}
685 	rcu_read_unlock();
686 	return 0;
687 }
688 
689 static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
690 			      u64 *new_limit)
691 {
692 	char *end;
693 
694 	if (!strcmp(options, "max")) {
695 		*new_limit = PAGE_COUNTER_MAX;
696 		return 0;
697 	}
698 
699 	*new_limit = memparse(options, &end);
700 	if (*end != '\0')
701 		return -EINVAL;
702 
703 	return 0;
704 }
705 
/*
 * Common writer for the min/low/max files. Input is one or more lines of
 * the form "<region-name> <value>", where <value> is "max" or a
 * memparse()-style size; @apply commits each parsed value to the matching
 * pool. Unknown region names fail with -EINVAL; any error stops processing
 * of the remaining lines.
 */
static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off,
				 void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		/* Split off the current line; buf advances to the next one. */
		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* eat empty lines */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		/* Takes a region reference, dropped at out_put below. */
		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* And commit */
		apply(pool, new_limit);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}


	return err ?: nbytes;
}
761 
/*
 * Common seq_show for the current/min/low/max files: one
 * "<region-name> <value>" line per registered region, with
 * PAGE_COUNTER_MAX rendered as "max". @fn extracts the value and accepts a
 * NULL pool (this cgroup has no pool for the region yet).
 */
static int dmemcg_limit_show(struct seq_file *sf, void *v,
			    u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}
785 
/* seq_show handler for the "current" file. */
static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}
790 
/* seq_show handler for the "min" file. */
static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}
795 
/* write handler for the "min" file. */
static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}
801 
/* seq_show handler for the "low" file. */
static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}
806 
/* write handler for the "low" file. */
static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}
812 
/* seq_show handler for the "max" file. */
static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}
817 
/* write handler for the "max" file. */
static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}
823 
/* Control files of the dmem controller; shared by cgroup v1 and v2. */
static struct cftype files[] = {
	{
		/* Root-only: size of every registered region. */
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ } /* Zero entry terminates. */
};
854 
/* dmem cgroup subsystem: lifecycle callbacks and control files. */
struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};
862