xref: /linux/mm/hugetlb_cgroup.c (revision 3c2d73de49be528276474c1a53f78b38ee11c1fa)
1 /*
2  *
3  * Copyright IBM Corporation, 2012
4  * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5  *
6  * Cgroup v2
7  * Copyright (C) 2019 Red Hat, Inc.
8  * Author: Giuseppe Scrivano <gscrivan@redhat.com>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of version 2.1 of the GNU Lesser General Public License
12  * as published by the Free Software Foundation.
13  *
14  * This program is distributed in the hope that it would be useful, but
15  * WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
17  *
18  */
19 
20 #include <linux/cgroup.h>
21 #include <linux/page_counter.h>
22 #include <linux/slab.h>
23 #include <linux/hugetlb.h>
24 #include <linux/hugetlb_cgroup.h>
25 
26 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
27 #define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
28 #define MEMFILE_ATTR(val)	((val) & 0xffff)
29 
30 /* Use t->m[0] to encode the offset */
31 #define MEMFILE_OFFSET(t, m0)	(((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
32 #define MEMFILE_OFFSET0(val)	(((val) >> 16) & 0xffff)
33 #define MEMFILE_FIELD_SIZE(val)	((val) & 0xffff)
34 
35 #define DFL_TMPL_SIZE		ARRAY_SIZE(hugetlb_dfl_tmpl)
36 #define LEGACY_TMPL_SIZE	ARRAY_SIZE(hugetlb_legacy_tmpl)
37 
38 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
39 static struct cftype *dfl_files;
40 static struct cftype *legacy_files;
41 
42 static inline struct page_counter *
43 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
44 				     bool rsvd)
45 {
46 	if (rsvd)
47 		return &h_cg->rsvd_hugepage[idx];
48 	return &h_cg->hugepage[idx];
49 }
50 
51 static inline struct page_counter *
52 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
53 {
54 	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
55 }
56 
57 static inline struct page_counter *
58 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
59 {
60 	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
61 }
62 
63 static inline
64 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
65 {
66 	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
67 }
68 
69 static inline
70 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
71 {
72 	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
73 }
74 
75 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
76 {
77 	return (h_cg == root_h_cgroup);
78 }
79 
80 static inline struct hugetlb_cgroup *
81 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
82 {
83 	return hugetlb_cgroup_from_css(h_cg->css.parent);
84 }
85 
86 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
87 {
88 	struct hstate *h;
89 
90 	for_each_hstate(h) {
91 		if (page_counter_read(
92 		    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
93 			return true;
94 	}
95 	return false;
96 }
97 
98 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
99 				struct hugetlb_cgroup *parent_h_cgroup)
100 {
101 	int idx;
102 
103 	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
104 		struct page_counter *fault_parent = NULL;
105 		struct page_counter *rsvd_parent = NULL;
106 		unsigned long limit;
107 		int ret;
108 
109 		if (parent_h_cgroup) {
110 			fault_parent = hugetlb_cgroup_counter_from_cgroup(
111 				parent_h_cgroup, idx);
112 			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
113 				parent_h_cgroup, idx);
114 		}
115 		page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
116 								     idx),
117 				  fault_parent, false);
118 		page_counter_init(
119 			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
120 			rsvd_parent, false);
121 
122 		limit = round_down(PAGE_COUNTER_MAX,
123 				   pages_per_huge_page(&hstates[idx]));
124 
125 		ret = page_counter_set_max(
126 			hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
127 			limit);
128 		VM_BUG_ON(ret);
129 		ret = page_counter_set_max(
130 			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
131 			limit);
132 		VM_BUG_ON(ret);
133 	}
134 }
135 
136 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
137 {
138 	int node;
139 
140 	for_each_node(node)
141 		kfree(h_cgroup->nodeinfo[node]);
142 	kfree(h_cgroup);
143 }
144 
145 static struct cgroup_subsys_state *
146 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
147 {
148 	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
149 	struct hugetlb_cgroup *h_cgroup;
150 	int node;
151 
152 	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
153 			   GFP_KERNEL);
154 
155 	if (!h_cgroup)
156 		return ERR_PTR(-ENOMEM);
157 
158 	if (!parent_h_cgroup)
159 		root_h_cgroup = h_cgroup;
160 
161 	/*
162 	 * TODO: this routine can waste much memory for nodes which will
163 	 * never be onlined. It's better to use memory hotplug callback
164 	 * function.
165 	 */
166 	for_each_node(node) {
167 		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
168 		int node_to_alloc =
169 			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
170 		h_cgroup->nodeinfo[node] =
171 			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
172 				     GFP_KERNEL, node_to_alloc);
173 		if (!h_cgroup->nodeinfo[node])
174 			goto fail_alloc_nodeinfo;
175 	}
176 
177 	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
178 	return &h_cgroup->css;
179 
180 fail_alloc_nodeinfo:
181 	hugetlb_cgroup_free(h_cgroup);
182 	return ERR_PTR(-ENOMEM);
183 }
184 
/* css_free callback: release the hugetlb cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup = hugetlb_cgroup_from_css(css);

	hugetlb_cgroup_free(h_cgroup);
}
189 
190 /*
191  * Should be called with hugetlb_lock held.
192  * Since we are holding hugetlb_lock, pages cannot get moved from
193  * active list or uncharged from the cgroup, So no need to get
194  * page reference and test for page active here. This function
195  * cannot fail.
196  */
197 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
198 				       struct page *page)
199 {
200 	unsigned int nr_pages;
201 	struct page_counter *counter;
202 	struct hugetlb_cgroup *page_hcg;
203 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
204 	struct folio *folio = page_folio(page);
205 
206 	page_hcg = hugetlb_cgroup_from_folio(folio);
207 	/*
208 	 * We can have pages in active list without any cgroup
209 	 * ie, hugepage with less than 3 pages. We can safely
210 	 * ignore those pages.
211 	 */
212 	if (!page_hcg || page_hcg != h_cg)
213 		goto out;
214 
215 	nr_pages = compound_nr(page);
216 	if (!parent) {
217 		parent = root_h_cgroup;
218 		/* root has no limit */
219 		page_counter_charge(&parent->hugepage[idx], nr_pages);
220 	}
221 	counter = &h_cg->hugepage[idx];
222 	/* Take the pages off the local counter */
223 	page_counter_cancel(counter, nr_pages);
224 
225 	set_hugetlb_cgroup(folio, parent);
226 out:
227 	return;
228 }
229 
230 /*
231  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
232  * the parent cgroup.
233  */
234 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
235 {
236 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
237 	struct hstate *h;
238 	struct page *page;
239 
240 	do {
241 		for_each_hstate(h) {
242 			spin_lock_irq(&hugetlb_lock);
243 			list_for_each_entry(page, &h->hugepage_activelist, lru)
244 				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
245 
246 			spin_unlock_irq(&hugetlb_lock);
247 		}
248 		cond_resched();
249 	} while (hugetlb_cgroup_have_usage(h_cg));
250 }
251 
252 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
253 				 enum hugetlb_memory_event event)
254 {
255 	atomic_long_inc(&hugetlb->events_local[idx][event]);
256 	cgroup_file_notify(&hugetlb->events_local_file[idx]);
257 
258 	do {
259 		atomic_long_inc(&hugetlb->events[idx][event]);
260 		cgroup_file_notify(&hugetlb->events_file[idx]);
261 	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
262 		 !hugetlb_cgroup_is_root(hugetlb));
263 }
264 
265 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
266 					  struct hugetlb_cgroup **ptr,
267 					  bool rsvd)
268 {
269 	int ret = 0;
270 	struct page_counter *counter;
271 	struct hugetlb_cgroup *h_cg = NULL;
272 
273 	if (hugetlb_cgroup_disabled())
274 		goto done;
275 again:
276 	rcu_read_lock();
277 	h_cg = hugetlb_cgroup_from_task(current);
278 	if (!css_tryget(&h_cg->css)) {
279 		rcu_read_unlock();
280 		goto again;
281 	}
282 	rcu_read_unlock();
283 
284 	if (!page_counter_try_charge(
285 		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
286 		    nr_pages, &counter)) {
287 		ret = -ENOMEM;
288 		hugetlb_event(h_cg, idx, HUGETLB_MAX);
289 		css_put(&h_cg->css);
290 		goto done;
291 	}
292 	/* Reservations take a reference to the css because they do not get
293 	 * reparented.
294 	 */
295 	if (!rsvd)
296 		css_put(&h_cg->css);
297 done:
298 	*ptr = h_cg;
299 	return ret;
300 }
301 
302 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
303 				 struct hugetlb_cgroup **ptr)
304 {
305 	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
306 }
307 
308 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
309 				      struct hugetlb_cgroup **ptr)
310 {
311 	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
312 }
313 
314 /* Should be called with hugetlb_lock held */
315 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
316 					   struct hugetlb_cgroup *h_cg,
317 					   struct folio *folio, bool rsvd)
318 {
319 	if (hugetlb_cgroup_disabled() || !h_cg)
320 		return;
321 	lockdep_assert_held(&hugetlb_lock);
322 	__set_hugetlb_cgroup(folio, h_cg, rsvd);
323 	if (!rsvd) {
324 		unsigned long usage =
325 			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
326 		/*
327 		 * This write is not atomic due to fetching usage and writing
328 		 * to it, but that's fine because we call this with
329 		 * hugetlb_lock held anyway.
330 		 */
331 		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
332 			   usage + nr_pages);
333 	}
334 }
335 
336 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
337 				  struct hugetlb_cgroup *h_cg,
338 				  struct folio *folio)
339 {
340 	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
341 }
342 
343 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
344 				       struct hugetlb_cgroup *h_cg,
345 				       struct folio *folio)
346 {
347 	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
348 }
349 
350 /*
351  * Should be called with hugetlb_lock held
352  */
353 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
354 					   struct folio *folio, bool rsvd)
355 {
356 	struct hugetlb_cgroup *h_cg;
357 
358 	if (hugetlb_cgroup_disabled())
359 		return;
360 	lockdep_assert_held(&hugetlb_lock);
361 	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
362 	if (unlikely(!h_cg))
363 		return;
364 	__set_hugetlb_cgroup(folio, NULL, rsvd);
365 
366 	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
367 								   rsvd),
368 			      nr_pages);
369 
370 	if (rsvd)
371 		css_put(&h_cg->css);
372 	else {
373 		unsigned long usage =
374 			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
375 		/*
376 		 * This write is not atomic due to fetching usage and writing
377 		 * to it, but that's fine because we call this with
378 		 * hugetlb_lock held anyway.
379 		 */
380 		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
381 			   usage - nr_pages);
382 	}
383 }
384 
385 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
386 				  struct folio *folio)
387 {
388 	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
389 }
390 
391 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
392 				       struct folio *folio)
393 {
394 	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
395 }
396 
397 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
398 					     struct hugetlb_cgroup *h_cg,
399 					     bool rsvd)
400 {
401 	if (hugetlb_cgroup_disabled() || !h_cg)
402 		return;
403 
404 	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
405 								   rsvd),
406 			      nr_pages);
407 
408 	if (rsvd)
409 		css_put(&h_cg->css);
410 }
411 
412 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
413 				    struct hugetlb_cgroup *h_cg)
414 {
415 	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
416 }
417 
418 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
419 					 struct hugetlb_cgroup *h_cg)
420 {
421 	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
422 }
423 
424 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
425 				     unsigned long end)
426 {
427 	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
428 	    !resv->css)
429 		return;
430 
431 	page_counter_uncharge(resv->reservation_counter,
432 			      (end - start) * resv->pages_per_hpage);
433 	css_put(resv->css);
434 }
435 
436 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
437 					 struct file_region *rg,
438 					 unsigned long nr_pages,
439 					 bool region_del)
440 {
441 	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
442 		return;
443 
444 	if (rg->reservation_counter && resv->pages_per_hpage &&
445 	    !resv->reservation_counter) {
446 		page_counter_uncharge(rg->reservation_counter,
447 				      nr_pages * resv->pages_per_hpage);
448 		/*
449 		 * Only do css_put(rg->css) when we delete the entire region
450 		 * because one file_region must hold exactly one css reference.
451 		 */
452 		if (region_del)
453 			css_put(rg->css);
454 	}
455 }
456 
/*
 * Attribute selectors stored in the low 16 bits of cftype->private (read
 * back with MEMFILE_ATTR()). Each attribute exists once for the regular
 * usage counter and once, as RES_RSVD_*, for the reservation counter.
 */
enum {
	RES_USAGE,		/* current counter value */
	RES_RSVD_USAGE,
	RES_LIMIT,		/* counter->max */
	RES_RSVD_LIMIT,
	RES_MAX_USAGE,		/* counter->watermark */
	RES_RSVD_MAX_USAGE,
	RES_FAILCNT,		/* counter->failcnt */
	RES_RSVD_FAILCNT,
};
467 
468 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
469 {
470 	int nid;
471 	struct cftype *cft = seq_cft(seq);
472 	int idx = MEMFILE_IDX(cft->private);
473 	bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
474 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
475 	struct cgroup_subsys_state *css;
476 	unsigned long usage;
477 
478 	if (legacy) {
479 		/* Add up usage across all nodes for the non-hierarchical total. */
480 		usage = 0;
481 		for_each_node_state(nid, N_MEMORY)
482 			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
483 		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
484 
485 		/* Simply print the per-node usage for the non-hierarchical total. */
486 		for_each_node_state(nid, N_MEMORY)
487 			seq_printf(seq, " N%d=%lu", nid,
488 				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
489 					   PAGE_SIZE);
490 		seq_putc(seq, '\n');
491 	}
492 
493 	/*
494 	 * The hierarchical total is pretty much the value recorded by the
495 	 * counter, so use that.
496 	 */
497 	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
498 		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
499 
500 	/*
501 	 * For each node, transverse the css tree to obtain the hierarchical
502 	 * node usage.
503 	 */
504 	for_each_node_state(nid, N_MEMORY) {
505 		usage = 0;
506 		rcu_read_lock();
507 		css_for_each_descendant_pre(css, &h_cg->css) {
508 			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
509 						   ->nodeinfo[nid]
510 						   ->usage[idx]);
511 		}
512 		rcu_read_unlock();
513 		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
514 	}
515 
516 	seq_putc(seq, '\n');
517 
518 	return 0;
519 }
520 
521 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
522 				   struct cftype *cft)
523 {
524 	struct page_counter *counter;
525 	struct page_counter *rsvd_counter;
526 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
527 
528 	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
529 	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
530 
531 	switch (MEMFILE_ATTR(cft->private)) {
532 	case RES_USAGE:
533 		return (u64)page_counter_read(counter) * PAGE_SIZE;
534 	case RES_RSVD_USAGE:
535 		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
536 	case RES_LIMIT:
537 		return (u64)counter->max * PAGE_SIZE;
538 	case RES_RSVD_LIMIT:
539 		return (u64)rsvd_counter->max * PAGE_SIZE;
540 	case RES_MAX_USAGE:
541 		return (u64)counter->watermark * PAGE_SIZE;
542 	case RES_RSVD_MAX_USAGE:
543 		return (u64)rsvd_counter->watermark * PAGE_SIZE;
544 	case RES_FAILCNT:
545 		return counter->failcnt;
546 	case RES_RSVD_FAILCNT:
547 		return rsvd_counter->failcnt;
548 	default:
549 		BUG();
550 	}
551 }
552 
553 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
554 {
555 	int idx;
556 	u64 val;
557 	struct cftype *cft = seq_cft(seq);
558 	unsigned long limit;
559 	struct page_counter *counter;
560 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
561 
562 	idx = MEMFILE_IDX(cft->private);
563 	counter = &h_cg->hugepage[idx];
564 
565 	limit = round_down(PAGE_COUNTER_MAX,
566 			   pages_per_huge_page(&hstates[idx]));
567 
568 	switch (MEMFILE_ATTR(cft->private)) {
569 	case RES_RSVD_USAGE:
570 		counter = &h_cg->rsvd_hugepage[idx];
571 		fallthrough;
572 	case RES_USAGE:
573 		val = (u64)page_counter_read(counter);
574 		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
575 		break;
576 	case RES_RSVD_LIMIT:
577 		counter = &h_cg->rsvd_hugepage[idx];
578 		fallthrough;
579 	case RES_LIMIT:
580 		val = (u64)counter->max;
581 		if (val == limit)
582 			seq_puts(seq, "max\n");
583 		else
584 			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
585 		break;
586 	default:
587 		BUG();
588 	}
589 
590 	return 0;
591 }
592 
593 static DEFINE_MUTEX(hugetlb_limit_mutex);
594 
595 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
596 				    char *buf, size_t nbytes, loff_t off,
597 				    const char *max)
598 {
599 	int ret, idx;
600 	unsigned long nr_pages;
601 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
602 	bool rsvd = false;
603 
604 	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
605 		return -EINVAL;
606 
607 	buf = strstrip(buf);
608 	ret = page_counter_memparse(buf, max, &nr_pages);
609 	if (ret)
610 		return ret;
611 
612 	idx = MEMFILE_IDX(of_cft(of)->private);
613 	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
614 
615 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
616 	case RES_RSVD_LIMIT:
617 		rsvd = true;
618 		fallthrough;
619 	case RES_LIMIT:
620 		mutex_lock(&hugetlb_limit_mutex);
621 		ret = page_counter_set_max(
622 			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
623 			nr_pages);
624 		mutex_unlock(&hugetlb_limit_mutex);
625 		break;
626 	default:
627 		ret = -EINVAL;
628 		break;
629 	}
630 	return ret ?: nbytes;
631 }
632 
633 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
634 					   char *buf, size_t nbytes, loff_t off)
635 {
636 	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
637 }
638 
639 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
640 					char *buf, size_t nbytes, loff_t off)
641 {
642 	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
643 }
644 
645 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
646 				    char *buf, size_t nbytes, loff_t off)
647 {
648 	int ret = 0;
649 	struct page_counter *counter, *rsvd_counter;
650 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
651 
652 	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
653 	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
654 
655 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
656 	case RES_MAX_USAGE:
657 		page_counter_reset_watermark(counter);
658 		break;
659 	case RES_RSVD_MAX_USAGE:
660 		page_counter_reset_watermark(rsvd_counter);
661 		break;
662 	case RES_FAILCNT:
663 		counter->failcnt = 0;
664 		break;
665 	case RES_RSVD_FAILCNT:
666 		rsvd_counter->failcnt = 0;
667 		break;
668 	default:
669 		ret = -EINVAL;
670 		break;
671 	}
672 	return ret ?: nbytes;
673 }
674 
675 static char *mem_fmt(char *buf, int size, unsigned long hsize)
676 {
677 	if (hsize >= SZ_1G)
678 		snprintf(buf, size, "%luGB", hsize / SZ_1G);
679 	else if (hsize >= SZ_1M)
680 		snprintf(buf, size, "%luMB", hsize / SZ_1M);
681 	else
682 		snprintf(buf, size, "%luKB", hsize / SZ_1K);
683 	return buf;
684 }
685 
686 static int __hugetlb_events_show(struct seq_file *seq, bool local)
687 {
688 	int idx;
689 	long max;
690 	struct cftype *cft = seq_cft(seq);
691 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
692 
693 	idx = MEMFILE_IDX(cft->private);
694 
695 	if (local)
696 		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
697 	else
698 		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
699 
700 	seq_printf(seq, "max %lu\n", max);
701 
702 	return 0;
703 }
704 
705 static int hugetlb_events_show(struct seq_file *seq, void *v)
706 {
707 	return __hugetlb_events_show(seq, false);
708 }
709 
710 static int hugetlb_events_local_show(struct seq_file *seq, void *v)
711 {
712 	return __hugetlb_events_show(seq, true);
713 }
714 
715 static struct cftype hugetlb_dfl_tmpl[] = {
716 	{
717 		.name = "max",
718 		.private = RES_LIMIT,
719 		.seq_show = hugetlb_cgroup_read_u64_max,
720 		.write = hugetlb_cgroup_write_dfl,
721 		.flags = CFTYPE_NOT_ON_ROOT,
722 	},
723 	{
724 		.name = "rsvd.max",
725 		.private = RES_RSVD_LIMIT,
726 		.seq_show = hugetlb_cgroup_read_u64_max,
727 		.write = hugetlb_cgroup_write_dfl,
728 		.flags = CFTYPE_NOT_ON_ROOT,
729 	},
730 	{
731 		.name = "current",
732 		.private = RES_USAGE,
733 		.seq_show = hugetlb_cgroup_read_u64_max,
734 		.flags = CFTYPE_NOT_ON_ROOT,
735 	},
736 	{
737 		.name = "rsvd.current",
738 		.private = RES_RSVD_USAGE,
739 		.seq_show = hugetlb_cgroup_read_u64_max,
740 		.flags = CFTYPE_NOT_ON_ROOT,
741 	},
742 	{
743 		.name = "events",
744 		.seq_show = hugetlb_events_show,
745 		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
746 		.flags = CFTYPE_NOT_ON_ROOT,
747 	},
748 	{
749 		.name = "events.local",
750 		.seq_show = hugetlb_events_local_show,
751 		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
752 		.flags = CFTYPE_NOT_ON_ROOT,
753 	},
754 	{
755 		.name = "numa_stat",
756 		.seq_show = hugetlb_cgroup_read_numa_stat,
757 		.flags = CFTYPE_NOT_ON_ROOT,
758 	},
759 	/* don't need terminator here */
760 };
761 
762 static struct cftype hugetlb_legacy_tmpl[] = {
763 	{
764 		.name = "limit_in_bytes",
765 		.private = RES_LIMIT,
766 		.read_u64 = hugetlb_cgroup_read_u64,
767 		.write = hugetlb_cgroup_write_legacy,
768 	},
769 	{
770 		.name = "rsvd.limit_in_bytes",
771 		.private = RES_RSVD_LIMIT,
772 		.read_u64 = hugetlb_cgroup_read_u64,
773 		.write = hugetlb_cgroup_write_legacy,
774 	},
775 	{
776 		.name = "usage_in_bytes",
777 		.private = RES_USAGE,
778 		.read_u64 = hugetlb_cgroup_read_u64,
779 	},
780 	{
781 		.name = "rsvd.usage_in_bytes",
782 		.private = RES_RSVD_USAGE,
783 		.read_u64 = hugetlb_cgroup_read_u64,
784 	},
785 	{
786 		.name = "max_usage_in_bytes",
787 		.private = RES_MAX_USAGE,
788 		.write = hugetlb_cgroup_reset,
789 		.read_u64 = hugetlb_cgroup_read_u64,
790 	},
791 	{
792 		.name = "rsvd.max_usage_in_bytes",
793 		.private = RES_RSVD_MAX_USAGE,
794 		.write = hugetlb_cgroup_reset,
795 		.read_u64 = hugetlb_cgroup_read_u64,
796 	},
797 	{
798 		.name = "failcnt",
799 		.private = RES_FAILCNT,
800 		.write = hugetlb_cgroup_reset,
801 		.read_u64 = hugetlb_cgroup_read_u64,
802 	},
803 	{
804 		.name = "rsvd.failcnt",
805 		.private = RES_RSVD_FAILCNT,
806 		.write = hugetlb_cgroup_reset,
807 		.read_u64 = hugetlb_cgroup_read_u64,
808 	},
809 	{
810 		.name = "numa_stat",
811 		.seq_show = hugetlb_cgroup_read_numa_stat,
812 	},
813 	/* don't need terminator here */
814 };
815 
816 static void __init
817 hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
818 			     struct cftype *tmpl, int tmpl_size)
819 {
820 	char buf[32];
821 	int i, idx = hstate_index(h);
822 
823 	/* format the size */
824 	mem_fmt(buf, sizeof(buf), huge_page_size(h));
825 
826 	for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
827 		*cft = *tmpl;
828 		/* rebuild the name */
829 		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
830 		/* rebuild the private */
831 		cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
832 		/* rebuild the file_offset */
833 		if (tmpl->file_offset) {
834 			unsigned int offset = tmpl->file_offset;
835 
836 			cft->file_offset = MEMFILE_OFFSET0(offset) +
837 					   MEMFILE_FIELD_SIZE(offset) * idx;
838 		}
839 
840 		lockdep_register_key(&cft->lockdep_key);
841 	}
842 }
843 
844 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
845 {
846 	int idx = hstate_index(h);
847 
848 	hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
849 				     hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
850 }
851 
852 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
853 {
854 	int idx = hstate_index(h);
855 
856 	hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
857 				     hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
858 }
859 
860 static void __init __hugetlb_cgroup_file_init(struct hstate *h)
861 {
862 	__hugetlb_cgroup_file_dfl_init(h);
863 	__hugetlb_cgroup_file_legacy_init(h);
864 }
865 
866 static void __init __hugetlb_cgroup_file_pre_init(void)
867 {
868 	int cft_count;
869 
870 	cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
871 	dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
872 	BUG_ON(!dfl_files);
873 	cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
874 	legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
875 	BUG_ON(!legacy_files);
876 }
877 
878 static void __init __hugetlb_cgroup_file_post_init(void)
879 {
880 	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
881 				       dfl_files));
882 	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
883 					  legacy_files));
884 }
885 
886 void __init hugetlb_cgroup_file_init(void)
887 {
888 	struct hstate *h;
889 
890 	__hugetlb_cgroup_file_pre_init();
891 	for_each_hstate(h)
892 		__hugetlb_cgroup_file_init(h);
893 	__hugetlb_cgroup_file_post_init();
894 }
895 
896 /*
897  * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
898  * when we migrate hugepages
899  */
900 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
901 {
902 	struct hugetlb_cgroup *h_cg;
903 	struct hugetlb_cgroup *h_cg_rsvd;
904 	struct hstate *h = folio_hstate(old_folio);
905 
906 	if (hugetlb_cgroup_disabled())
907 		return;
908 
909 	spin_lock_irq(&hugetlb_lock);
910 	h_cg = hugetlb_cgroup_from_folio(old_folio);
911 	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
912 	set_hugetlb_cgroup(old_folio, NULL);
913 	set_hugetlb_cgroup_rsvd(old_folio, NULL);
914 
915 	/* move the h_cg details to new cgroup */
916 	set_hugetlb_cgroup(new_folio, h_cg);
917 	set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
918 	list_move(&new_folio->lru, &h->hugepage_activelist);
919 	spin_unlock_irq(&hugetlb_lock);
920 	return;
921 }
922 
923 static struct cftype hugetlb_files[] = {
924 	{} /* terminate */
925 };
926 
927 struct cgroup_subsys hugetlb_cgrp_subsys = {
928 	.css_alloc	= hugetlb_cgroup_css_alloc,
929 	.css_offline	= hugetlb_cgroup_css_offline,
930 	.css_free	= hugetlb_cgroup_css_free,
931 	.dfl_cftypes	= hugetlb_files,
932 	.legacy_cftypes	= hugetlb_files,
933 };
934