xref: /linux/mm/hugetlb_cgroup.c (revision b0249c0d41b306ddd79de58ca7fea543ab5e7a2e)
1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3  *
4  * Copyright IBM Corporation, 2012
5  * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
6  *
7  * Cgroup v2
8  * Copyright (C) 2019 Red Hat, Inc.
9  * Author: Giuseppe Scrivano <gscrivan@redhat.com>
10  *
11  */
12 
13 #include <linux/cgroup.h>
14 #include <linux/page_counter.h>
15 #include <linux/slab.h>
16 #include <linux/hugetlb.h>
17 #include <linux/hugetlb_cgroup.h>
18 
19 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
20 #define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
21 #define MEMFILE_ATTR(val)	((val) & 0xffff)
22 
23 /* Use t->m[0] to encode the offset */
24 #define MEMFILE_OFFSET(t, m0)	(((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
25 #define MEMFILE_OFFSET0(val)	(((val) >> 16) & 0xffff)
26 #define MEMFILE_FIELD_SIZE(val)	((val) & 0xffff)
27 
28 #define DFL_TMPL_SIZE		ARRAY_SIZE(hugetlb_dfl_tmpl)
29 #define LEGACY_TMPL_SIZE	ARRAY_SIZE(hugetlb_legacy_tmpl)
30 
31 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
32 static struct cftype *dfl_files;
33 static struct cftype *legacy_files;
34 
35 static inline struct page_counter *
36 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
37 				     bool rsvd)
38 {
39 	if (rsvd)
40 		return &h_cg->rsvd_hugepage[idx];
41 	return &h_cg->hugepage[idx];
42 }
43 
/* Counter tracking pages actually faulted in under @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

/* Counter tracking pages reserved under @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}
55 
/* Map a css to its embedding hugetlb_cgroup; NULL css maps to NULL. */
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

/* hugetlb cgroup that @task currently belongs to (RCU-protected lookup). */
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}
67 
/* True iff @h_cg is the root hugetlb cgroup recorded at css_alloc time. */
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

/* Parent hugetlb cgroup of @h_cg, or NULL for the root. */
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}
78 
79 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
80 {
81 	struct hstate *h;
82 
83 	for_each_hstate(h) {
84 		if (page_counter_read(
85 		    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
86 			return true;
87 	}
88 	return false;
89 }
90 
/*
 * Initialize the fault and reservation page counters of @h_cgroup for every
 * possible hstate, chaining each counter to the matching counter in
 * @parent_h_cgroup (NULL for the root cgroup).
 */
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *fault, *fault_parent = NULL;
		struct page_counter *rsvd, *rsvd_parent = NULL;
		unsigned long limit;

		if (parent_h_cgroup) {
			fault_parent = hugetlb_cgroup_counter_from_cgroup(
				parent_h_cgroup, idx);
			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
				parent_h_cgroup, idx);
		}
		fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
		rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);

		page_counter_init(fault, fault_parent, false);
		page_counter_init(rsvd, rsvd_parent, false);

		/* Only the legacy (v1) interface exposes failcnt files. */
		if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
			fault->track_failcnt = true;
			rsvd->track_failcnt = true;
		}

		/* Default limit: the largest whole-huge-page multiple. */
		limit = round_down(PAGE_COUNTER_MAX,
				   pages_per_huge_page(&hstates[idx]));

		/* Cannot fail: usage is still zero, so any max is valid. */
		VM_BUG_ON(page_counter_set_max(fault, limit));
		VM_BUG_ON(page_counter_set_max(rsvd, limit));
	}
}
125 
/*
 * Free a hugetlb_cgroup together with its per-node statistics arrays.
 * Safe on partially constructed objects: kfree(NULL) is a no-op.
 */
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
	int node;

	for_each_node(node)
		kfree(h_cgroup->nodeinfo[node]);
	kfree(h_cgroup);
}
134 
/*
 * cgroup css_alloc callback: allocate and initialize a hugetlb_cgroup,
 * including one hugetlb_cgroup_per_node slot per possible node.
 * A NULL @parent_css means this is the root cgroup, which is cached in
 * root_h_cgroup for later use (e.g. reparenting on offline).
 */
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int node;

	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
			   GFP_KERNEL);

	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	/*
	 * TODO: this routine can waste much memory for nodes which will
	 * never be onlined. It's better to use memory hotplug callback
	 * function.
	 */
	for_each_node(node) {
		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
		int node_to_alloc =
			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
		h_cgroup->nodeinfo[node] =
			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
				     GFP_KERNEL, node_to_alloc);
		if (!h_cgroup->nodeinfo[node])
			goto fail_alloc_nodeinfo;
	}

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;

fail_alloc_nodeinfo:
	/* Frees the nodeinfo entries allocated so far, then the cgroup. */
	hugetlb_cgroup_free(h_cgroup);
	return ERR_PTR(-ENOMEM);
}
174 
/* cgroup css_free callback: release the hugetlb_cgroup embedding @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
179 
180 /*
181  * Should be called with hugetlb_lock held.
182  * Since we are holding hugetlb_lock, pages cannot get moved from
183  * active list or uncharged from the cgroup, So no need to get
184  * page reference and test for page active here. This function
185  * cannot fail.
186  */
187 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
188 				       struct folio *folio)
189 {
190 	unsigned int nr_pages;
191 	struct page_counter *counter;
192 	struct hugetlb_cgroup *hcg;
193 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
194 
195 	hcg = hugetlb_cgroup_from_folio(folio);
196 	/*
197 	 * We can have pages in active list without any cgroup
198 	 * ie, hugepage with less than 3 pages. We can safely
199 	 * ignore those pages.
200 	 */
201 	if (!hcg || hcg != h_cg)
202 		goto out;
203 
204 	nr_pages = folio_nr_pages(folio);
205 	if (!parent) {
206 		parent = root_h_cgroup;
207 		/* root has no limit */
208 		page_counter_charge(&parent->hugepage[idx], nr_pages);
209 	}
210 	counter = &h_cg->hugepage[idx];
211 	/* Take the pages off the local counter */
212 	page_counter_cancel(counter, nr_pages);
213 
214 	set_hugetlb_cgroup(folio, parent);
215 out:
216 	return;
217 }
218 
219 /*
220  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
221  * the parent cgroup.
222  */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct folio *folio;

	/*
	 * Repeat until no usage remains: a concurrent fault may charge this
	 * cgroup between the scan and the usage check, so a single pass is
	 * not guaranteed to drain it.
	 */
	do {
		for_each_hstate(h) {
			/* hugetlb_lock pins folios on the active list. */
			spin_lock_irq(&hugetlb_lock);
			list_for_each_entry(folio, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);

			spin_unlock_irq(&hugetlb_lock);
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}
240 
/*
 * Record a memory event for hstate @idx: bump the local counter of
 * @hugetlb only, then propagate the hierarchical counter (and notify
 * listeners) up every ancestor short of the root.
 */
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}
253 
/*
 * Charge @nr_pages of hstate @idx against the current task's hugetlb
 * cgroup.  On success *@ptr is set to the charged cgroup; on failure
 * (or when the controller is disabled) returns -ENOMEM resp. leaves
 * *@ptr NULL-ish per the h_cg initialisation below.
 */
static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
					  struct hugetlb_cgroup **ptr,
					  bool rsvd)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	/* Retry if the css is being torn down concurrently. */
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(
		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
		    nr_pages, &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
		css_put(&h_cg->css);
		goto done;
	}
	/* Reservations take a reference to the css because they do not get
	 * reparented.
	 */
	if (!rsvd)
		css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}
290 
/* Charge @nr_pages of faulted hugetlb pages to the current task's cgroup. */
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

/* Charge @nr_pages of hugetlb reservations to the current task's cgroup. */
int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
				      struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}
302 
/* Should be called with hugetlb_lock held */
/*
 * Attach a previously charged cgroup @h_cg to @folio and, for fault
 * (non-rsvd) charges, account the pages in the per-node usage stats.
 */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
					   struct hugetlb_cgroup *h_cg,
					   struct folio *folio, bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;
	lockdep_assert_held(&hugetlb_lock);
	__set_hugetlb_cgroup(folio, h_cg, rsvd);
	if (!rsvd) {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage + nr_pages);
	}
}
324 
/* Commit a fault charge: bind @h_cg to @folio. Caller holds hugetlb_lock. */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}

/* Commit a reservation charge to @folio. Caller holds hugetlb_lock. */
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
				       struct hugetlb_cgroup *h_cg,
				       struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
338 
339 /*
340  * Should be called with hugetlb_lock held
341  */
/*
 * Detach the (fault or reservation) cgroup from @folio and return the
 * charge to its page counter.  Fault uncharges also decrement the
 * per-node usage; reservation uncharges drop the css reference taken
 * at charge time instead.
 */
static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
					   struct folio *folio, bool rsvd)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
	if (unlikely(!h_cg))
		return;
	__set_hugetlb_cgroup(folio, NULL, rsvd);

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
	else {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage - nr_pages);
	}
}
373 
/* Uncharge a faulted folio. Caller holds hugetlb_lock. */
void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
				  struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}

/* Uncharge a folio's reservation. Caller holds hugetlb_lock. */
void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
				       struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}
385 
/*
 * Return @nr_pages of hstate @idx directly to @h_cg's counter, for
 * charges that were never committed to a folio.  Reservation uncharges
 * also drop the css reference taken at charge time.
 */
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
					     struct hugetlb_cgroup *h_cg,
					     bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
}
400 
/* Undo an uncommitted fault charge against @h_cg. */
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

/* Undo an uncommitted reservation charge against @h_cg. */
void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
					 struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}
412 
/*
 * Release the reservation charge held by @resv for the huge-page range
 * [@start, @end) and drop the css reference pinned by the resv_map.
 * No-op when the controller is disabled or @resv carries no charge.
 */
void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
				     unsigned long end)
{
	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
	    !resv->css)
		return;

	page_counter_uncharge(resv->reservation_counter,
			      (end - start) * resv->pages_per_hpage);
	css_put(resv->css);
}
424 
/*
 * Release the reservation charge recorded in file_region @rg for
 * @nr_pages huge pages.  Skipped when @resv itself owns the counter
 * (private mappings), since the whole charge is returned via
 * hugetlb_cgroup_uncharge_counter() instead.
 */
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
					 struct file_region *rg,
					 unsigned long nr_pages,
					 bool region_del)
{
	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
		return;

	if (rg->reservation_counter && resv->pages_per_hpage &&
	    !resv->reservation_counter) {
		page_counter_uncharge(rg->reservation_counter,
				      nr_pages * resv->pages_per_hpage);
		/*
		 * Only do css_put(rg->css) when we delete the entire region
		 * because one file_region must hold exactly one css reference.
		 */
		if (region_del)
			css_put(rg->css);
	}
}
445 
/* Values encoded in the MEMFILE_ATTR() part of cftype->private. */
enum {
	RES_USAGE,		/* current faulted-page usage */
	RES_RSVD_USAGE,		/* current reservation usage */
	RES_LIMIT,		/* faulted-page limit */
	RES_RSVD_LIMIT,		/* reservation limit */
	RES_MAX_USAGE,		/* faulted-page usage watermark */
	RES_RSVD_MAX_USAGE,	/* reservation usage watermark */
	RES_FAILCNT,		/* failed fault charges */
	RES_RSVD_FAILCNT,	/* failed reservation charges */
};
456 
/*
 * seq_show handler for the per-hstate "numa_stat" file.  Prints the
 * hierarchical per-node usage for both interfaces; the legacy (v1)
 * interface additionally prints a non-hierarchical section first.
 */
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
	int nid;
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cgroup_subsys_state *css;
	unsigned long usage;

	if (legacy) {
		/* Add up usage across all nodes for the non-hierarchical total. */
		usage = 0;
		for_each_node_state(nid, N_MEMORY)
			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

		/* Simply print the per-node usage for the non-hierarchical total. */
		for_each_node_state(nid, N_MEMORY)
			seq_printf(seq, " N%d=%lu", nid,
				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
					   PAGE_SIZE);
		seq_putc(seq, '\n');
	}

	/*
	 * The hierarchical total is pretty much the value recorded by the
	 * counter, so use that.
	 */
	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

	/*
	 * For each node, traverse the css tree to obtain the hierarchical
	 * node usage.
	 */
	for_each_node_state(nid, N_MEMORY) {
		usage = 0;
		rcu_read_lock();
		css_for_each_descendant_pre(css, &h_cg->css) {
			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
						   ->nodeinfo[nid]
						   ->usage[idx]);
		}
		rcu_read_unlock();
		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
	}

	seq_putc(seq, '\n');

	return 0;
}
509 
/*
 * read_u64 handler for the legacy (v1) per-hstate files.  The hstate
 * index and attribute are unpacked from cft->private; values are
 * reported in bytes except the raw failure counters.
 */
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct page_counter *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_RSVD_USAGE:
		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_RSVD_LIMIT:
		return (u64)rsvd_counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_RSVD_MAX_USAGE:
		return (u64)rsvd_counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_RSVD_FAILCNT:
		return rsvd_counter->failcnt;
	default:
		/* Any other attribute is a programming error. */
		BUG();
	}
}
541 
/*
 * seq_show handler for the cgroup v2 per-hstate files ("max",
 * "current" and their rsvd variants).  An effectively-unset limit is
 * printed as the literal string "max".
 */
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	/* The "unlimited" value as set by hugetlb_cgroup_init(). */
	limit = round_down(PAGE_COUNTER_MAX,
			   pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_RSVD_USAGE:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_RSVD_LIMIT:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		/* Only usage/limit attributes reach this handler. */
		BUG();
	}

	return 0;
}
581 
/* Serializes concurrent limit updates in hugetlb_cgroup_write(). */
static DEFINE_MUTEX(hugetlb_limit_mutex);
583 
/*
 * Common write handler for limit files.  @max is the token that means
 * "no limit" in the caller's interface ("max" on v2, "-1" on v1).
 * Returns @nbytes on success or a negative errno.
 */
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
	bool rsvd = false;

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	/* Limits are enforced in whole huge pages. */
	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_RSVD_LIMIT:
		rsvd = true;
		fallthrough;
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(
			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
			nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}
621 
/* v1 limit writer: "-1" means unlimited. */
static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

/* v2 limit writer: "max" means unlimited. */
static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}
633 
/*
 * Write handler for the legacy reset files: clears the usage watermark
 * or the failure count selected by cft->private.  The written value is
 * ignored; any write triggers the reset.
 */
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter, *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_RSVD_MAX_USAGE:
		page_counter_reset_watermark(rsvd_counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	case RES_RSVD_FAILCNT:
		rsvd_counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}
663 
664 static char *mem_fmt(char *buf, int size, unsigned long hsize)
665 {
666 	if (hsize >= SZ_1G)
667 		snprintf(buf, size, "%luGB", hsize / SZ_1G);
668 	else if (hsize >= SZ_1M)
669 		snprintf(buf, size, "%luMB", hsize / SZ_1M);
670 	else
671 		snprintf(buf, size, "%luKB", hsize / SZ_1K);
672 	return buf;
673 }
674 
/*
 * seq_show helper for "events" / "events.local": print the HUGETLB_MAX
 * event count, hierarchical by default or this-cgroup-only when @local.
 */
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	int idx;
	long max;
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

/* "events" file: hierarchical event counts. */
static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}

/* "events.local" file: events that happened in this cgroup itself. */
static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}
703 
/*
 * cgroup v2 file templates; one copy per hstate is instantiated by
 * hugetlb_cgroup_cfttypes_init(), which prefixes each name with the
 * huge page size and packs the hstate index into ->private.
 */
static struct cftype hugetlb_dfl_tmpl[] = {
	{
		.name = "max",
		.private = RES_LIMIT,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.write = hugetlb_cgroup_write_dfl,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "rsvd.max",
		.private = RES_RSVD_LIMIT,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.write = hugetlb_cgroup_write_dfl,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.private = RES_USAGE,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "rsvd.current",
		.private = RES_RSVD_USAGE,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events",
		.seq_show = hugetlb_events_show,
		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events.local",
		.seq_show = hugetlb_events_local_show,
		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "numa_stat",
		.seq_show = hugetlb_cgroup_read_numa_stat,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	/* don't need terminator here */
};
750 
/*
 * cgroup v1 file templates, instantiated per hstate just like
 * hugetlb_dfl_tmpl.  All byte-valued files use the read_u64 interface.
 */
static struct cftype hugetlb_legacy_tmpl[] = {
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.read_u64 = hugetlb_cgroup_read_u64,
		.write = hugetlb_cgroup_write_legacy,
	},
	{
		.name = "rsvd.limit_in_bytes",
		.private = RES_RSVD_LIMIT,
		.read_u64 = hugetlb_cgroup_read_u64,
		.write = hugetlb_cgroup_write_legacy,
	},
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.usage_in_bytes",
		.private = RES_RSVD_USAGE,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.max_usage_in_bytes",
		.private = RES_RSVD_MAX_USAGE,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.failcnt",
		.private = RES_RSVD_FAILCNT,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "numa_stat",
		.seq_show = hugetlb_cgroup_read_numa_stat,
	},
	/* don't need terminator here */
};
804 
/*
 * Instantiate @tmpl_size template entries from @tmpl into @cft for
 * hstate @h: prefix each file name with the formatted page size, pack
 * the hstate index into ->private, and relocate any file_offset to the
 * per-hstate slot of the encoded array field.
 */
static void __init
hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
			     struct cftype *tmpl, int tmpl_size)
{
	char buf[32];
	int i, idx = hstate_index(h);

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
		*cft = *tmpl;
		/* rebuild the name */
		scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
		/* rebuild the private */
		cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
		/* rebuild the file_offset */
		if (tmpl->file_offset) {
			unsigned int offset = tmpl->file_offset;

			/* base offset of member 0 + idx * element size */
			cft->file_offset = MEMFILE_OFFSET0(offset) +
					   MEMFILE_FIELD_SIZE(offset) * idx;
		}

		lockdep_register_key(&cft->lockdep_key);
	}
}
832 
/* Fill this hstate's slice of the v2 cftype array from the template. */
static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
{
	int idx = hstate_index(h);

	hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
				     hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
}

/* Fill this hstate's slice of the v1 cftype array from the template. */
static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
{
	int idx = hstate_index(h);

	hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
				     hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
}

/* Instantiate both interface variants for one hstate. */
static void __init __hugetlb_cgroup_file_init(struct hstate *h)
{
	__hugetlb_cgroup_file_dfl_init(h);
	__hugetlb_cgroup_file_legacy_init(h);
}
854 
/*
 * Allocate the zero-terminated cftype arrays sized for every registered
 * hstate.  Boot-time only; allocation failure here is fatal.
 */
static void __init __hugetlb_cgroup_file_pre_init(void)
{
	int cft_count;

	cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
	dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
	BUG_ON(!dfl_files);
	cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
	legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
	BUG_ON(!legacy_files);
}
866 
/* Register the fully populated cftype arrays with the cgroup core. */
static void __init __hugetlb_cgroup_file_post_init(void)
{
	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
				       dfl_files));
	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  legacy_files));
}
874 
/*
 * Boot-time entry point: allocate the cftype arrays, populate them for
 * every hstate, then register them with the cgroup core.
 */
void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	__hugetlb_cgroup_file_pre_init();
	for_each_hstate(h)
		__hugetlb_cgroup_file_init(h);
	__hugetlb_cgroup_file_post_init();
}
884 
885 /*
886  * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
887  * when we migrate hugepages
888  */
/*
 * Transfer the fault and reservation cgroup ownership from @old_folio
 * to @new_folio during hugepage migration, and put the new folio on the
 * hstate's active list.  All done under hugetlb_lock (see comment above).
 */
void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
	struct hugetlb_cgroup *h_cg;
	struct hugetlb_cgroup *h_cg_rsvd;
	struct hstate *h = folio_hstate(old_folio);

	if (hugetlb_cgroup_disabled())
		return;

	spin_lock_irq(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_folio(old_folio);
	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
	set_hugetlb_cgroup(old_folio, NULL);
	set_hugetlb_cgroup_rsvd(old_folio, NULL);

	/* move the h_cg details to new cgroup */
	set_hugetlb_cgroup(new_folio, h_cg);
	set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
	list_move(&new_folio->lru, &h->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
}
910 
/*
 * Statically registered cftypes are empty; the real per-hstate files
 * are added at boot by hugetlb_cgroup_file_init().
 */
static struct cftype hugetlb_files[] = {
	{} /* terminate */
};

/* hugetlb cgroup controller descriptor. */
struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
	.dfl_cftypes	= hugetlb_files,
	.legacy_cftypes	= hugetlb_files,
};
922