xref: /linux/mm/hugetlb_cgroup.c (revision 323bbfcf1ef8836d0d2ad9e2c1f1c684f0e3b5b3)
// SPDX-License-Identifier: LGPL-2.1
/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 */
12 
#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
18 
19 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
20 #define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
21 #define MEMFILE_ATTR(val)	((val) & 0xffff)
22 
23 /* Use t->m[0] to encode the offset */
24 #define MEMFILE_OFFSET(t, m0)	(((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
25 #define MEMFILE_OFFSET0(val)	(((val) >> 16) & 0xffff)
26 #define MEMFILE_FIELD_SIZE(val)	((val) & 0xffff)
27 
28 #define DFL_TMPL_SIZE		ARRAY_SIZE(hugetlb_dfl_tmpl)
29 #define LEGACY_TMPL_SIZE	ARRAY_SIZE(hugetlb_legacy_tmpl)
30 
31 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
32 static struct cftype *dfl_files;
33 static struct cftype *legacy_files;
34 
35 static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup * h_cg,int idx,bool rsvd)36 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
37 				     bool rsvd)
38 {
39 	if (rsvd)
40 		return &h_cg->rsvd_hugepage[idx];
41 	return &h_cg->hugepage[idx];
42 }
43 
44 static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup * h_cg,int idx)45 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
46 {
47 	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
48 }
49 
50 static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup * h_cg,int idx)51 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
52 {
53 	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
54 }
55 
56 static inline
hugetlb_cgroup_from_css(struct cgroup_subsys_state * s)57 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
58 {
59 	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
60 }
61 
62 static inline
hugetlb_cgroup_from_task(struct task_struct * task)63 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
64 {
65 	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
66 }
67 
hugetlb_cgroup_is_root(struct hugetlb_cgroup * h_cg)68 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
69 {
70 	return (h_cg == root_h_cgroup);
71 }
72 
73 static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup * h_cg)74 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
75 {
76 	return hugetlb_cgroup_from_css(h_cg->css.parent);
77 }
78 
hugetlb_cgroup_have_usage(struct hugetlb_cgroup * h_cg)79 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
80 {
81 	struct hstate *h;
82 
83 	for_each_hstate(h) {
84 		if (page_counter_read(
85 		    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
86 			return true;
87 	}
88 	return false;
89 }
90 
hugetlb_cgroup_init(struct hugetlb_cgroup * h_cgroup,struct hugetlb_cgroup * parent_h_cgroup)91 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
92 				struct hugetlb_cgroup *parent_h_cgroup)
93 {
94 	int idx;
95 
96 	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
97 		struct page_counter *fault, *fault_parent = NULL;
98 		struct page_counter *rsvd, *rsvd_parent = NULL;
99 		unsigned long limit;
100 
101 		if (parent_h_cgroup) {
102 			fault_parent = hugetlb_cgroup_counter_from_cgroup(
103 				parent_h_cgroup, idx);
104 			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
105 				parent_h_cgroup, idx);
106 		}
107 		fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
108 		rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
109 
110 		page_counter_init(fault, fault_parent, false);
111 		page_counter_init(rsvd, rsvd_parent, false);
112 
113 		if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
114 			fault->track_failcnt = true;
115 			rsvd->track_failcnt = true;
116 		}
117 
118 		limit = round_down(PAGE_COUNTER_MAX,
119 				   pages_per_huge_page(&hstates[idx]));
120 
121 		VM_BUG_ON(page_counter_set_max(fault, limit));
122 		VM_BUG_ON(page_counter_set_max(rsvd, limit));
123 	}
124 }
125 
hugetlb_cgroup_free(struct hugetlb_cgroup * h_cgroup)126 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
127 {
128 	int node;
129 
130 	for_each_node(node)
131 		kfree(h_cgroup->nodeinfo[node]);
132 	kfree(h_cgroup);
133 }
134 
135 static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state * parent_css)136 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
137 {
138 	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
139 	struct hugetlb_cgroup *h_cgroup;
140 	int node;
141 
142 	h_cgroup = kzalloc_flex(*h_cgroup, nodeinfo, nr_node_ids);
143 
144 	if (!h_cgroup)
145 		return ERR_PTR(-ENOMEM);
146 
147 	if (!parent_h_cgroup)
148 		root_h_cgroup = h_cgroup;
149 
150 	/*
151 	 * TODO: this routine can waste much memory for nodes which will
152 	 * never be onlined. It's better to use memory hotplug callback
153 	 * function.
154 	 */
155 	for_each_node(node) {
156 		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
157 		int node_to_alloc =
158 			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
159 		h_cgroup->nodeinfo[node] =
160 			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
161 				     GFP_KERNEL, node_to_alloc);
162 		if (!h_cgroup->nodeinfo[node])
163 			goto fail_alloc_nodeinfo;
164 	}
165 
166 	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
167 	return &h_cgroup->css;
168 
169 fail_alloc_nodeinfo:
170 	hugetlb_cgroup_free(h_cgroup);
171 	return ERR_PTR(-ENOMEM);
172 }
173 
hugetlb_cgroup_css_free(struct cgroup_subsys_state * css)174 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
175 {
176 	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
177 }
178 
179 /*
180  * Should be called with hugetlb_lock held.
181  * Since we are holding hugetlb_lock, pages cannot get moved from
182  * active list or uncharged from the cgroup, So no need to get
183  * page reference and test for page active here. This function
184  * cannot fail.
185  */
hugetlb_cgroup_move_parent(int idx,struct hugetlb_cgroup * h_cg,struct folio * folio)186 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
187 				       struct folio *folio)
188 {
189 	unsigned int nr_pages;
190 	struct page_counter *counter;
191 	struct hugetlb_cgroup *hcg;
192 	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
193 
194 	hcg = hugetlb_cgroup_from_folio(folio);
195 	/*
196 	 * We can have pages in active list without any cgroup
197 	 * ie, hugepage with less than 3 pages. We can safely
198 	 * ignore those pages.
199 	 */
200 	if (!hcg || hcg != h_cg)
201 		goto out;
202 
203 	nr_pages = folio_nr_pages(folio);
204 	if (!parent) {
205 		parent = root_h_cgroup;
206 		/* root has no limit */
207 		page_counter_charge(&parent->hugepage[idx], nr_pages);
208 	}
209 	counter = &h_cg->hugepage[idx];
210 	/* Take the pages off the local counter */
211 	page_counter_cancel(counter, nr_pages);
212 
213 	set_hugetlb_cgroup(folio, parent);
214 out:
215 	return;
216 }
217 
218 /*
219  * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
220  * the parent cgroup.
221  */
hugetlb_cgroup_css_offline(struct cgroup_subsys_state * css)222 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
223 {
224 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
225 	struct hstate *h;
226 	struct folio *folio;
227 
228 	do {
229 		for_each_hstate(h) {
230 			spin_lock_irq(&hugetlb_lock);
231 			list_for_each_entry(folio, &h->hugepage_activelist, lru)
232 				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
233 
234 			spin_unlock_irq(&hugetlb_lock);
235 		}
236 		cond_resched();
237 	} while (hugetlb_cgroup_have_usage(h_cg));
238 }
239 
hugetlb_event(struct hugetlb_cgroup * hugetlb,int idx,enum hugetlb_memory_event event)240 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
241 				 enum hugetlb_memory_event event)
242 {
243 	atomic_long_inc(&hugetlb->events_local[idx][event]);
244 	cgroup_file_notify(&hugetlb->events_local_file[idx]);
245 
246 	do {
247 		atomic_long_inc(&hugetlb->events[idx][event]);
248 		cgroup_file_notify(&hugetlb->events_file[idx]);
249 	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
250 		 !hugetlb_cgroup_is_root(hugetlb));
251 }
252 
__hugetlb_cgroup_charge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr,bool rsvd)253 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
254 					  struct hugetlb_cgroup **ptr,
255 					  bool rsvd)
256 {
257 	int ret = 0;
258 	struct page_counter *counter;
259 	struct hugetlb_cgroup *h_cg = NULL;
260 
261 	if (hugetlb_cgroup_disabled())
262 		goto done;
263 again:
264 	rcu_read_lock();
265 	h_cg = hugetlb_cgroup_from_task(current);
266 	if (!css_tryget(&h_cg->css)) {
267 		rcu_read_unlock();
268 		goto again;
269 	}
270 	rcu_read_unlock();
271 
272 	if (!page_counter_try_charge(
273 		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
274 		    nr_pages, &counter)) {
275 		ret = -ENOMEM;
276 		hugetlb_event(h_cg, idx, HUGETLB_MAX);
277 		css_put(&h_cg->css);
278 		goto done;
279 	}
280 	/* Reservations take a reference to the css because they do not get
281 	 * reparented.
282 	 */
283 	if (!rsvd)
284 		css_put(&h_cg->css);
285 done:
286 	*ptr = h_cg;
287 	return ret;
288 }
289 
hugetlb_cgroup_charge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr)290 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
291 				 struct hugetlb_cgroup **ptr)
292 {
293 	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
294 }
295 
hugetlb_cgroup_charge_cgroup_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr)296 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
297 				      struct hugetlb_cgroup **ptr)
298 {
299 	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
300 }
301 
302 /* Should be called with hugetlb_lock held */
__hugetlb_cgroup_commit_charge(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,struct folio * folio,bool rsvd)303 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
304 					   struct hugetlb_cgroup *h_cg,
305 					   struct folio *folio, bool rsvd)
306 {
307 	if (hugetlb_cgroup_disabled() || !h_cg)
308 		return;
309 	lockdep_assert_held(&hugetlb_lock);
310 	__set_hugetlb_cgroup(folio, h_cg, rsvd);
311 	if (!rsvd) {
312 		unsigned long usage =
313 			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
314 		/*
315 		 * This write is not atomic due to fetching usage and writing
316 		 * to it, but that's fine because we call this with
317 		 * hugetlb_lock held anyway.
318 		 */
319 		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
320 			   usage + nr_pages);
321 	}
322 }
323 
hugetlb_cgroup_commit_charge(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,struct folio * folio)324 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
325 				  struct hugetlb_cgroup *h_cg,
326 				  struct folio *folio)
327 {
328 	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
329 }
330 
hugetlb_cgroup_commit_charge_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,struct folio * folio)331 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
332 				       struct hugetlb_cgroup *h_cg,
333 				       struct folio *folio)
334 {
335 	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
336 }
337 
338 /*
339  * Should be called with hugetlb_lock held
340  */
__hugetlb_cgroup_uncharge_folio(int idx,unsigned long nr_pages,struct folio * folio,bool rsvd)341 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
342 					   struct folio *folio, bool rsvd)
343 {
344 	struct hugetlb_cgroup *h_cg;
345 
346 	if (hugetlb_cgroup_disabled())
347 		return;
348 	lockdep_assert_held(&hugetlb_lock);
349 	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
350 	if (unlikely(!h_cg))
351 		return;
352 	__set_hugetlb_cgroup(folio, NULL, rsvd);
353 
354 	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
355 								   rsvd),
356 			      nr_pages);
357 
358 	if (rsvd)
359 		css_put(&h_cg->css);
360 	else {
361 		unsigned long usage =
362 			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
363 		/*
364 		 * This write is not atomic due to fetching usage and writing
365 		 * to it, but that's fine because we call this with
366 		 * hugetlb_lock held anyway.
367 		 */
368 		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
369 			   usage - nr_pages);
370 	}
371 }
372 
hugetlb_cgroup_uncharge_folio(int idx,unsigned long nr_pages,struct folio * folio)373 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
374 				  struct folio *folio)
375 {
376 	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
377 }
378 
hugetlb_cgroup_uncharge_folio_rsvd(int idx,unsigned long nr_pages,struct folio * folio)379 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
380 				       struct folio *folio)
381 {
382 	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
383 }
384 
__hugetlb_cgroup_uncharge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,bool rsvd)385 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
386 					     struct hugetlb_cgroup *h_cg,
387 					     bool rsvd)
388 {
389 	if (hugetlb_cgroup_disabled() || !h_cg)
390 		return;
391 
392 	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
393 								   rsvd),
394 			      nr_pages);
395 
396 	if (rsvd)
397 		css_put(&h_cg->css);
398 }
399 
hugetlb_cgroup_uncharge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg)400 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
401 				    struct hugetlb_cgroup *h_cg)
402 {
403 	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
404 }
405 
hugetlb_cgroup_uncharge_cgroup_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg)406 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
407 					 struct hugetlb_cgroup *h_cg)
408 {
409 	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
410 }
411 
hugetlb_cgroup_uncharge_counter(struct resv_map * resv,unsigned long start,unsigned long end)412 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
413 				     unsigned long end)
414 {
415 	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
416 	    !resv->css)
417 		return;
418 
419 	page_counter_uncharge(resv->reservation_counter,
420 			      (end - start) * resv->pages_per_hpage);
421 	css_put(resv->css);
422 }
423 
hugetlb_cgroup_uncharge_file_region(struct resv_map * resv,struct file_region * rg,unsigned long nr_pages,bool region_del)424 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
425 					 struct file_region *rg,
426 					 unsigned long nr_pages,
427 					 bool region_del)
428 {
429 	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
430 		return;
431 
432 	if (rg->reservation_counter && resv->pages_per_hpage &&
433 	    !resv->reservation_counter) {
434 		page_counter_uncharge(rg->reservation_counter,
435 				      nr_pages * resv->pages_per_hpage);
436 		/*
437 		 * Only do css_put(rg->css) when we delete the entire region
438 		 * because one file_region must hold exactly one css reference.
439 		 */
440 		if (region_del)
441 			css_put(rg->css);
442 	}
443 }
444 
445 enum {
446 	RES_USAGE,
447 	RES_RSVD_USAGE,
448 	RES_LIMIT,
449 	RES_RSVD_LIMIT,
450 	RES_MAX_USAGE,
451 	RES_RSVD_MAX_USAGE,
452 	RES_FAILCNT,
453 	RES_RSVD_FAILCNT,
454 };
455 
hugetlb_cgroup_read_numa_stat(struct seq_file * seq,void * dummy)456 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
457 {
458 	int nid;
459 	struct cftype *cft = seq_cft(seq);
460 	int idx = MEMFILE_IDX(cft->private);
461 	bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
462 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
463 	struct cgroup_subsys_state *css;
464 	unsigned long usage;
465 
466 	if (legacy) {
467 		/* Add up usage across all nodes for the non-hierarchical total. */
468 		usage = 0;
469 		for_each_node_state(nid, N_MEMORY)
470 			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
471 		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
472 
473 		/* Simply print the per-node usage for the non-hierarchical total. */
474 		for_each_node_state(nid, N_MEMORY)
475 			seq_printf(seq, " N%d=%lu", nid,
476 				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
477 					   PAGE_SIZE);
478 		seq_putc(seq, '\n');
479 	}
480 
481 	/*
482 	 * The hierarchical total is pretty much the value recorded by the
483 	 * counter, so use that.
484 	 */
485 	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
486 		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
487 
488 	/*
489 	 * For each node, transverse the css tree to obtain the hierarchical
490 	 * node usage.
491 	 */
492 	for_each_node_state(nid, N_MEMORY) {
493 		usage = 0;
494 		rcu_read_lock();
495 		css_for_each_descendant_pre(css, &h_cg->css) {
496 			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
497 						   ->nodeinfo[nid]
498 						   ->usage[idx]);
499 		}
500 		rcu_read_unlock();
501 		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
502 	}
503 
504 	seq_putc(seq, '\n');
505 
506 	return 0;
507 }
508 
hugetlb_cgroup_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)509 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
510 				   struct cftype *cft)
511 {
512 	struct page_counter *counter;
513 	struct page_counter *rsvd_counter;
514 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
515 
516 	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
517 	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
518 
519 	switch (MEMFILE_ATTR(cft->private)) {
520 	case RES_USAGE:
521 		return (u64)page_counter_read(counter) * PAGE_SIZE;
522 	case RES_RSVD_USAGE:
523 		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
524 	case RES_LIMIT:
525 		return (u64)counter->max * PAGE_SIZE;
526 	case RES_RSVD_LIMIT:
527 		return (u64)rsvd_counter->max * PAGE_SIZE;
528 	case RES_MAX_USAGE:
529 		return (u64)counter->watermark * PAGE_SIZE;
530 	case RES_RSVD_MAX_USAGE:
531 		return (u64)rsvd_counter->watermark * PAGE_SIZE;
532 	case RES_FAILCNT:
533 		return counter->failcnt;
534 	case RES_RSVD_FAILCNT:
535 		return rsvd_counter->failcnt;
536 	default:
537 		BUG();
538 	}
539 }
540 
hugetlb_cgroup_read_u64_max(struct seq_file * seq,void * v)541 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
542 {
543 	int idx;
544 	u64 val;
545 	struct cftype *cft = seq_cft(seq);
546 	unsigned long limit;
547 	struct page_counter *counter;
548 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
549 
550 	idx = MEMFILE_IDX(cft->private);
551 	counter = &h_cg->hugepage[idx];
552 
553 	limit = round_down(PAGE_COUNTER_MAX,
554 			   pages_per_huge_page(&hstates[idx]));
555 
556 	switch (MEMFILE_ATTR(cft->private)) {
557 	case RES_RSVD_USAGE:
558 		counter = &h_cg->rsvd_hugepage[idx];
559 		fallthrough;
560 	case RES_USAGE:
561 		val = (u64)page_counter_read(counter);
562 		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
563 		break;
564 	case RES_RSVD_LIMIT:
565 		counter = &h_cg->rsvd_hugepage[idx];
566 		fallthrough;
567 	case RES_LIMIT:
568 		val = (u64)counter->max;
569 		if (val == limit)
570 			seq_puts(seq, "max\n");
571 		else
572 			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
573 		break;
574 	default:
575 		BUG();
576 	}
577 
578 	return 0;
579 }
580 
581 static DEFINE_MUTEX(hugetlb_limit_mutex);
582 
hugetlb_cgroup_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off,const char * max)583 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
584 				    char *buf, size_t nbytes, loff_t off,
585 				    const char *max)
586 {
587 	int ret, idx;
588 	unsigned long nr_pages;
589 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
590 	bool rsvd = false;
591 
592 	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
593 		return -EINVAL;
594 
595 	buf = strstrip(buf);
596 	ret = page_counter_memparse(buf, max, &nr_pages);
597 	if (ret)
598 		return ret;
599 
600 	idx = MEMFILE_IDX(of_cft(of)->private);
601 	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
602 
603 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
604 	case RES_RSVD_LIMIT:
605 		rsvd = true;
606 		fallthrough;
607 	case RES_LIMIT:
608 		mutex_lock(&hugetlb_limit_mutex);
609 		ret = page_counter_set_max(
610 			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
611 			nr_pages);
612 		mutex_unlock(&hugetlb_limit_mutex);
613 		break;
614 	default:
615 		ret = -EINVAL;
616 		break;
617 	}
618 	return ret ?: nbytes;
619 }
620 
hugetlb_cgroup_write_legacy(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)621 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
622 					   char *buf, size_t nbytes, loff_t off)
623 {
624 	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
625 }
626 
hugetlb_cgroup_write_dfl(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)627 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
628 					char *buf, size_t nbytes, loff_t off)
629 {
630 	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
631 }
632 
hugetlb_cgroup_reset(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)633 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
634 				    char *buf, size_t nbytes, loff_t off)
635 {
636 	int ret = 0;
637 	struct page_counter *counter, *rsvd_counter;
638 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
639 
640 	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
641 	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
642 
643 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
644 	case RES_MAX_USAGE:
645 		page_counter_reset_watermark(counter);
646 		break;
647 	case RES_RSVD_MAX_USAGE:
648 		page_counter_reset_watermark(rsvd_counter);
649 		break;
650 	case RES_FAILCNT:
651 		counter->failcnt = 0;
652 		break;
653 	case RES_RSVD_FAILCNT:
654 		rsvd_counter->failcnt = 0;
655 		break;
656 	default:
657 		ret = -EINVAL;
658 		break;
659 	}
660 	return ret ?: nbytes;
661 }
662 
mem_fmt(char * buf,int size,unsigned long hsize)663 static char *mem_fmt(char *buf, int size, unsigned long hsize)
664 {
665 	if (hsize >= SZ_1G)
666 		snprintf(buf, size, "%luGB", hsize / SZ_1G);
667 	else if (hsize >= SZ_1M)
668 		snprintf(buf, size, "%luMB", hsize / SZ_1M);
669 	else
670 		snprintf(buf, size, "%luKB", hsize / SZ_1K);
671 	return buf;
672 }
673 
__hugetlb_events_show(struct seq_file * seq,bool local)674 static int __hugetlb_events_show(struct seq_file *seq, bool local)
675 {
676 	int idx;
677 	long max;
678 	struct cftype *cft = seq_cft(seq);
679 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
680 
681 	idx = MEMFILE_IDX(cft->private);
682 
683 	if (local)
684 		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
685 	else
686 		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
687 
688 	seq_printf(seq, "max %lu\n", max);
689 
690 	return 0;
691 }
692 
hugetlb_events_show(struct seq_file * seq,void * v)693 static int hugetlb_events_show(struct seq_file *seq, void *v)
694 {
695 	return __hugetlb_events_show(seq, false);
696 }
697 
hugetlb_events_local_show(struct seq_file * seq,void * v)698 static int hugetlb_events_local_show(struct seq_file *seq, void *v)
699 {
700 	return __hugetlb_events_show(seq, true);
701 }
702 
703 static struct cftype hugetlb_dfl_tmpl[] = {
704 	{
705 		.name = "max",
706 		.private = RES_LIMIT,
707 		.seq_show = hugetlb_cgroup_read_u64_max,
708 		.write = hugetlb_cgroup_write_dfl,
709 		.flags = CFTYPE_NOT_ON_ROOT,
710 	},
711 	{
712 		.name = "rsvd.max",
713 		.private = RES_RSVD_LIMIT,
714 		.seq_show = hugetlb_cgroup_read_u64_max,
715 		.write = hugetlb_cgroup_write_dfl,
716 		.flags = CFTYPE_NOT_ON_ROOT,
717 	},
718 	{
719 		.name = "current",
720 		.private = RES_USAGE,
721 		.seq_show = hugetlb_cgroup_read_u64_max,
722 		.flags = CFTYPE_NOT_ON_ROOT,
723 	},
724 	{
725 		.name = "rsvd.current",
726 		.private = RES_RSVD_USAGE,
727 		.seq_show = hugetlb_cgroup_read_u64_max,
728 		.flags = CFTYPE_NOT_ON_ROOT,
729 	},
730 	{
731 		.name = "events",
732 		.seq_show = hugetlb_events_show,
733 		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
734 		.flags = CFTYPE_NOT_ON_ROOT,
735 	},
736 	{
737 		.name = "events.local",
738 		.seq_show = hugetlb_events_local_show,
739 		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
740 		.flags = CFTYPE_NOT_ON_ROOT,
741 	},
742 	{
743 		.name = "numa_stat",
744 		.seq_show = hugetlb_cgroup_read_numa_stat,
745 		.flags = CFTYPE_NOT_ON_ROOT,
746 	},
747 	/* don't need terminator here */
748 };
749 
750 static struct cftype hugetlb_legacy_tmpl[] = {
751 	{
752 		.name = "limit_in_bytes",
753 		.private = RES_LIMIT,
754 		.read_u64 = hugetlb_cgroup_read_u64,
755 		.write = hugetlb_cgroup_write_legacy,
756 	},
757 	{
758 		.name = "rsvd.limit_in_bytes",
759 		.private = RES_RSVD_LIMIT,
760 		.read_u64 = hugetlb_cgroup_read_u64,
761 		.write = hugetlb_cgroup_write_legacy,
762 	},
763 	{
764 		.name = "usage_in_bytes",
765 		.private = RES_USAGE,
766 		.read_u64 = hugetlb_cgroup_read_u64,
767 	},
768 	{
769 		.name = "rsvd.usage_in_bytes",
770 		.private = RES_RSVD_USAGE,
771 		.read_u64 = hugetlb_cgroup_read_u64,
772 	},
773 	{
774 		.name = "max_usage_in_bytes",
775 		.private = RES_MAX_USAGE,
776 		.write = hugetlb_cgroup_reset,
777 		.read_u64 = hugetlb_cgroup_read_u64,
778 	},
779 	{
780 		.name = "rsvd.max_usage_in_bytes",
781 		.private = RES_RSVD_MAX_USAGE,
782 		.write = hugetlb_cgroup_reset,
783 		.read_u64 = hugetlb_cgroup_read_u64,
784 	},
785 	{
786 		.name = "failcnt",
787 		.private = RES_FAILCNT,
788 		.write = hugetlb_cgroup_reset,
789 		.read_u64 = hugetlb_cgroup_read_u64,
790 	},
791 	{
792 		.name = "rsvd.failcnt",
793 		.private = RES_RSVD_FAILCNT,
794 		.write = hugetlb_cgroup_reset,
795 		.read_u64 = hugetlb_cgroup_read_u64,
796 	},
797 	{
798 		.name = "numa_stat",
799 		.seq_show = hugetlb_cgroup_read_numa_stat,
800 	},
801 	/* don't need terminator here */
802 };
803 
804 static void __init
hugetlb_cgroup_cfttypes_init(struct hstate * h,struct cftype * cft,struct cftype * tmpl,int tmpl_size)805 hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
806 			     struct cftype *tmpl, int tmpl_size)
807 {
808 	char buf[32];
809 	int i, idx = hstate_index(h);
810 
811 	/* format the size */
812 	mem_fmt(buf, sizeof(buf), huge_page_size(h));
813 
814 	for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
815 		*cft = *tmpl;
816 		/* rebuild the name */
817 		scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
818 		/* rebuild the private */
819 		cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
820 		/* rebuild the file_offset */
821 		if (tmpl->file_offset) {
822 			unsigned int offset = tmpl->file_offset;
823 
824 			cft->file_offset = MEMFILE_OFFSET0(offset) +
825 					   MEMFILE_FIELD_SIZE(offset) * idx;
826 		}
827 
828 		lockdep_register_key(&cft->lockdep_key);
829 	}
830 }
831 
__hugetlb_cgroup_file_dfl_init(struct hstate * h)832 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
833 {
834 	int idx = hstate_index(h);
835 
836 	hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
837 				     hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
838 }
839 
__hugetlb_cgroup_file_legacy_init(struct hstate * h)840 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
841 {
842 	int idx = hstate_index(h);
843 
844 	hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
845 				     hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
846 }
847 
__hugetlb_cgroup_file_init(struct hstate * h)848 static void __init __hugetlb_cgroup_file_init(struct hstate *h)
849 {
850 	__hugetlb_cgroup_file_dfl_init(h);
851 	__hugetlb_cgroup_file_legacy_init(h);
852 }
853 
__hugetlb_cgroup_file_pre_init(void)854 static void __init __hugetlb_cgroup_file_pre_init(void)
855 {
856 	int cft_count;
857 
858 	cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
859 	dfl_files = kzalloc_objs(struct cftype, cft_count);
860 	BUG_ON(!dfl_files);
861 	cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
862 	legacy_files = kzalloc_objs(struct cftype, cft_count);
863 	BUG_ON(!legacy_files);
864 }
865 
__hugetlb_cgroup_file_post_init(void)866 static void __init __hugetlb_cgroup_file_post_init(void)
867 {
868 	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
869 				       dfl_files));
870 	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
871 					  legacy_files));
872 }
873 
hugetlb_cgroup_file_init(void)874 void __init hugetlb_cgroup_file_init(void)
875 {
876 	struct hstate *h;
877 
878 	__hugetlb_cgroup_file_pre_init();
879 	for_each_hstate(h)
880 		__hugetlb_cgroup_file_init(h);
881 	__hugetlb_cgroup_file_post_init();
882 }
883 
884 /*
885  * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
886  * when we migrate hugepages
887  */
hugetlb_cgroup_migrate(struct folio * old_folio,struct folio * new_folio)888 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
889 {
890 	struct hugetlb_cgroup *h_cg;
891 	struct hugetlb_cgroup *h_cg_rsvd;
892 	struct hstate *h = folio_hstate(old_folio);
893 
894 	if (hugetlb_cgroup_disabled())
895 		return;
896 
897 	spin_lock_irq(&hugetlb_lock);
898 	h_cg = hugetlb_cgroup_from_folio(old_folio);
899 	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
900 	set_hugetlb_cgroup(old_folio, NULL);
901 	set_hugetlb_cgroup_rsvd(old_folio, NULL);
902 
903 	/* move the h_cg details to new cgroup */
904 	set_hugetlb_cgroup(new_folio, h_cg);
905 	set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
906 	list_move(&new_folio->lru, &h->hugepage_activelist);
907 	spin_unlock_irq(&hugetlb_lock);
908 }
909 
910 static struct cftype hugetlb_files[] = {
911 	{} /* terminate */
912 };
913 
914 struct cgroup_subsys hugetlb_cgrp_subsys = {
915 	.css_alloc	= hugetlb_cgroup_css_alloc,
916 	.css_offline	= hugetlb_cgroup_css_offline,
917 	.css_free	= hugetlb_cgroup_css_free,
918 	.dfl_cftypes	= hugetlb_files,
919 	.legacy_cftypes	= hugetlb_files,
920 };
921