1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3 *
4 * Copyright IBM Corporation, 2012
5 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
6 *
7 * Cgroup v2
8 * Copyright (C) 2019 Red Hat, Inc.
9 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
10 *
11 */
12
13 #include <linux/cgroup.h>
14 #include <linux/page_counter.h>
15 #include <linux/slab.h>
16 #include <linux/hugetlb.h>
17 #include <linux/hugetlb_cgroup.h>
18
/*
 * A cftype's ->private packs the hstate index in the high 16 bits and a
 * RES_* attribute (see the enum below) in the low 16 bits.
 */
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/* Use t->m[0] to encode the offset */
#define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
#define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff)
#define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff)

#define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl)
#define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl)

/* The root hugetlb cgroup, cached when its css is allocated. */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
/* Per-hstate cftype arrays built from the templates at boot. */
static struct cftype *dfl_files;
static struct cftype *legacy_files;
34
35 static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup * h_cg,int idx,bool rsvd)36 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
37 bool rsvd)
38 {
39 if (rsvd)
40 return &h_cg->rsvd_hugepage[idx];
41 return &h_cg->hugepage[idx];
42 }
43
44 static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup * h_cg,int idx)45 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
46 {
47 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
48 }
49
50 static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup * h_cg,int idx)51 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
52 {
53 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
54 }
55
56 static inline
hugetlb_cgroup_from_css(struct cgroup_subsys_state * s)57 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
58 {
59 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
60 }
61
62 static inline
hugetlb_cgroup_from_task(struct task_struct * task)63 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
64 {
65 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
66 }
67
hugetlb_cgroup_is_root(struct hugetlb_cgroup * h_cg)68 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
69 {
70 return (h_cg == root_h_cgroup);
71 }
72
73 static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup * h_cg)74 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
75 {
76 return hugetlb_cgroup_from_css(h_cg->css.parent);
77 }
78
hugetlb_cgroup_have_usage(struct hugetlb_cgroup * h_cg)79 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
80 {
81 struct hstate *h;
82
83 for_each_hstate(h) {
84 if (page_counter_read(
85 hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
86 return true;
87 }
88 return false;
89 }
90
/*
 * Initialise the fault and reservation page counters of @h_cgroup for every
 * possible hstate, parenting each counter to the matching counter of
 * @parent_h_cgroup (NULL for the root cgroup).
 */
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *fault, *fault_parent = NULL;
		struct page_counter *rsvd, *rsvd_parent = NULL;
		unsigned long limit;

		if (parent_h_cgroup) {
			fault_parent = hugetlb_cgroup_counter_from_cgroup(
				parent_h_cgroup, idx);
			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
				parent_h_cgroup, idx);
		}
		fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
		rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);

		page_counter_init(fault, fault_parent, false);
		page_counter_init(rsvd, rsvd_parent, false);

		/* Only the legacy (v1) interface exposes failcnt files. */
		if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
			fault->track_failcnt = true;
			rsvd->track_failcnt = true;
		}

		/* Start with the maximum limit, rounded to whole huge pages. */
		limit = round_down(PAGE_COUNTER_MAX,
				   pages_per_huge_page(&hstates[idx]));

		/* Cannot fail: nothing is charged against a fresh counter. */
		VM_BUG_ON(page_counter_set_max(fault, limit));
		VM_BUG_ON(page_counter_set_max(rsvd, limit));
	}
}
125
hugetlb_cgroup_free(struct hugetlb_cgroup * h_cgroup)126 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
127 {
128 int node;
129
130 for_each_node(node)
131 kfree(h_cgroup->nodeinfo[node]);
132 kfree(h_cgroup);
133 }
134
/*
 * Allocate and set up a hugetlb_cgroup for a new css.  The structure has a
 * flexible nodeinfo[] tail sized by nr_node_ids; each node's stats are
 * allocated separately, preferring that node's memory when it is online.
 * Returns the embedded css, or ERR_PTR(-ENOMEM).
 */
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int node;

	h_cgroup = kzalloc_flex(*h_cgroup, nodeinfo, nr_node_ids);

	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	/* No parent css means this is the root cgroup; remember it. */
	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	/*
	 * TODO: this routine can waste much memory for nodes which will
	 * never be onlined. It's better to use memory hotplug callback
	 * function.
	 */
	for_each_node(node) {
		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
		int node_to_alloc =
			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
		h_cgroup->nodeinfo[node] =
			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
				     GFP_KERNEL, node_to_alloc);
		if (!h_cgroup->nodeinfo[node])
			goto fail_alloc_nodeinfo;
	}

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;

fail_alloc_nodeinfo:
	/* Frees the partially-filled nodeinfo[] (kfree(NULL) is a no-op). */
	hugetlb_cgroup_free(h_cgroup);
	return ERR_PTR(-ENOMEM);
}
173
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	hugetlb_cgroup_free(h_cg);
}
178
179 /*
180 * Should be called with hugetlb_lock held.
181 * Since we are holding hugetlb_lock, pages cannot get moved from
182 * active list or uncharged from the cgroup, So no need to get
183 * page reference and test for page active here. This function
184 * cannot fail.
185 */
/*
 * Reparent @folio's fault charge from @h_cg (being offlined) to its parent.
 * @idx is the hstate index.  Caller holds hugetlb_lock (see the comment
 * above), so the folio cannot be uncharged concurrently.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct folio *folio)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	hcg = hugetlb_cgroup_from_folio(folio);
	/*
	 * We can have pages in active list without any cgroup
	 * ie, hugepage with less than 3 pages. We can safely
	 * ignore those pages.
	 */
	if (!hcg || hcg != h_cg)
		goto out;

	nr_pages = folio_nr_pages(folio);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	/* Point the folio at its new owning cgroup. */
	set_hugetlb_cgroup(folio, parent);
out:
	return;
}
217
218 /*
219 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
220 * the parent cgroup.
221 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct folio *folio;

	/*
	 * Repeat until the fault counters read zero: each pass walks every
	 * hstate's active list under hugetlb_lock and reparents any folio
	 * still charged to this cgroup.
	 */
	do {
		for_each_hstate(h) {
			spin_lock_irq(&hugetlb_lock);
			list_for_each_entry(folio, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);

			spin_unlock_irq(&hugetlb_lock);
		}
		/* Don't hog the CPU between passes. */
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}
239
/*
 * Record a memory event for hstate @idx.  The local counter is bumped on
 * @hugetlb only; the hierarchical counter is bumped on @hugetlb and every
 * ancestor up to, but excluding, the root.  Each bump also notifies the
 * corresponding events file so pollers wake up.
 */
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}
252
/*
 * Charge @nr_pages of hstate @idx against current's hugetlb cgroup.
 * On success *@ptr is the charged cgroup; for reservation charges (@rsvd)
 * the css reference taken here is kept and released at uncharge time.
 * Returns 0 on success, -ENOMEM if the counter limit would be exceeded.
 */
static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
					  struct hugetlb_cgroup **ptr,
					  bool rsvd)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	/* Retry if the cgroup is racing with removal. */
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(
		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
		    nr_pages, &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
		css_put(&h_cg->css);
		goto done;
	}
	/* Reservations take a reference to the css because they do not get
	 * reparented.
	 */
	if (!rsvd)
		css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}
289
hugetlb_cgroup_charge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr)290 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
291 struct hugetlb_cgroup **ptr)
292 {
293 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
294 }
295
hugetlb_cgroup_charge_cgroup_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup ** ptr)296 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
297 struct hugetlb_cgroup **ptr)
298 {
299 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
300 }
301
302 /* Should be called with hugetlb_lock held */
/*
 * Bind a previously-charged cgroup @h_cg to @folio and, for fault charges,
 * account the pages in the per-node usage stats.  Caller holds hugetlb_lock.
 */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
					   struct hugetlb_cgroup *h_cg,
					   struct folio *folio, bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;
	lockdep_assert_held(&hugetlb_lock);
	__set_hugetlb_cgroup(folio, h_cg, rsvd);
	if (!rsvd) {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage + nr_pages);
	}
}
323
hugetlb_cgroup_commit_charge(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,struct folio * folio)324 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
325 struct hugetlb_cgroup *h_cg,
326 struct folio *folio)
327 {
328 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
329 }
330
hugetlb_cgroup_commit_charge_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg,struct folio * folio)331 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
332 struct hugetlb_cgroup *h_cg,
333 struct folio *folio)
334 {
335 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
336 }
337
338 /*
339 * Should be called with hugetlb_lock held
340 */
/*
 * Undo the charge bound to @folio: clear the folio's cgroup pointer,
 * uncharge the page counter and update bookkeeping.  For reservation
 * charges the css reference taken at charge time is dropped; for fault
 * charges the per-node usage stats are decremented instead.
 * Caller holds hugetlb_lock.
 */
static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
					    struct folio *folio, bool rsvd)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
	/* Folio was never charged (or already uncharged) — nothing to do. */
	if (unlikely(!h_cg))
		return;
	__set_hugetlb_cgroup(folio, NULL, rsvd);

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
	else {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage - nr_pages);
	}
}
372
hugetlb_cgroup_uncharge_folio(int idx,unsigned long nr_pages,struct folio * folio)373 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
374 struct folio *folio)
375 {
376 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
377 }
378
hugetlb_cgroup_uncharge_folio_rsvd(int idx,unsigned long nr_pages,struct folio * folio)379 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
380 struct folio *folio)
381 {
382 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
383 }
384
/*
 * Return @nr_pages of hstate @idx to @h_cg's counter without any folio
 * involved (e.g. a charge that was never committed).
 */
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
					     struct hugetlb_cgroup *h_cg,
					     bool rsvd)
{
	struct page_counter *counter;

	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	counter = __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd);
	page_counter_uncharge(counter, nr_pages);

	/* Reservation charges pin the css; drop that reference here. */
	if (rsvd)
		css_put(&h_cg->css);
}
399
hugetlb_cgroup_uncharge_cgroup(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg)400 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
401 struct hugetlb_cgroup *h_cg)
402 {
403 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
404 }
405
hugetlb_cgroup_uncharge_cgroup_rsvd(int idx,unsigned long nr_pages,struct hugetlb_cgroup * h_cg)406 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
407 struct hugetlb_cgroup *h_cg)
408 {
409 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
410 }
411
hugetlb_cgroup_uncharge_counter(struct resv_map * resv,unsigned long start,unsigned long end)412 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
413 unsigned long end)
414 {
415 if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
416 !resv->css)
417 return;
418
419 page_counter_uncharge(resv->reservation_counter,
420 (end - start) * resv->pages_per_hpage);
421 css_put(resv->css);
422 }
423
/*
 * Uncharge @nr_pages huge pages recorded against file region @rg of @resv.
 * Only acts when the charge lives on the region itself rather than the
 * resv_map (i.e. rg->reservation_counter is set but the map-wide counter
 * is not).  @region_del says whether the whole region is being removed.
 */
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
					 struct file_region *rg,
					 unsigned long nr_pages,
					 bool region_del)
{
	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
		return;

	if (rg->reservation_counter && resv->pages_per_hpage &&
	    !resv->reservation_counter) {
		page_counter_uncharge(rg->reservation_counter,
				      nr_pages * resv->pages_per_hpage);
		/*
		 * Only do css_put(rg->css) when we delete the entire region
		 * because one file_region must hold exactly one css reference.
		 */
		if (region_del)
			css_put(rg->css);
	}
}
444
/*
 * Resource identifiers stored in the MEMFILE_ATTR() bits of ->private,
 * selecting which counter field a control file reads or writes.
 */
enum {
	RES_USAGE,		/* current fault usage */
	RES_RSVD_USAGE,		/* current reservation usage */
	RES_LIMIT,		/* fault limit */
	RES_RSVD_LIMIT,		/* reservation limit */
	RES_MAX_USAGE,		/* fault usage watermark */
	RES_RSVD_MAX_USAGE,	/* reservation usage watermark */
	RES_FAILCNT,		/* fault limit-hit count */
	RES_RSVD_FAILCNT,	/* reservation limit-hit count */
};
455
/*
 * seq_show handler for the numa_stat file.  Prints per-node fault usage:
 * on v1 both a non-hierarchical total ("total=") and a hierarchical one
 * ("hierarchical_total="); on v2 only the hierarchical form ("total=").
 */
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
	int nid;
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cgroup_subsys_state *css;
	unsigned long usage;

	if (legacy) {
		/* Add up usage across all nodes for the non-hierarchical total. */
		usage = 0;
		for_each_node_state(nid, N_MEMORY)
			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

		/* Simply print the per-node usage for the non-hierarchical total. */
		for_each_node_state(nid, N_MEMORY)
			seq_printf(seq, " N%d=%lu", nid,
				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
					   PAGE_SIZE);
		seq_putc(seq, '\n');
	}

	/*
	 * The hierarchical total is pretty much the value recorded by the
	 * counter, so use that.
	 */
	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

	/*
	 * For each node, transverse the css tree to obtain the hierarchical
	 * node usage.
	 */
	for_each_node_state(nid, N_MEMORY) {
		usage = 0;
		rcu_read_lock();
		css_for_each_descendant_pre(css, &h_cg->css) {
			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
						   ->nodeinfo[nid]
						   ->usage[idx]);
		}
		rcu_read_unlock();
		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
	}

	seq_putc(seq, '\n');

	return 0;
}
508
/*
 * read_u64 handler for the legacy (v1) files.  Decodes the hstate index
 * and RES_* attribute from cft->private and returns the matching counter
 * value, converted from pages to bytes where applicable.
 */
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct page_counter *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_RSVD_USAGE:
		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_RSVD_LIMIT:
		return (u64)rsvd_counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_RSVD_MAX_USAGE:
		return (u64)rsvd_counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		/* failcnt is a raw event count, not scaled to bytes. */
		return counter->failcnt;
	case RES_RSVD_FAILCNT:
		return rsvd_counter->failcnt;
	default:
		/* Template files only encode the attributes above. */
		BUG();
	}
}
540
/*
 * seq_show handler for the cgroup-v2 files ("max", "rsvd.max", "current",
 * "rsvd.current").  Limits equal to the page-rounded PAGE_COUNTER_MAX are
 * displayed as the string "max"; everything else is printed in bytes.
 */
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	/* Same rounding as page_counter_set_max() users in this file. */
	limit = round_down(PAGE_COUNTER_MAX,
			   pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_RSVD_USAGE:
		/* rsvd variants switch to the reservation counter ... */
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_USAGE:
		/* ... then share the printing logic with the fault file. */
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_RSVD_LIMIT:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return 0;
}
580
/* Serialises limit updates performed by hugetlb_cgroup_write(). */
static DEFINE_MUTEX(hugetlb_limit_mutex);
582
/*
 * Common write handler for the limit files.  @max is the token meaning
 * "no limit" ("-1" on v1, "max" on v2), forwarded to
 * page_counter_memparse().  Returns @nbytes on success or a -errno.
 */
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
	bool rsvd = false;

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	/* Limits are kept in whole huge pages of this hstate. */
	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_RSVD_LIMIT:
		rsvd = true;
		fallthrough;
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(
			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
			nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}
620
hugetlb_cgroup_write_legacy(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)621 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
622 char *buf, size_t nbytes, loff_t off)
623 {
624 return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
625 }
626
hugetlb_cgroup_write_dfl(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)627 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
628 char *buf, size_t nbytes, loff_t off)
629 {
630 return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
631 }
632
/*
 * Write handler for the legacy reset files: writing anything to a
 * max_usage or failcnt file resets the corresponding statistic to zero.
 */
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter, *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_RSVD_MAX_USAGE:
		page_counter_reset_watermark(rsvd_counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	case RES_RSVD_FAILCNT:
		rsvd_counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}
662
mem_fmt(char * buf,int size,unsigned long hsize)663 static char *mem_fmt(char *buf, int size, unsigned long hsize)
664 {
665 if (hsize >= SZ_1G)
666 snprintf(buf, size, "%luGB", hsize / SZ_1G);
667 else if (hsize >= SZ_1M)
668 snprintf(buf, size, "%luMB", hsize / SZ_1M);
669 else
670 snprintf(buf, size, "%luKB", hsize / SZ_1K);
671 return buf;
672 }
673
/*
 * Shared seq_show body for "events" and "events.local".  @local selects
 * the cgroup-local counter; otherwise the hierarchical counter (bumped on
 * ancestors too, see hugetlb_event()) is reported.
 */
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	long max;

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}
692
hugetlb_events_show(struct seq_file * seq,void * v)693 static int hugetlb_events_show(struct seq_file *seq, void *v)
694 {
695 return __hugetlb_events_show(seq, false);
696 }
697
hugetlb_events_local_show(struct seq_file * seq,void * v)698 static int hugetlb_events_local_show(struct seq_file *seq, void *v)
699 {
700 return __hugetlb_events_show(seq, true);
701 }
702
/*
 * Template for the cgroup-v2 files.  hugetlb_cgroup_cfttypes_init() stamps
 * out one copy per hstate, prefixing the name with the formatted page size
 * and encoding the hstate index into ->private / ->file_offset.
 */
static struct cftype hugetlb_dfl_tmpl[] = {
	{
		.name = "max",
		.private = RES_LIMIT,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.write = hugetlb_cgroup_write_dfl,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "rsvd.max",
		.private = RES_RSVD_LIMIT,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.write = hugetlb_cgroup_write_dfl,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.private = RES_USAGE,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "rsvd.current",
		.private = RES_RSVD_USAGE,
		.seq_show = hugetlb_cgroup_read_u64_max,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events",
		.seq_show = hugetlb_events_show,
		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "events.local",
		.seq_show = hugetlb_events_local_show,
		.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "numa_stat",
		.seq_show = hugetlb_cgroup_read_numa_stat,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	/* don't need terminator here */
};
749
/*
 * Template for the legacy (cgroup-v1) files; instantiated per hstate by
 * hugetlb_cgroup_cfttypes_init() like hugetlb_dfl_tmpl above.
 */
static struct cftype hugetlb_legacy_tmpl[] = {
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.read_u64 = hugetlb_cgroup_read_u64,
		.write = hugetlb_cgroup_write_legacy,
	},
	{
		.name = "rsvd.limit_in_bytes",
		.private = RES_RSVD_LIMIT,
		.read_u64 = hugetlb_cgroup_read_u64,
		.write = hugetlb_cgroup_write_legacy,
	},
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.usage_in_bytes",
		.private = RES_RSVD_USAGE,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		/* Writing to these files resets the statistic (see hugetlb_cgroup_reset). */
		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.max_usage_in_bytes",
		.private = RES_RSVD_MAX_USAGE,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "rsvd.failcnt",
		.private = RES_RSVD_FAILCNT,
		.write = hugetlb_cgroup_reset,
		.read_u64 = hugetlb_cgroup_read_u64,
	},
	{
		.name = "numa_stat",
		.seq_show = hugetlb_cgroup_read_numa_stat,
	},
	/* don't need terminator here */
};
803
/*
 * Instantiate the @tmpl_size template entries of @tmpl into @cft for
 * hstate @h: prefix each name with the formatted page size, encode the
 * hstate index into ->private, and rebase per-hstate ->file_offset values.
 */
static void __init
hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
			     struct cftype *tmpl, int tmpl_size)
{
	char buf[32];
	int i, idx = hstate_index(h);

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
		*cft = *tmpl;
		/* rebuild the name */
		scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
		/* rebuild the private */
		cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
		/* rebuild the file_offset */
		if (tmpl->file_offset) {
			unsigned int offset = tmpl->file_offset;

			/* Template encodes base offset + element size (MEMFILE_OFFSET). */
			cft->file_offset = MEMFILE_OFFSET0(offset) +
					   MEMFILE_FIELD_SIZE(offset) * idx;
		}

		lockdep_register_key(&cft->lockdep_key);
	}
}
831
__hugetlb_cgroup_file_dfl_init(struct hstate * h)832 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
833 {
834 int idx = hstate_index(h);
835
836 hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
837 hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
838 }
839
__hugetlb_cgroup_file_legacy_init(struct hstate * h)840 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
841 {
842 int idx = hstate_index(h);
843
844 hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
845 hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
846 }
847
__hugetlb_cgroup_file_init(struct hstate * h)848 static void __init __hugetlb_cgroup_file_init(struct hstate *h)
849 {
850 __hugetlb_cgroup_file_dfl_init(h);
851 __hugetlb_cgroup_file_legacy_init(h);
852 }
853
__hugetlb_cgroup_file_pre_init(void)854 static void __init __hugetlb_cgroup_file_pre_init(void)
855 {
856 int cft_count;
857
858 cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
859 dfl_files = kzalloc_objs(struct cftype, cft_count);
860 BUG_ON(!dfl_files);
861 cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
862 legacy_files = kzalloc_objs(struct cftype, cft_count);
863 BUG_ON(!legacy_files);
864 }
865
__hugetlb_cgroup_file_post_init(void)866 static void __init __hugetlb_cgroup_file_post_init(void)
867 {
868 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
869 dfl_files));
870 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
871 legacy_files));
872 }
873
hugetlb_cgroup_file_init(void)874 void __init hugetlb_cgroup_file_init(void)
875 {
876 struct hstate *h;
877
878 __hugetlb_cgroup_file_pre_init();
879 for_each_hstate(h)
880 __hugetlb_cgroup_file_init(h);
881 __hugetlb_cgroup_file_post_init();
882 }
883
884 /*
885 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
886 * when we migrate hugepages
887 */
/*
 * Transfer the hugetlb cgroup (both fault and reservation) from
 * @old_folio to @new_folio during hugepage migration, and put the new
 * folio on the active list.  Done under hugetlb_lock, which also keeps
 * the cgroup alive (see the comment above).
 */
void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
	struct hugetlb_cgroup *h_cg;
	struct hugetlb_cgroup *h_cg_rsvd;
	struct hstate *h = folio_hstate(old_folio);

	if (hugetlb_cgroup_disabled())
		return;

	spin_lock_irq(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_folio(old_folio);
	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
	set_hugetlb_cgroup(old_folio, NULL);
	set_hugetlb_cgroup_rsvd(old_folio, NULL);

	/* move the h_cg details to new cgroup */
	set_hugetlb_cgroup(new_folio, h_cg);
	set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
	list_move(&new_folio->lru, &h->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
}
909
/*
 * Placeholder registered at subsys init; the real per-hstate files are
 * built from the templates above and added by hugetlb_cgroup_file_init().
 */
static struct cftype hugetlb_files[] = {
	{} /* terminate */
};
913
/* hugetlb cgroup controller: css lifecycle callbacks and control files. */
struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc = hugetlb_cgroup_css_alloc,
	.css_offline = hugetlb_cgroup_css_offline,
	.css_free = hugetlb_cgroup_css_free,
	.dfl_cftypes = hugetlb_files,
	.legacy_cftypes = hugetlb_files,
};
921