1 /* 2 * 3 * Copyright IBM Corporation, 2012 4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 5 * 6 * Cgroup v2 7 * Copyright (C) 2019 Red Hat, Inc. 8 * Author: Giuseppe Scrivano <gscrivan@redhat.com> 9 * 10 * This program is free software; you can redistribute it and/or modify it 11 * under the terms of version 2.1 of the GNU Lesser General Public License 12 * as published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it would be useful, but 15 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 * 18 */ 19 20 #include <linux/cgroup.h> 21 #include <linux/page_counter.h> 22 #include <linux/slab.h> 23 #include <linux/hugetlb.h> 24 #include <linux/hugetlb_cgroup.h> 25 26 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 27 #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 28 #define MEMFILE_ATTR(val) ((val) & 0xffff) 29 30 /* Use t->m[0] to encode the offset */ 31 #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) 32 #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) 33 #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) 34 35 #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) 36 #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) 37 38 static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 39 static struct cftype *dfl_files; 40 static struct cftype *legacy_files; 41 42 static inline struct page_counter * 43 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, 44 bool rsvd) 45 { 46 if (rsvd) 47 return &h_cg->rsvd_hugepage[idx]; 48 return &h_cg->hugepage[idx]; 49 } 50 51 static inline struct page_counter * 52 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) 53 { 54 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); 55 } 56 57 static inline struct page_counter * 58 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) 59 { 60 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); 61 } 62 63 static inline 64 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 65 { 66 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; 67 } 68 69 static inline 70 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 71 { 72 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); 73 } 74 75 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 76 { 77 return (h_cg == root_h_cgroup); 78 } 79 80 static inline struct hugetlb_cgroup * 81 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) 82 { 83 return hugetlb_cgroup_from_css(h_cg->css.parent); 84 } 85 86 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) 87 { 88 struct hstate *h; 89 90 for_each_hstate(h) { 91 if (page_counter_read( 92 hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) 93 return true; 94 } 95 return false; 96 } 97 98 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, 99 struct hugetlb_cgroup *parent_h_cgroup) 100 { 101 int idx; 102 103 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { 104 struct page_counter *fault, *fault_parent = NULL; 105 struct page_counter *rsvd, *rsvd_parent = NULL; 106 unsigned long limit; 107 108 if (parent_h_cgroup) { 109 fault_parent = hugetlb_cgroup_counter_from_cgroup( 110 parent_h_cgroup, idx); 111 rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( 112 parent_h_cgroup, idx); 113 } 114 fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx); 115 rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx); 116 117 page_counter_init(fault, fault_parent, false); 118 page_counter_init(rsvd, rsvd_parent, false); 119 120 if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) { 121 fault->track_failcnt = true; 122 rsvd->track_failcnt = true; 123 } 124 125 limit = round_down(PAGE_COUNTER_MAX, 126 pages_per_huge_page(&hstates[idx])); 127 128 VM_BUG_ON(page_counter_set_max(fault, limit)); 129 VM_BUG_ON(page_counter_set_max(rsvd, limit)); 130 } 131 } 132 133 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) 134 { 135 int node; 136 137 for_each_node(node) 138 kfree(h_cgroup->nodeinfo[node]); 139 kfree(h_cgroup); 140 } 141 142 static struct cgroup_subsys_state * 143 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 144 { 145 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); 146 struct hugetlb_cgroup *h_cgroup; 147 int node; 148 149 h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), 150 GFP_KERNEL); 151 152 if (!h_cgroup) 153 return ERR_PTR(-ENOMEM); 154 155 if (!parent_h_cgroup) 156 root_h_cgroup = h_cgroup; 157 158 /* 159 * TODO: this routine can waste much memory for nodes which will 160 * never be onlined. It's better to use memory hotplug callback 161 * function. 162 */ 163 for_each_node(node) { 164 /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ 165 int node_to_alloc = 166 node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; 167 h_cgroup->nodeinfo[node] = 168 kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), 169 GFP_KERNEL, node_to_alloc); 170 if (!h_cgroup->nodeinfo[node]) 171 goto fail_alloc_nodeinfo; 172 } 173 174 hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); 175 return &h_cgroup->css; 176 177 fail_alloc_nodeinfo: 178 hugetlb_cgroup_free(h_cgroup); 179 return ERR_PTR(-ENOMEM); 180 } 181 182 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) 183 { 184 hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); 185 } 186 187 /* 188 * Should be called with hugetlb_lock held. 189 * Since we are holding hugetlb_lock, pages cannot get moved from 190 * active list or uncharged from the cgroup, So no need to get 191 * page reference and test for page active here. This function 192 * cannot fail. 193 */ 194 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 195 struct folio *folio) 196 { 197 unsigned int nr_pages; 198 struct page_counter *counter; 199 struct hugetlb_cgroup *hcg; 200 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 201 202 hcg = hugetlb_cgroup_from_folio(folio); 203 /* 204 * We can have pages in active list without any cgroup 205 * ie, hugepage with less than 3 pages. We can safely 206 * ignore those pages. 207 */ 208 if (!hcg || hcg != h_cg) 209 goto out; 210 211 nr_pages = folio_nr_pages(folio); 212 if (!parent) { 213 parent = root_h_cgroup; 214 /* root has no limit */ 215 page_counter_charge(&parent->hugepage[idx], nr_pages); 216 } 217 counter = &h_cg->hugepage[idx]; 218 /* Take the pages off the local counter */ 219 page_counter_cancel(counter, nr_pages); 220 221 set_hugetlb_cgroup(folio, parent); 222 out: 223 return; 224 } 225 226 /* 227 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 228 * the parent cgroup. 229 */ 230 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) 231 { 232 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 233 struct hstate *h; 234 struct folio *folio; 235 236 do { 237 for_each_hstate(h) { 238 spin_lock_irq(&hugetlb_lock); 239 list_for_each_entry(folio, &h->hugepage_activelist, lru) 240 hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); 241 242 spin_unlock_irq(&hugetlb_lock); 243 } 244 cond_resched(); 245 } while (hugetlb_cgroup_have_usage(h_cg)); 246 } 247 248 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, 249 enum hugetlb_memory_event event) 250 { 251 atomic_long_inc(&hugetlb->events_local[idx][event]); 252 cgroup_file_notify(&hugetlb->events_local_file[idx]); 253 254 do { 255 atomic_long_inc(&hugetlb->events[idx][event]); 256 cgroup_file_notify(&hugetlb->events_file[idx]); 257 } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && 258 !hugetlb_cgroup_is_root(hugetlb)); 259 } 260 261 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 262 struct hugetlb_cgroup **ptr, 263 bool rsvd) 264 { 265 int ret = 0; 266 struct page_counter *counter; 267 struct hugetlb_cgroup *h_cg = NULL; 268 269 if (hugetlb_cgroup_disabled()) 270 goto done; 271 again: 272 rcu_read_lock(); 273 h_cg = hugetlb_cgroup_from_task(current); 274 if (!css_tryget(&h_cg->css)) { 275 rcu_read_unlock(); 276 goto again; 277 } 278 rcu_read_unlock(); 279 280 if (!page_counter_try_charge( 281 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 282 nr_pages, &counter)) { 283 ret = -ENOMEM; 284 hugetlb_event(h_cg, idx, HUGETLB_MAX); 285 css_put(&h_cg->css); 286 goto done; 287 } 288 /* Reservations take a reference to the css because they do not get 289 * reparented. 290 */ 291 if (!rsvd) 292 css_put(&h_cg->css); 293 done: 294 *ptr = h_cg; 295 return ret; 296 } 297 298 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 299 struct hugetlb_cgroup **ptr) 300 { 301 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); 302 } 303 304 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, 305 struct hugetlb_cgroup **ptr) 306 { 307 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); 308 } 309 310 /* Should be called with hugetlb_lock held */ 311 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 312 struct hugetlb_cgroup *h_cg, 313 struct folio *folio, bool rsvd) 314 { 315 if (hugetlb_cgroup_disabled() || !h_cg) 316 return; 317 lockdep_assert_held(&hugetlb_lock); 318 __set_hugetlb_cgroup(folio, h_cg, rsvd); 319 if (!rsvd) { 320 unsigned long usage = 321 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 322 /* 323 * This write is not atomic due to fetching usage and writing 324 * to it, but that's fine because we call this with 325 * hugetlb_lock held anyway. 326 */ 327 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 328 usage + nr_pages); 329 } 330 } 331 332 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 333 struct hugetlb_cgroup *h_cg, 334 struct folio *folio) 335 { 336 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); 337 } 338 339 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, 340 struct hugetlb_cgroup *h_cg, 341 struct folio *folio) 342 { 343 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); 344 } 345 346 /* 347 * Should be called with hugetlb_lock held 348 */ 349 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 350 struct folio *folio, bool rsvd) 351 { 352 struct hugetlb_cgroup *h_cg; 353 354 if (hugetlb_cgroup_disabled()) 355 return; 356 lockdep_assert_held(&hugetlb_lock); 357 h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); 358 if (unlikely(!h_cg)) 359 return; 360 __set_hugetlb_cgroup(folio, NULL, rsvd); 361 362 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 363 rsvd), 364 nr_pages); 365 366 if (rsvd) 367 css_put(&h_cg->css); 368 else { 369 unsigned long usage = 370 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 371 /* 372 * This write is not atomic due to fetching usage and writing 373 * to it, but that's fine because we call this with 374 * hugetlb_lock held anyway. 375 */ 376 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 377 usage - nr_pages); 378 } 379 } 380 381 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 382 struct folio *folio) 383 { 384 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); 385 } 386 387 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, 388 struct folio *folio) 389 { 390 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); 391 } 392 393 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 394 struct hugetlb_cgroup *h_cg, 395 bool rsvd) 396 { 397 if (hugetlb_cgroup_disabled() || !h_cg) 398 return; 399 400 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 401 rsvd), 402 nr_pages); 403 404 if (rsvd) 405 css_put(&h_cg->css); 406 } 407 408 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 409 struct hugetlb_cgroup *h_cg) 410 { 411 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); 412 } 413 414 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, 415 struct hugetlb_cgroup *h_cg) 416 { 417 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); 418 } 419 420 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, 421 unsigned long end) 422 { 423 if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || 424 !resv->css) 425 return; 426 427 page_counter_uncharge(resv->reservation_counter, 428 (end - start) * resv->pages_per_hpage); 429 css_put(resv->css); 430 } 431 432 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 433 struct file_region *rg, 434 unsigned long nr_pages, 435 bool region_del) 436 { 437 if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) 438 return; 439 440 if (rg->reservation_counter && resv->pages_per_hpage && 441 !resv->reservation_counter) { 442 page_counter_uncharge(rg->reservation_counter, 443 nr_pages * resv->pages_per_hpage); 444 /* 445 * Only do css_put(rg->css) when we delete the entire region 446 * because one file_region must hold exactly one css reference. 447 */ 448 if (region_del) 449 css_put(rg->css); 450 } 451 } 452 453 enum { 454 RES_USAGE, 455 RES_RSVD_USAGE, 456 RES_LIMIT, 457 RES_RSVD_LIMIT, 458 RES_MAX_USAGE, 459 RES_RSVD_MAX_USAGE, 460 RES_FAILCNT, 461 RES_RSVD_FAILCNT, 462 }; 463 464 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) 465 { 466 int nid; 467 struct cftype *cft = seq_cft(seq); 468 int idx = MEMFILE_IDX(cft->private); 469 bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); 470 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 471 struct cgroup_subsys_state *css; 472 unsigned long usage; 473 474 if (legacy) { 475 /* Add up usage across all nodes for the non-hierarchical total. */ 476 usage = 0; 477 for_each_node_state(nid, N_MEMORY) 478 usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); 479 seq_printf(seq, "total=%lu", usage * PAGE_SIZE); 480 481 /* Simply print the per-node usage for the non-hierarchical total. */ 482 for_each_node_state(nid, N_MEMORY) 483 seq_printf(seq, " N%d=%lu", nid, 484 READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * 485 PAGE_SIZE); 486 seq_putc(seq, '\n'); 487 } 488 489 /* 490 * The hierarchical total is pretty much the value recorded by the 491 * counter, so use that. 492 */ 493 seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", 494 page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); 495 496 /* 497 * For each node, transverse the css tree to obtain the hierarchical 498 * node usage. 499 */ 500 for_each_node_state(nid, N_MEMORY) { 501 usage = 0; 502 rcu_read_lock(); 503 css_for_each_descendant_pre(css, &h_cg->css) { 504 usage += READ_ONCE(hugetlb_cgroup_from_css(css) 505 ->nodeinfo[nid] 506 ->usage[idx]); 507 } 508 rcu_read_unlock(); 509 seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); 510 } 511 512 seq_putc(seq, '\n'); 513 514 return 0; 515 } 516 517 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 518 struct cftype *cft) 519 { 520 struct page_counter *counter; 521 struct page_counter *rsvd_counter; 522 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 523 524 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; 525 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; 526 527 switch (MEMFILE_ATTR(cft->private)) { 528 case RES_USAGE: 529 return (u64)page_counter_read(counter) * PAGE_SIZE; 530 case RES_RSVD_USAGE: 531 return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; 532 case RES_LIMIT: 533 return (u64)counter->max * PAGE_SIZE; 534 case RES_RSVD_LIMIT: 535 return (u64)rsvd_counter->max * PAGE_SIZE; 536 case RES_MAX_USAGE: 537 return (u64)counter->watermark * PAGE_SIZE; 538 case RES_RSVD_MAX_USAGE: 539 return (u64)rsvd_counter->watermark * PAGE_SIZE; 540 case RES_FAILCNT: 541 return counter->failcnt; 542 case RES_RSVD_FAILCNT: 543 return rsvd_counter->failcnt; 544 default: 545 BUG(); 546 } 547 } 548 549 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) 550 { 551 int idx; 552 u64 val; 553 struct cftype *cft = seq_cft(seq); 554 unsigned long limit; 555 struct page_counter *counter; 556 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 557 558 idx = MEMFILE_IDX(cft->private); 559 counter = &h_cg->hugepage[idx]; 560 561 limit = round_down(PAGE_COUNTER_MAX, 562 pages_per_huge_page(&hstates[idx])); 563 564 switch (MEMFILE_ATTR(cft->private)) { 565 case RES_RSVD_USAGE: 566 counter = &h_cg->rsvd_hugepage[idx]; 567 fallthrough; 568 case RES_USAGE: 569 val = (u64)page_counter_read(counter); 570 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 571 break; 572 case RES_RSVD_LIMIT: 573 counter = &h_cg->rsvd_hugepage[idx]; 574 fallthrough; 575 case RES_LIMIT: 576 val = (u64)counter->max; 577 if (val == limit) 578 seq_puts(seq, "max\n"); 579 else 580 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 581 break; 582 default: 583 BUG(); 584 } 585 586 return 0; 587 } 588 589 static DEFINE_MUTEX(hugetlb_limit_mutex); 590 591 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 592 char *buf, size_t nbytes, loff_t off, 593 const char *max) 594 { 595 int ret, idx; 596 unsigned long nr_pages; 597 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 598 bool rsvd = false; 599 600 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ 601 return -EINVAL; 602 603 buf = strstrip(buf); 604 ret = page_counter_memparse(buf, max, &nr_pages); 605 if (ret) 606 return ret; 607 608 idx = MEMFILE_IDX(of_cft(of)->private); 609 nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); 610 611 switch (MEMFILE_ATTR(of_cft(of)->private)) { 612 case RES_RSVD_LIMIT: 613 rsvd = true; 614 fallthrough; 615 case RES_LIMIT: 616 mutex_lock(&hugetlb_limit_mutex); 617 ret = page_counter_set_max( 618 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 619 nr_pages); 620 mutex_unlock(&hugetlb_limit_mutex); 621 break; 622 default: 623 ret = -EINVAL; 624 break; 625 } 626 return ret ?: nbytes; 627 } 628 629 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, 630 char *buf, size_t nbytes, loff_t off) 631 { 632 return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); 633 } 634 635 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, 636 char *buf, size_t nbytes, loff_t off) 637 { 638 return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); 639 } 640 641 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 642 char *buf, size_t nbytes, loff_t off) 643 { 644 int ret = 0; 645 struct page_counter *counter, *rsvd_counter; 646 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 647 648 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; 649 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; 650 651 switch (MEMFILE_ATTR(of_cft(of)->private)) { 652 case RES_MAX_USAGE: 653 page_counter_reset_watermark(counter); 654 break; 655 case RES_RSVD_MAX_USAGE: 656 page_counter_reset_watermark(rsvd_counter); 657 break; 658 case RES_FAILCNT: 659 counter->failcnt = 0; 660 break; 661 case RES_RSVD_FAILCNT: 662 rsvd_counter->failcnt = 0; 663 break; 664 default: 665 ret = -EINVAL; 666 break; 667 } 668 return ret ?: nbytes; 669 } 670 671 static char *mem_fmt(char *buf, int size, unsigned long hsize) 672 { 673 if (hsize >= SZ_1G) 674 snprintf(buf, size, "%luGB", hsize / SZ_1G); 675 else if (hsize >= SZ_1M) 676 snprintf(buf, size, "%luMB", hsize / SZ_1M); 677 else 678 snprintf(buf, size, "%luKB", hsize / SZ_1K); 679 return buf; 680 } 681 682 static int __hugetlb_events_show(struct seq_file *seq, bool local) 683 { 684 int idx; 685 long max; 686 struct cftype *cft = seq_cft(seq); 687 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 688 689 idx = MEMFILE_IDX(cft->private); 690 691 if (local) 692 max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); 693 else 694 max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); 695 696 seq_printf(seq, "max %lu\n", max); 697 698 return 0; 699 } 700 701 static int hugetlb_events_show(struct seq_file *seq, void *v) 702 { 703 return __hugetlb_events_show(seq, false); 704 } 705 706 static int hugetlb_events_local_show(struct seq_file *seq, void *v) 707 { 708 return __hugetlb_events_show(seq, true); 709 } 710 711 static struct cftype hugetlb_dfl_tmpl[] = { 712 { 713 .name = "max", 714 .private = RES_LIMIT, 715 .seq_show = hugetlb_cgroup_read_u64_max, 716 .write = hugetlb_cgroup_write_dfl, 717 .flags = CFTYPE_NOT_ON_ROOT, 718 }, 719 { 720 .name = "rsvd.max", 721 .private = RES_RSVD_LIMIT, 722 .seq_show = hugetlb_cgroup_read_u64_max, 723 .write = hugetlb_cgroup_write_dfl, 724 .flags = CFTYPE_NOT_ON_ROOT, 725 }, 726 { 727 .name = "current", 728 .private = RES_USAGE, 729 .seq_show = hugetlb_cgroup_read_u64_max, 730 .flags = CFTYPE_NOT_ON_ROOT, 731 }, 732 { 733 .name = "rsvd.current", 734 .private = RES_RSVD_USAGE, 735 .seq_show = hugetlb_cgroup_read_u64_max, 736 .flags = CFTYPE_NOT_ON_ROOT, 737 }, 738 { 739 .name = "events", 740 .seq_show = hugetlb_events_show, 741 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), 742 .flags = CFTYPE_NOT_ON_ROOT, 743 }, 744 { 745 .name = "events.local", 746 .seq_show = hugetlb_events_local_show, 747 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), 748 .flags = CFTYPE_NOT_ON_ROOT, 749 }, 750 { 751 .name = "numa_stat", 752 .seq_show = hugetlb_cgroup_read_numa_stat, 753 .flags = CFTYPE_NOT_ON_ROOT, 754 }, 755 /* don't need terminator here */ 756 }; 757 758 static struct cftype hugetlb_legacy_tmpl[] = { 759 { 760 .name = "limit_in_bytes", 761 .private = RES_LIMIT, 762 .read_u64 = hugetlb_cgroup_read_u64, 763 .write = hugetlb_cgroup_write_legacy, 764 }, 765 { 766 .name = "rsvd.limit_in_bytes", 767 .private = RES_RSVD_LIMIT, 768 .read_u64 = hugetlb_cgroup_read_u64, 769 .write = hugetlb_cgroup_write_legacy, 770 }, 771 { 772 .name = "usage_in_bytes", 773 .private = RES_USAGE, 774 .read_u64 = hugetlb_cgroup_read_u64, 775 }, 776 { 777 .name = "rsvd.usage_in_bytes", 778 .private = RES_RSVD_USAGE, 779 .read_u64 = hugetlb_cgroup_read_u64, 780 }, 781 { 782 .name = "max_usage_in_bytes", 783 .private = RES_MAX_USAGE, 784 .write = hugetlb_cgroup_reset, 785 .read_u64 = hugetlb_cgroup_read_u64, 786 }, 787 { 788 .name = "rsvd.max_usage_in_bytes", 789 .private = RES_RSVD_MAX_USAGE, 790 .write = hugetlb_cgroup_reset, 791 .read_u64 = hugetlb_cgroup_read_u64, 792 }, 793 { 794 .name = "failcnt", 795 .private = RES_FAILCNT, 796 .write = hugetlb_cgroup_reset, 797 .read_u64 = hugetlb_cgroup_read_u64, 798 }, 799 { 800 .name = "rsvd.failcnt", 801 .private = RES_RSVD_FAILCNT, 802 .write = hugetlb_cgroup_reset, 803 .read_u64 = hugetlb_cgroup_read_u64, 804 }, 805 { 806 .name = "numa_stat", 807 .seq_show = hugetlb_cgroup_read_numa_stat, 808 }, 809 /* don't need terminator here */ 810 }; 811 812 static void __init 813 hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, 814 struct cftype *tmpl, int tmpl_size) 815 { 816 char buf[32]; 817 int i, idx = hstate_index(h); 818 819 /* format the size */ 820 mem_fmt(buf, sizeof(buf), huge_page_size(h)); 821 822 for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { 823 *cft = *tmpl; 824 /* rebuild the name */ 825 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); 826 /* rebuild the private */ 827 cft->private = MEMFILE_PRIVATE(idx, tmpl->private); 828 /* rebuild the file_offset */ 829 if (tmpl->file_offset) { 830 unsigned int offset = tmpl->file_offset; 831 832 cft->file_offset = MEMFILE_OFFSET0(offset) + 833 MEMFILE_FIELD_SIZE(offset) * idx; 834 } 835 836 lockdep_register_key(&cft->lockdep_key); 837 } 838 } 839 840 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) 841 { 842 int idx = hstate_index(h); 843 844 hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, 845 hugetlb_dfl_tmpl, DFL_TMPL_SIZE); 846 } 847 848 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) 849 { 850 int idx = hstate_index(h); 851 852 hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, 853 hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); 854 } 855 856 static void __init __hugetlb_cgroup_file_init(struct hstate *h) 857 { 858 __hugetlb_cgroup_file_dfl_init(h); 859 __hugetlb_cgroup_file_legacy_init(h); 860 } 861 862 static void __init __hugetlb_cgroup_file_pre_init(void) 863 { 864 int cft_count; 865 866 cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ 867 dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 868 BUG_ON(!dfl_files); 869 cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ 870 legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 871 BUG_ON(!legacy_files); 872 } 873 874 static void __init __hugetlb_cgroup_file_post_init(void) 875 { 876 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, 877 dfl_files)); 878 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, 879 legacy_files)); 880 } 881 882 void __init hugetlb_cgroup_file_init(void) 883 { 884 struct hstate *h; 885 886 __hugetlb_cgroup_file_pre_init(); 887 for_each_hstate(h) 888 __hugetlb_cgroup_file_init(h); 889 __hugetlb_cgroup_file_post_init(); 890 } 891 892 /* 893 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen 894 * when we migrate hugepages 895 */ 896 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) 897 { 898 struct hugetlb_cgroup *h_cg; 899 struct hugetlb_cgroup *h_cg_rsvd; 900 struct hstate *h = folio_hstate(old_folio); 901 902 if (hugetlb_cgroup_disabled()) 903 return; 904 905 spin_lock_irq(&hugetlb_lock); 906 h_cg = hugetlb_cgroup_from_folio(old_folio); 907 h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); 908 set_hugetlb_cgroup(old_folio, NULL); 909 set_hugetlb_cgroup_rsvd(old_folio, NULL); 910 911 /* move the h_cg details to new cgroup */ 912 set_hugetlb_cgroup(new_folio, h_cg); 913 set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); 914 list_move(&new_folio->lru, &h->hugepage_activelist); 915 spin_unlock_irq(&hugetlb_lock); 916 } 917 918 static struct cftype hugetlb_files[] = { 919 {} /* terminate */ 920 }; 921 922 struct cgroup_subsys hugetlb_cgrp_subsys = { 923 .css_alloc = hugetlb_cgroup_css_alloc, 924 .css_offline = hugetlb_cgroup_css_offline, 925 .css_free = hugetlb_cgroup_css_free, 926 .dfl_cftypes = hugetlb_files, 927 .legacy_cftypes = hugetlb_files, 928 }; 929