/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
				     bool rsvd)
{
	if (rsvd)
		return &h_cg->rsvd_hugepage[idx];
	return &h_cg->hugepage[idx];
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (page_counter_read(
			    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
			return true;
	}
	return false;
}

static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *fault_parent = NULL;
		struct page_counter *rsvd_parent = NULL;
		unsigned long limit;
		int ret;

		if (parent_h_cgroup) {
			fault_parent = hugetlb_cgroup_counter_from_cgroup(
				parent_h_cgroup, idx);
			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
				parent_h_cgroup, idx);
		}
		page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
								     idx),
				  fault_parent);
		page_counter_init(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			rsvd_parent);

		limit = round_down(PAGE_COUNTER_MAX,
				   pages_per_huge_page(&hstates[idx]));

		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
	}
}
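
/*
 * A note on the default limit set above: PAGE_COUNTER_MAX is rounded down
 * to a whole number of huge pages (for example, with 4KiB base pages a
 * 2MiB hstate has pages_per_huge_page() == 512, so the default is the
 * largest multiple of 512 base pages not exceeding PAGE_COUNTER_MAX).
 * hugetlb_cgroup_read_u64_max() compares against the same rounded value to
 * decide whether to report the limit as "max".
 */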
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
	int node;

	for_each_node(node)
		kfree(h_cgroup->nodeinfo[node]);
	kfree(h_cgroup);
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int node;

	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
			   GFP_KERNEL);

	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	/*
	 * TODO: this routine can waste a lot of memory for nodes that will
	 * never be onlined. It would be better to use a memory hotplug
	 * callback function.
	 */
	for_each_node(node) {
		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
		int node_to_alloc =
			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
		h_cgroup->nodeinfo[node] =
			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
				     GFP_KERNEL, node_to_alloc);
		if (!h_cgroup->nodeinfo[node])
			goto fail_alloc_nodeinfo;
	}

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;

fail_alloc_nodeinfo:
	hugetlb_cgroup_free(h_cgroup);
	return ERR_PTR(-ENOMEM);
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference and test for page activity here. This function
 * cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
	struct folio *folio = page_folio(page);

	page_hcg = hugetlb_cgroup_from_folio(folio);
	/*
	 * We can have pages on the active list that belong to no cgroup,
	 * i.e., hugepages with fewer than 3 base pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = compound_nr(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(folio, parent);
out:
	return;
}
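
/*
 * Illustrative example (not from the original source): if cgroup /A/B is
 * removed while a 2MiB hugetlb page is still charged to it, css_offline
 * below walks the active lists and hugetlb_cgroup_move_parent() charges
 * the page to /A, cancels the charge on B and re-tags the folio, so the
 * usage is reparented rather than lost.
 */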
/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;

	do {
		for_each_hstate(h) {
			spin_lock_irq(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);

			spin_unlock_irq(&hugetlb_lock);
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}

static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
					  struct hugetlb_cgroup **ptr,
					  bool rsvd)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(
		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
		    nr_pages, &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
		css_put(&h_cg->css);
		goto done;
	}
	/* Reservations take a reference to the css because they do not get
	 * reparented.
	 */
	if (!rsvd)
		css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
				      struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}

/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
					   struct hugetlb_cgroup *h_cg,
					   struct folio *folio, bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	__set_hugetlb_cgroup(folio, h_cg, rsvd);
	if (!rsvd) {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage + nr_pages);
	}
}
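
/*
 * Sketch of the expected caller pattern (simplified, not the exact code in
 * mm/hugetlb.c): the allocation path first reserves the charge and only
 * later, under hugetlb_lock, binds it to the folio:
 *
 *	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 *	if (ret)
 *		goto out_err;
 *	...
 *	spin_lock_irq(&hugetlb_lock);
 *	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
 *	spin_unlock_irq(&hugetlb_lock);
 *
 * If allocation fails after a successful charge, the pre-charge is undone
 * with hugetlb_cgroup_uncharge_cgroup().
 */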
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}

void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
				       struct hugetlb_cgroup *h_cg,
				       struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}

/*
 * Should be called with hugetlb_lock held
 */
static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
					    struct folio *folio, bool rsvd)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
	if (unlikely(!h_cg))
		return;
	__set_hugetlb_cgroup(folio, NULL, rsvd);

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
	else {
		unsigned long usage =
			h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
			   usage - nr_pages);
	}
}

void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
				   struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}

void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
					struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}

static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
					     struct hugetlb_cgroup *h_cg,
					     bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
					 struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
				     unsigned long end)
{
	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
	    !resv->css)
		return;

	page_counter_uncharge(resv->reservation_counter,
			      (end - start) * resv->pages_per_hpage);
	css_put(resv->css);
}

void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
					 struct file_region *rg,
					 unsigned long nr_pages,
					 bool region_del)
{
	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
		return;

	if (rg->reservation_counter && resv->pages_per_hpage &&
	    !resv->reservation_counter) {
		page_counter_uncharge(rg->reservation_counter,
				      nr_pages * resv->pages_per_hpage);
		/*
		 * Only do css_put(rg->css) when we delete the entire region
		 * because one file_region must hold exactly one css reference.
		 */
		if (region_del)
			css_put(rg->css);
	}
}
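
/*
 * The RES_* values below are packed into cftype->private together with the
 * hstate index via MEMFILE_PRIVATE().  For example, the reservation limit
 * file of hstate 1 carries MEMFILE_PRIVATE(1, RES_RSVD_LIMIT), i.e.
 * (1 << 16) | 3, which MEMFILE_IDX()/MEMFILE_ATTR() later split apart in
 * the read and write handlers.
 */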
enum {
	RES_USAGE,
	RES_RSVD_USAGE,
	RES_LIMIT,
	RES_RSVD_LIMIT,
	RES_MAX_USAGE,
	RES_RSVD_MAX_USAGE,
	RES_FAILCNT,
	RES_RSVD_FAILCNT,
};

static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
	int nid;
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	bool legacy = MEMFILE_ATTR(cft->private);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cgroup_subsys_state *css;
	unsigned long usage;

	if (legacy) {
		/* Add up usage across all nodes for the non-hierarchical total. */
		usage = 0;
		for_each_node_state(nid, N_MEMORY)
			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

		/* Simply print the per-node usage for the non-hierarchical total. */
		for_each_node_state(nid, N_MEMORY)
			seq_printf(seq, " N%d=%lu", nid,
				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
					   PAGE_SIZE);
		seq_putc(seq, '\n');
	}

	/*
	 * The hierarchical total is pretty much the value recorded by the
	 * counter, so use that.
	 */
	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

	/*
	 * For each node, traverse the css tree to obtain the hierarchical
	 * node usage.
	 */
	for_each_node_state(nid, N_MEMORY) {
		usage = 0;
		rcu_read_lock();
		css_for_each_descendant_pre(css, &h_cg->css) {
			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
						   ->nodeinfo[nid]
						   ->usage[idx]);
		}
		rcu_read_unlock();
		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
	}

	seq_putc(seq, '\n');

	return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct page_counter *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_RSVD_USAGE:
		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_RSVD_LIMIT:
		return (u64)rsvd_counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_RSVD_MAX_USAGE:
		return (u64)rsvd_counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_RSVD_FAILCNT:
		return rsvd_counter->failcnt;
	default:
		BUG();
	}
}
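
/*
 * seq_show handler for the cgroup v2 (default hierarchy) files.  Values are
 * reported in bytes; a limit that still equals the rounded-down
 * PAGE_COUNTER_MAX default is printed as the literal string "max", so e.g.
 * reading hugetlb.2MB.max on an unlimited cgroup yields "max" while a
 * configured limit is shown as a byte count.
 */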
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	limit = round_down(PAGE_COUNTER_MAX,
			   pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_RSVD_USAGE:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_RSVD_LIMIT:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
	bool rsvd = false;

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_RSVD_LIMIT:
		rsvd = true;
		fallthrough;
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(
			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
			nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter, *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_RSVD_MAX_USAGE:
		page_counter_reset_watermark(rsvd_counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	case RES_RSVD_FAILCNT:
		rsvd_counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= SZ_1G)
		snprintf(buf, size, "%luGB", hsize / SZ_1G);
	else if (hsize >= SZ_1M)
		snprintf(buf, size, "%luMB", hsize / SZ_1M);
	else
		snprintf(buf, size, "%luKB", hsize / SZ_1K);
	return buf;
}

static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	int idx;
	long max;
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}
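
/*
 * hugetlb.<size>.events reports the hierarchical "max" event count, which
 * hugetlb_event() bumps on the failing cgroup and each of its ancestors,
 * while the .events.local variant below only counts events raised against
 * this cgroup's own limit.
 */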
static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}

static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_dfl[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_dfl[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current usage file */
	cft = &h->cgroup_files_dfl[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current reservation usage file */
	cft = &h->cgroup_files_dfl[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events file */
	cft = &h->cgroup_files_dfl[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events.local file */
	cft = &h->cgroup_files_dfl[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_local_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup,
				    events_local_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the numa stat file */
	cft = &h->cgroup_files_dfl[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_dfl[7];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
				       h->cgroup_files_dfl));
}
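
/*
 * Same registration for the cgroup v1 (legacy) hierarchy.  The cgroup core
 * prefixes the controller name, so for a 2MB hstate this creates files such
 * as hugetlb.2MB.limit_in_bytes, hugetlb.2MB.usage_in_bytes and
 * hugetlb.2MB.failcnt, plus the rsvd. variants added for reservations.
 */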
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_legacy[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_legacy[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the usage file */
	cft = &h->cgroup_files_legacy[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation usage file */
	cft = &h->cgroup_files_legacy[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files_legacy[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX reservation usage file */
	cft = &h->cgroup_files_legacy[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files_legacy[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation failcnt file */
	cft = &h->cgroup_files_legacy[7];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the numa stat file */
	cft = &h->cgroup_files_legacy[8];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 1);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_legacy[9];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	__hugetlb_cgroup_file_dfl_init(idx);
	__hugetlb_cgroup_file_legacy_init(idx);
}
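
/*
 * Register both the v2 and v1 control files for every hstate that is large
 * enough to carry cgroup information (see the HUGETLB_CGROUP_MIN_ORDER check
 * below).  Called once during boot, after the hstates have been set up.
 */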
void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
	struct hugetlb_cgroup *h_cg;
	struct hugetlb_cgroup *h_cg_rsvd;
	struct hstate *h = folio_hstate(old_folio);

	if (hugetlb_cgroup_disabled())
		return;

	spin_lock_irq(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_folio(old_folio);
	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
	set_hugetlb_cgroup(old_folio, NULL);
	set_hugetlb_cgroup_rsvd(old_folio, NULL);

	/* move the h_cg details to the new folio */
	set_hugetlb_cgroup(new_folio, h_cg);
	set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
	list_move(&new_folio->lru, &h->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
	return;
}

static struct cftype hugetlb_files[] = {
	{} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
	.dfl_cftypes	= hugetlb_files,
	.legacy_cftypes	= hugetlb_files,
};