1 /* 2 * 3 * Copyright IBM Corporation, 2012 4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 5 * 6 * Cgroup v2 7 * Copyright (C) 2019 Red Hat, Inc. 8 * Author: Giuseppe Scrivano <gscrivan@redhat.com> 9 * 10 * This program is free software; you can redistribute it and/or modify it 11 * under the terms of version 2.1 of the GNU Lesser General Public License 12 * as published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it would be useful, but 15 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 * 18 */ 19 20 #include <linux/cgroup.h> 21 #include <linux/page_counter.h> 22 #include <linux/slab.h> 23 #include <linux/hugetlb.h> 24 #include <linux/hugetlb_cgroup.h> 25 26 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 27 #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 28 #define MEMFILE_ATTR(val) ((val) & 0xffff) 29 30 static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 31 32 static inline struct page_counter * 33 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, 34 bool rsvd) 35 { 36 if (rsvd) 37 return &h_cg->rsvd_hugepage[idx]; 38 return &h_cg->hugepage[idx]; 39 } 40 41 static inline struct page_counter * 42 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) 43 { 44 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); 45 } 46 47 static inline struct page_counter * 48 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) 49 { 50 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); 51 } 52 53 static inline 54 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 55 { 56 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; 57 } 58 59 static inline 60 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 61 { 62 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); 63 } 64 65 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 66 { 67 return (h_cg == root_h_cgroup); 68 } 69 70 static inline struct hugetlb_cgroup * 71 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) 72 { 73 return hugetlb_cgroup_from_css(h_cg->css.parent); 74 } 75 76 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) 77 { 78 struct hstate *h; 79 80 for_each_hstate(h) { 81 if (page_counter_read( 82 hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) 83 return true; 84 } 85 return false; 86 } 87 88 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, 89 struct hugetlb_cgroup *parent_h_cgroup) 90 { 91 int idx; 92 93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { 94 struct page_counter *fault_parent = NULL; 95 struct page_counter *rsvd_parent = NULL; 96 unsigned long limit; 97 int ret; 98 99 if (parent_h_cgroup) { 100 fault_parent = hugetlb_cgroup_counter_from_cgroup( 101 parent_h_cgroup, idx); 102 rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( 103 parent_h_cgroup, idx); 104 } 105 page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, 106 idx), 107 fault_parent); 108 page_counter_init( 109 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 110 rsvd_parent); 111 112 limit = round_down(PAGE_COUNTER_MAX, 113 pages_per_huge_page(&hstates[idx])); 114 115 ret = page_counter_set_max( 116 hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), 117 limit); 118 VM_BUG_ON(ret); 119 ret = page_counter_set_max( 120 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 121 limit); 122 VM_BUG_ON(ret); 123 } 124 } 125 126 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) 127 { 128 int node; 129 130 for_each_node(node) 131 kfree(h_cgroup->nodeinfo[node]); 132 kfree(h_cgroup); 133 } 134 135 static struct cgroup_subsys_state * 136 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 137 { 138 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); 139 struct hugetlb_cgroup *h_cgroup; 140 int node; 141 142 h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), 143 GFP_KERNEL); 144 145 if (!h_cgroup) 146 return ERR_PTR(-ENOMEM); 147 148 if (!parent_h_cgroup) 149 root_h_cgroup = h_cgroup; 150 151 /* 152 * TODO: this routine can waste much memory for nodes which will 153 * never be onlined. It's better to use memory hotplug callback 154 * function. 155 */ 156 for_each_node(node) { 157 /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ 158 int node_to_alloc = 159 node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; 160 h_cgroup->nodeinfo[node] = 161 kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), 162 GFP_KERNEL, node_to_alloc); 163 if (!h_cgroup->nodeinfo[node]) 164 goto fail_alloc_nodeinfo; 165 } 166 167 hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); 168 return &h_cgroup->css; 169 170 fail_alloc_nodeinfo: 171 hugetlb_cgroup_free(h_cgroup); 172 return ERR_PTR(-ENOMEM); 173 } 174 175 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) 176 { 177 hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); 178 } 179 180 /* 181 * Should be called with hugetlb_lock held. 182 * Since we are holding hugetlb_lock, pages cannot get moved from 183 * active list or uncharged from the cgroup, So no need to get 184 * page reference and test for page active here. This function 185 * cannot fail. 186 */ 187 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 188 struct page *page) 189 { 190 unsigned int nr_pages; 191 struct page_counter *counter; 192 struct hugetlb_cgroup *page_hcg; 193 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 194 struct folio *folio = page_folio(page); 195 196 page_hcg = hugetlb_cgroup_from_folio(folio); 197 /* 198 * We can have pages in active list without any cgroup 199 * ie, hugepage with less than 3 pages. We can safely 200 * ignore those pages. 201 */ 202 if (!page_hcg || page_hcg != h_cg) 203 goto out; 204 205 nr_pages = compound_nr(page); 206 if (!parent) { 207 parent = root_h_cgroup; 208 /* root has no limit */ 209 page_counter_charge(&parent->hugepage[idx], nr_pages); 210 } 211 counter = &h_cg->hugepage[idx]; 212 /* Take the pages off the local counter */ 213 page_counter_cancel(counter, nr_pages); 214 215 set_hugetlb_cgroup(folio, parent); 216 out: 217 return; 218 } 219 220 /* 221 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 222 * the parent cgroup. 223 */ 224 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) 225 { 226 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 227 struct hstate *h; 228 struct page *page; 229 230 do { 231 for_each_hstate(h) { 232 spin_lock_irq(&hugetlb_lock); 233 list_for_each_entry(page, &h->hugepage_activelist, lru) 234 hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); 235 236 spin_unlock_irq(&hugetlb_lock); 237 } 238 cond_resched(); 239 } while (hugetlb_cgroup_have_usage(h_cg)); 240 } 241 242 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, 243 enum hugetlb_memory_event event) 244 { 245 atomic_long_inc(&hugetlb->events_local[idx][event]); 246 cgroup_file_notify(&hugetlb->events_local_file[idx]); 247 248 do { 249 atomic_long_inc(&hugetlb->events[idx][event]); 250 cgroup_file_notify(&hugetlb->events_file[idx]); 251 } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && 252 !hugetlb_cgroup_is_root(hugetlb)); 253 } 254 255 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 256 struct hugetlb_cgroup **ptr, 257 bool rsvd) 258 { 259 int ret = 0; 260 struct page_counter *counter; 261 struct hugetlb_cgroup *h_cg = NULL; 262 263 if (hugetlb_cgroup_disabled()) 264 goto done; 265 /* 266 * We don't charge any cgroup if the compound page have less 267 * than 3 pages. 268 */ 269 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) 270 goto done; 271 again: 272 rcu_read_lock(); 273 h_cg = hugetlb_cgroup_from_task(current); 274 if (!css_tryget(&h_cg->css)) { 275 rcu_read_unlock(); 276 goto again; 277 } 278 rcu_read_unlock(); 279 280 if (!page_counter_try_charge( 281 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 282 nr_pages, &counter)) { 283 ret = -ENOMEM; 284 hugetlb_event(h_cg, idx, HUGETLB_MAX); 285 css_put(&h_cg->css); 286 goto done; 287 } 288 /* Reservations take a reference to the css because they do not get 289 * reparented. 290 */ 291 if (!rsvd) 292 css_put(&h_cg->css); 293 done: 294 *ptr = h_cg; 295 return ret; 296 } 297 298 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 299 struct hugetlb_cgroup **ptr) 300 { 301 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); 302 } 303 304 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, 305 struct hugetlb_cgroup **ptr) 306 { 307 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); 308 } 309 310 /* Should be called with hugetlb_lock held */ 311 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 312 struct hugetlb_cgroup *h_cg, 313 struct folio *folio, bool rsvd) 314 { 315 if (hugetlb_cgroup_disabled() || !h_cg) 316 return; 317 318 __set_hugetlb_cgroup(folio, h_cg, rsvd); 319 if (!rsvd) { 320 unsigned long usage = 321 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 322 /* 323 * This write is not atomic due to fetching usage and writing 324 * to it, but that's fine because we call this with 325 * hugetlb_lock held anyway. 326 */ 327 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 328 usage + nr_pages); 329 } 330 } 331 332 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 333 struct hugetlb_cgroup *h_cg, 334 struct page *page) 335 { 336 struct folio *folio = page_folio(page); 337 338 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); 339 } 340 341 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, 342 struct hugetlb_cgroup *h_cg, 343 struct page *page) 344 { 345 struct folio *folio = page_folio(page); 346 347 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); 348 } 349 350 /* 351 * Should be called with hugetlb_lock held 352 */ 353 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 354 struct folio *folio, bool rsvd) 355 { 356 struct hugetlb_cgroup *h_cg; 357 358 if (hugetlb_cgroup_disabled()) 359 return; 360 lockdep_assert_held(&hugetlb_lock); 361 h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); 362 if (unlikely(!h_cg)) 363 return; 364 __set_hugetlb_cgroup(folio, NULL, rsvd); 365 366 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 367 rsvd), 368 nr_pages); 369 370 if (rsvd) 371 css_put(&h_cg->css); 372 else { 373 unsigned long usage = 374 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 375 /* 376 * This write is not atomic due to fetching usage and writing 377 * to it, but that's fine because we call this with 378 * hugetlb_lock held anyway. 379 */ 380 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 381 usage - nr_pages); 382 } 383 } 384 385 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 386 struct folio *folio) 387 { 388 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); 389 } 390 391 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, 392 struct folio *folio) 393 { 394 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); 395 } 396 397 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 398 struct hugetlb_cgroup *h_cg, 399 bool rsvd) 400 { 401 if (hugetlb_cgroup_disabled() || !h_cg) 402 return; 403 404 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) 405 return; 406 407 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 408 rsvd), 409 nr_pages); 410 411 if (rsvd) 412 css_put(&h_cg->css); 413 } 414 415 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 416 struct hugetlb_cgroup *h_cg) 417 { 418 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); 419 } 420 421 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, 422 struct hugetlb_cgroup *h_cg) 423 { 424 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); 425 } 426 427 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, 428 unsigned long end) 429 { 430 if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || 431 !resv->css) 432 return; 433 434 page_counter_uncharge(resv->reservation_counter, 435 (end - start) * resv->pages_per_hpage); 436 css_put(resv->css); 437 } 438 439 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 440 struct file_region *rg, 441 unsigned long nr_pages, 442 bool region_del) 443 { 444 if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) 445 return; 446 447 if (rg->reservation_counter && resv->pages_per_hpage && 448 !resv->reservation_counter) { 449 page_counter_uncharge(rg->reservation_counter, 450 nr_pages * resv->pages_per_hpage); 451 /* 452 * Only do css_put(rg->css) when we delete the entire region 453 * because one file_region must hold exactly one css reference. 454 */ 455 if (region_del) 456 css_put(rg->css); 457 } 458 } 459 460 enum { 461 RES_USAGE, 462 RES_RSVD_USAGE, 463 RES_LIMIT, 464 RES_RSVD_LIMIT, 465 RES_MAX_USAGE, 466 RES_RSVD_MAX_USAGE, 467 RES_FAILCNT, 468 RES_RSVD_FAILCNT, 469 }; 470 471 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) 472 { 473 int nid; 474 struct cftype *cft = seq_cft(seq); 475 int idx = MEMFILE_IDX(cft->private); 476 bool legacy = MEMFILE_ATTR(cft->private); 477 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 478 struct cgroup_subsys_state *css; 479 unsigned long usage; 480 481 if (legacy) { 482 /* Add up usage across all nodes for the non-hierarchical total. */ 483 usage = 0; 484 for_each_node_state(nid, N_MEMORY) 485 usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); 486 seq_printf(seq, "total=%lu", usage * PAGE_SIZE); 487 488 /* Simply print the per-node usage for the non-hierarchical total. */ 489 for_each_node_state(nid, N_MEMORY) 490 seq_printf(seq, " N%d=%lu", nid, 491 READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * 492 PAGE_SIZE); 493 seq_putc(seq, '\n'); 494 } 495 496 /* 497 * The hierarchical total is pretty much the value recorded by the 498 * counter, so use that. 499 */ 500 seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", 501 page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); 502 503 /* 504 * For each node, transverse the css tree to obtain the hierarchical 505 * node usage. 506 */ 507 for_each_node_state(nid, N_MEMORY) { 508 usage = 0; 509 rcu_read_lock(); 510 css_for_each_descendant_pre(css, &h_cg->css) { 511 usage += READ_ONCE(hugetlb_cgroup_from_css(css) 512 ->nodeinfo[nid] 513 ->usage[idx]); 514 } 515 rcu_read_unlock(); 516 seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); 517 } 518 519 seq_putc(seq, '\n'); 520 521 return 0; 522 } 523 524 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 525 struct cftype *cft) 526 { 527 struct page_counter *counter; 528 struct page_counter *rsvd_counter; 529 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 530 531 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; 532 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; 533 534 switch (MEMFILE_ATTR(cft->private)) { 535 case RES_USAGE: 536 return (u64)page_counter_read(counter) * PAGE_SIZE; 537 case RES_RSVD_USAGE: 538 return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; 539 case RES_LIMIT: 540 return (u64)counter->max * PAGE_SIZE; 541 case RES_RSVD_LIMIT: 542 return (u64)rsvd_counter->max * PAGE_SIZE; 543 case RES_MAX_USAGE: 544 return (u64)counter->watermark * PAGE_SIZE; 545 case RES_RSVD_MAX_USAGE: 546 return (u64)rsvd_counter->watermark * PAGE_SIZE; 547 case RES_FAILCNT: 548 return counter->failcnt; 549 case RES_RSVD_FAILCNT: 550 return rsvd_counter->failcnt; 551 default: 552 BUG(); 553 } 554 } 555 556 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) 557 { 558 int idx; 559 u64 val; 560 struct cftype *cft = seq_cft(seq); 561 unsigned long limit; 562 struct page_counter *counter; 563 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 564 565 idx = MEMFILE_IDX(cft->private); 566 counter = &h_cg->hugepage[idx]; 567 568 limit = round_down(PAGE_COUNTER_MAX, 569 pages_per_huge_page(&hstates[idx])); 570 571 switch (MEMFILE_ATTR(cft->private)) { 572 case RES_RSVD_USAGE: 573 counter = &h_cg->rsvd_hugepage[idx]; 574 fallthrough; 575 case RES_USAGE: 576 val = (u64)page_counter_read(counter); 577 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 578 break; 579 case RES_RSVD_LIMIT: 580 counter = &h_cg->rsvd_hugepage[idx]; 581 fallthrough; 582 case RES_LIMIT: 583 val = (u64)counter->max; 584 if (val == limit) 585 seq_puts(seq, "max\n"); 586 else 587 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 588 break; 589 default: 590 BUG(); 591 } 592 593 return 0; 594 } 595 596 static DEFINE_MUTEX(hugetlb_limit_mutex); 597 598 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 599 char *buf, size_t nbytes, loff_t off, 600 const char *max) 601 { 602 int ret, idx; 603 unsigned long nr_pages; 604 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 605 bool rsvd = false; 606 607 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ 608 return -EINVAL; 609 610 buf = strstrip(buf); 611 ret = page_counter_memparse(buf, max, &nr_pages); 612 if (ret) 613 return ret; 614 615 idx = MEMFILE_IDX(of_cft(of)->private); 616 nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); 617 618 switch (MEMFILE_ATTR(of_cft(of)->private)) { 619 case RES_RSVD_LIMIT: 620 rsvd = true; 621 fallthrough; 622 case RES_LIMIT: 623 mutex_lock(&hugetlb_limit_mutex); 624 ret = page_counter_set_max( 625 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 626 nr_pages); 627 mutex_unlock(&hugetlb_limit_mutex); 628 break; 629 default: 630 ret = -EINVAL; 631 break; 632 } 633 return ret ?: nbytes; 634 } 635 636 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, 637 char *buf, size_t nbytes, loff_t off) 638 { 639 return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); 640 } 641 642 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, 643 char *buf, size_t nbytes, loff_t off) 644 { 645 return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); 646 } 647 648 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 649 char *buf, size_t nbytes, loff_t off) 650 { 651 int ret = 0; 652 struct page_counter *counter, *rsvd_counter; 653 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 654 655 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; 656 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; 657 658 switch (MEMFILE_ATTR(of_cft(of)->private)) { 659 case RES_MAX_USAGE: 660 page_counter_reset_watermark(counter); 661 break; 662 case RES_RSVD_MAX_USAGE: 663 page_counter_reset_watermark(rsvd_counter); 664 break; 665 case RES_FAILCNT: 666 counter->failcnt = 0; 667 break; 668 case RES_RSVD_FAILCNT: 669 rsvd_counter->failcnt = 0; 670 break; 671 default: 672 ret = -EINVAL; 673 break; 674 } 675 return ret ?: nbytes; 676 } 677 678 static char *mem_fmt(char *buf, int size, unsigned long hsize) 679 { 680 if (hsize >= SZ_1G) 681 snprintf(buf, size, "%luGB", hsize / SZ_1G); 682 else if (hsize >= SZ_1M) 683 snprintf(buf, size, "%luMB", hsize / SZ_1M); 684 else 685 snprintf(buf, size, "%luKB", hsize / SZ_1K); 686 return buf; 687 } 688 689 static int __hugetlb_events_show(struct seq_file *seq, bool local) 690 { 691 int idx; 692 long max; 693 struct cftype *cft = seq_cft(seq); 694 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 695 696 idx = MEMFILE_IDX(cft->private); 697 698 if (local) 699 max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); 700 else 701 max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); 702 703 seq_printf(seq, "max %lu\n", max); 704 705 return 0; 706 } 707 708 static int hugetlb_events_show(struct seq_file *seq, void *v) 709 { 710 return __hugetlb_events_show(seq, false); 711 } 712 713 static int hugetlb_events_local_show(struct seq_file *seq, void *v) 714 { 715 return __hugetlb_events_show(seq, true); 716 } 717 718 static void __init __hugetlb_cgroup_file_dfl_init(int idx) 719 { 720 char buf[32]; 721 struct cftype *cft; 722 struct hstate *h = &hstates[idx]; 723 724 /* format the size */ 725 mem_fmt(buf, sizeof(buf), huge_page_size(h)); 726 727 /* Add the limit file */ 728 cft = &h->cgroup_files_dfl[0]; 729 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); 730 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 731 cft->seq_show = hugetlb_cgroup_read_u64_max; 732 cft->write = hugetlb_cgroup_write_dfl; 733 cft->flags = CFTYPE_NOT_ON_ROOT; 734 735 /* Add the reservation limit file */ 736 cft = &h->cgroup_files_dfl[1]; 737 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf); 738 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); 739 cft->seq_show = hugetlb_cgroup_read_u64_max; 740 cft->write = hugetlb_cgroup_write_dfl; 741 cft->flags = CFTYPE_NOT_ON_ROOT; 742 743 /* Add the current usage file */ 744 cft = &h->cgroup_files_dfl[2]; 745 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); 746 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 747 cft->seq_show = hugetlb_cgroup_read_u64_max; 748 cft->flags = CFTYPE_NOT_ON_ROOT; 749 750 /* Add the current reservation usage file */ 751 cft = &h->cgroup_files_dfl[3]; 752 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf); 753 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); 754 cft->seq_show = hugetlb_cgroup_read_u64_max; 755 cft->flags = CFTYPE_NOT_ON_ROOT; 756 757 /* Add the events file */ 758 cft = &h->cgroup_files_dfl[4]; 759 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); 760 cft->private = MEMFILE_PRIVATE(idx, 0); 761 cft->seq_show = hugetlb_events_show; 762 cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); 763 cft->flags = CFTYPE_NOT_ON_ROOT; 764 765 /* Add the events.local file */ 766 cft = &h->cgroup_files_dfl[5]; 767 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); 768 cft->private = MEMFILE_PRIVATE(idx, 0); 769 cft->seq_show = hugetlb_events_local_show; 770 cft->file_offset = offsetof(struct hugetlb_cgroup, 771 events_local_file[idx]); 772 cft->flags = CFTYPE_NOT_ON_ROOT; 773 774 /* Add the numa stat file */ 775 cft = &h->cgroup_files_dfl[6]; 776 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); 777 cft->private = MEMFILE_PRIVATE(idx, 0); 778 cft->seq_show = hugetlb_cgroup_read_numa_stat; 779 cft->flags = CFTYPE_NOT_ON_ROOT; 780 781 /* NULL terminate the last cft */ 782 cft = &h->cgroup_files_dfl[7]; 783 memset(cft, 0, sizeof(*cft)); 784 785 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, 786 h->cgroup_files_dfl)); 787 } 788 789 static void __init __hugetlb_cgroup_file_legacy_init(int idx) 790 { 791 char buf[32]; 792 struct cftype *cft; 793 struct hstate *h = &hstates[idx]; 794 795 /* format the size */ 796 mem_fmt(buf, sizeof(buf), huge_page_size(h)); 797 798 /* Add the limit file */ 799 cft = &h->cgroup_files_legacy[0]; 800 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); 801 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); 802 cft->read_u64 = hugetlb_cgroup_read_u64; 803 cft->write = hugetlb_cgroup_write_legacy; 804 805 /* Add the reservation limit file */ 806 cft = &h->cgroup_files_legacy[1]; 807 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf); 808 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); 809 cft->read_u64 = hugetlb_cgroup_read_u64; 810 cft->write = hugetlb_cgroup_write_legacy; 811 812 /* Add the usage file */ 813 cft = &h->cgroup_files_legacy[2]; 814 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); 815 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); 816 cft->read_u64 = hugetlb_cgroup_read_u64; 817 818 /* Add the reservation usage file */ 819 cft = &h->cgroup_files_legacy[3]; 820 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf); 821 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); 822 cft->read_u64 = hugetlb_cgroup_read_u64; 823 824 /* Add the MAX usage file */ 825 cft = &h->cgroup_files_legacy[4]; 826 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); 827 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); 828 cft->write = hugetlb_cgroup_reset; 829 cft->read_u64 = hugetlb_cgroup_read_u64; 830 831 /* Add the MAX reservation usage file */ 832 cft = &h->cgroup_files_legacy[5]; 833 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf); 834 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); 835 cft->write = hugetlb_cgroup_reset; 836 cft->read_u64 = hugetlb_cgroup_read_u64; 837 838 /* Add the failcntfile */ 839 cft = &h->cgroup_files_legacy[6]; 840 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); 841 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); 842 cft->write = hugetlb_cgroup_reset; 843 cft->read_u64 = hugetlb_cgroup_read_u64; 844 845 /* Add the reservation failcntfile */ 846 cft = &h->cgroup_files_legacy[7]; 847 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf); 848 cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT); 849 cft->write = hugetlb_cgroup_reset; 850 cft->read_u64 = hugetlb_cgroup_read_u64; 851 852 /* Add the numa stat file */ 853 cft = &h->cgroup_files_legacy[8]; 854 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); 855 cft->private = MEMFILE_PRIVATE(idx, 1); 856 cft->seq_show = hugetlb_cgroup_read_numa_stat; 857 858 /* NULL terminate the last cft */ 859 cft = &h->cgroup_files_legacy[9]; 860 memset(cft, 0, sizeof(*cft)); 861 862 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, 863 h->cgroup_files_legacy)); 864 } 865 866 static void __init __hugetlb_cgroup_file_init(int idx) 867 { 868 __hugetlb_cgroup_file_dfl_init(idx); 869 __hugetlb_cgroup_file_legacy_init(idx); 870 } 871 872 void __init hugetlb_cgroup_file_init(void) 873 { 874 struct hstate *h; 875 876 for_each_hstate(h) { 877 /* 878 * Add cgroup control files only if the huge page consists 879 * of more than two normal pages. This is because we use 880 * page[2].private for storing cgroup details. 881 */ 882 if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) 883 __hugetlb_cgroup_file_init(hstate_index(h)); 884 } 885 } 886 887 /* 888 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen 889 * when we migrate hugepages 890 */ 891 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) 892 { 893 struct hugetlb_cgroup *h_cg; 894 struct hugetlb_cgroup *h_cg_rsvd; 895 struct hstate *h = folio_hstate(old_folio); 896 897 if (hugetlb_cgroup_disabled()) 898 return; 899 900 spin_lock_irq(&hugetlb_lock); 901 h_cg = hugetlb_cgroup_from_folio(old_folio); 902 h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); 903 set_hugetlb_cgroup(old_folio, NULL); 904 set_hugetlb_cgroup_rsvd(old_folio, NULL); 905 906 /* move the h_cg details to new cgroup */ 907 set_hugetlb_cgroup(new_folio, h_cg); 908 set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); 909 list_move(&new_folio->lru, &h->hugepage_activelist); 910 spin_unlock_irq(&hugetlb_lock); 911 return; 912 } 913 914 static struct cftype hugetlb_files[] = { 915 {} /* terminate */ 916 }; 917 918 struct cgroup_subsys hugetlb_cgrp_subsys = { 919 .css_alloc = hugetlb_cgroup_css_alloc, 920 .css_offline = hugetlb_cgroup_css_offline, 921 .css_free = hugetlb_cgroup_css_free, 922 .dfl_cftypes = hugetlb_files, 923 .legacy_cftypes = hugetlb_files, 924 }; 925