1 /* 2 * 3 * Copyright IBM Corporation, 2012 4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 5 * 6 * Cgroup v2 7 * Copyright (C) 2019 Red Hat, Inc. 8 * Author: Giuseppe Scrivano <gscrivan@redhat.com> 9 * 10 * This program is free software; you can redistribute it and/or modify it 11 * under the terms of version 2.1 of the GNU Lesser General Public License 12 * as published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it would be useful, but 15 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 * 18 */ 19 20 #include <linux/cgroup.h> 21 #include <linux/page_counter.h> 22 #include <linux/slab.h> 23 #include <linux/hugetlb.h> 24 #include <linux/hugetlb_cgroup.h> 25 26 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 27 #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 28 #define MEMFILE_ATTR(val) ((val) & 0xffff) 29 30 /* Use t->m[0] to encode the offset */ 31 #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) 32 #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) 33 #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) 34 35 #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) 36 #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) 37 38 static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 39 static struct cftype *dfl_files; 40 static struct cftype *legacy_files; 41 42 static inline struct page_counter * 43 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, 44 bool rsvd) 45 { 46 if (rsvd) 47 return &h_cg->rsvd_hugepage[idx]; 48 return &h_cg->hugepage[idx]; 49 } 50 51 static inline struct page_counter * 52 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) 53 { 54 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); 55 } 56 57 static inline struct page_counter * 58 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) 59 { 60 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); 61 } 62 63 static inline 64 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 65 { 66 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; 67 } 68 69 static inline 70 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 71 { 72 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); 73 } 74 75 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 76 { 77 return (h_cg == root_h_cgroup); 78 } 79 80 static inline struct hugetlb_cgroup * 81 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) 82 { 83 return hugetlb_cgroup_from_css(h_cg->css.parent); 84 } 85 86 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) 87 { 88 struct hstate *h; 89 90 for_each_hstate(h) { 91 if (page_counter_read( 92 hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) 93 return true; 94 } 95 return false; 96 } 97 98 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, 99 struct hugetlb_cgroup *parent_h_cgroup) 100 { 101 int idx; 102 103 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { 104 struct page_counter *fault_parent = NULL; 105 struct page_counter *rsvd_parent = NULL; 106 unsigned long limit; 107 int ret; 108 109 if (parent_h_cgroup) { 110 fault_parent = hugetlb_cgroup_counter_from_cgroup( 111 parent_h_cgroup, idx); 112 rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( 113 parent_h_cgroup, idx); 114 } 115 page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, 116 idx), 117 fault_parent, false); 118 page_counter_init( 119 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 120 rsvd_parent, false); 121 122 limit = round_down(PAGE_COUNTER_MAX, 123 pages_per_huge_page(&hstates[idx])); 124 125 ret = page_counter_set_max( 126 hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), 127 limit); 128 VM_BUG_ON(ret); 129 ret = page_counter_set_max( 130 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 131 limit); 132 VM_BUG_ON(ret); 133 } 134 } 135 136 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) 137 { 138 int node; 139 140 for_each_node(node) 141 kfree(h_cgroup->nodeinfo[node]); 142 kfree(h_cgroup); 143 } 144 145 static struct cgroup_subsys_state * 146 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 147 { 148 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); 149 struct hugetlb_cgroup *h_cgroup; 150 int node; 151 152 h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), 153 GFP_KERNEL); 154 155 if (!h_cgroup) 156 return ERR_PTR(-ENOMEM); 157 158 if (!parent_h_cgroup) 159 root_h_cgroup = h_cgroup; 160 161 /* 162 * TODO: this routine can waste much memory for nodes which will 163 * never be onlined. It's better to use memory hotplug callback 164 * function. 165 */ 166 for_each_node(node) { 167 /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ 168 int node_to_alloc = 169 node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; 170 h_cgroup->nodeinfo[node] = 171 kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), 172 GFP_KERNEL, node_to_alloc); 173 if (!h_cgroup->nodeinfo[node]) 174 goto fail_alloc_nodeinfo; 175 } 176 177 hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); 178 return &h_cgroup->css; 179 180 fail_alloc_nodeinfo: 181 hugetlb_cgroup_free(h_cgroup); 182 return ERR_PTR(-ENOMEM); 183 } 184 185 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) 186 { 187 hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); 188 } 189 190 /* 191 * Should be called with hugetlb_lock held. 192 * Since we are holding hugetlb_lock, pages cannot get moved from 193 * active list or uncharged from the cgroup, So no need to get 194 * page reference and test for page active here. This function 195 * cannot fail. 196 */ 197 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 198 struct folio *folio) 199 { 200 unsigned int nr_pages; 201 struct page_counter *counter; 202 struct hugetlb_cgroup *hcg; 203 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 204 205 hcg = hugetlb_cgroup_from_folio(folio); 206 /* 207 * We can have pages in active list without any cgroup 208 * ie, hugepage with less than 3 pages. We can safely 209 * ignore those pages. 210 */ 211 if (!hcg || hcg != h_cg) 212 goto out; 213 214 nr_pages = folio_nr_pages(folio); 215 if (!parent) { 216 parent = root_h_cgroup; 217 /* root has no limit */ 218 page_counter_charge(&parent->hugepage[idx], nr_pages); 219 } 220 counter = &h_cg->hugepage[idx]; 221 /* Take the pages off the local counter */ 222 page_counter_cancel(counter, nr_pages); 223 224 set_hugetlb_cgroup(folio, parent); 225 out: 226 return; 227 } 228 229 /* 230 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 231 * the parent cgroup. 232 */ 233 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) 234 { 235 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 236 struct hstate *h; 237 struct folio *folio; 238 239 do { 240 for_each_hstate(h) { 241 spin_lock_irq(&hugetlb_lock); 242 list_for_each_entry(folio, &h->hugepage_activelist, lru) 243 hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); 244 245 spin_unlock_irq(&hugetlb_lock); 246 } 247 cond_resched(); 248 } while (hugetlb_cgroup_have_usage(h_cg)); 249 } 250 251 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, 252 enum hugetlb_memory_event event) 253 { 254 atomic_long_inc(&hugetlb->events_local[idx][event]); 255 cgroup_file_notify(&hugetlb->events_local_file[idx]); 256 257 do { 258 atomic_long_inc(&hugetlb->events[idx][event]); 259 cgroup_file_notify(&hugetlb->events_file[idx]); 260 } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && 261 !hugetlb_cgroup_is_root(hugetlb)); 262 } 263 264 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 265 struct hugetlb_cgroup **ptr, 266 bool rsvd) 267 { 268 int ret = 0; 269 struct page_counter *counter; 270 struct hugetlb_cgroup *h_cg = NULL; 271 272 if (hugetlb_cgroup_disabled()) 273 goto done; 274 again: 275 rcu_read_lock(); 276 h_cg = hugetlb_cgroup_from_task(current); 277 if (!css_tryget(&h_cg->css)) { 278 rcu_read_unlock(); 279 goto again; 280 } 281 rcu_read_unlock(); 282 283 if (!page_counter_try_charge( 284 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 285 nr_pages, &counter)) { 286 ret = -ENOMEM; 287 hugetlb_event(h_cg, idx, HUGETLB_MAX); 288 css_put(&h_cg->css); 289 goto done; 290 } 291 /* Reservations take a reference to the css because they do not get 292 * reparented. 293 */ 294 if (!rsvd) 295 css_put(&h_cg->css); 296 done: 297 *ptr = h_cg; 298 return ret; 299 } 300 301 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 302 struct hugetlb_cgroup **ptr) 303 { 304 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); 305 } 306 307 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, 308 struct hugetlb_cgroup **ptr) 309 { 310 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); 311 } 312 313 /* Should be called with hugetlb_lock held */ 314 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 315 struct hugetlb_cgroup *h_cg, 316 struct folio *folio, bool rsvd) 317 { 318 if (hugetlb_cgroup_disabled() || !h_cg) 319 return; 320 lockdep_assert_held(&hugetlb_lock); 321 __set_hugetlb_cgroup(folio, h_cg, rsvd); 322 if (!rsvd) { 323 unsigned long usage = 324 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 325 /* 326 * This write is not atomic due to fetching usage and writing 327 * to it, but that's fine because we call this with 328 * hugetlb_lock held anyway. 329 */ 330 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 331 usage + nr_pages); 332 } 333 } 334 335 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 336 struct hugetlb_cgroup *h_cg, 337 struct folio *folio) 338 { 339 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); 340 } 341 342 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, 343 struct hugetlb_cgroup *h_cg, 344 struct folio *folio) 345 { 346 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); 347 } 348 349 /* 350 * Should be called with hugetlb_lock held 351 */ 352 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 353 struct folio *folio, bool rsvd) 354 { 355 struct hugetlb_cgroup *h_cg; 356 357 if (hugetlb_cgroup_disabled()) 358 return; 359 lockdep_assert_held(&hugetlb_lock); 360 h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); 361 if (unlikely(!h_cg)) 362 return; 363 __set_hugetlb_cgroup(folio, NULL, rsvd); 364 365 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 366 rsvd), 367 nr_pages); 368 369 if (rsvd) 370 css_put(&h_cg->css); 371 else { 372 unsigned long usage = 373 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 374 /* 375 * This write is not atomic due to fetching usage and writing 376 * to it, but that's fine because we call this with 377 * hugetlb_lock held anyway. 378 */ 379 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 380 usage - nr_pages); 381 } 382 } 383 384 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 385 struct folio *folio) 386 { 387 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); 388 } 389 390 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, 391 struct folio *folio) 392 { 393 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); 394 } 395 396 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 397 struct hugetlb_cgroup *h_cg, 398 bool rsvd) 399 { 400 if (hugetlb_cgroup_disabled() || !h_cg) 401 return; 402 403 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 404 rsvd), 405 nr_pages); 406 407 if (rsvd) 408 css_put(&h_cg->css); 409 } 410 411 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 412 struct hugetlb_cgroup *h_cg) 413 { 414 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); 415 } 416 417 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, 418 struct hugetlb_cgroup *h_cg) 419 { 420 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); 421 } 422 423 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, 424 unsigned long end) 425 { 426 if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || 427 !resv->css) 428 return; 429 430 page_counter_uncharge(resv->reservation_counter, 431 (end - start) * resv->pages_per_hpage); 432 css_put(resv->css); 433 } 434 435 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 436 struct file_region *rg, 437 unsigned long nr_pages, 438 bool region_del) 439 { 440 if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) 441 return; 442 443 if (rg->reservation_counter && resv->pages_per_hpage && 444 !resv->reservation_counter) { 445 page_counter_uncharge(rg->reservation_counter, 446 nr_pages * resv->pages_per_hpage); 447 /* 448 * Only do css_put(rg->css) when we delete the entire region 449 * because one file_region must hold exactly one css reference. 450 */ 451 if (region_del) 452 css_put(rg->css); 453 } 454 } 455 456 enum { 457 RES_USAGE, 458 RES_RSVD_USAGE, 459 RES_LIMIT, 460 RES_RSVD_LIMIT, 461 RES_MAX_USAGE, 462 RES_RSVD_MAX_USAGE, 463 RES_FAILCNT, 464 RES_RSVD_FAILCNT, 465 }; 466 467 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) 468 { 469 int nid; 470 struct cftype *cft = seq_cft(seq); 471 int idx = MEMFILE_IDX(cft->private); 472 bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); 473 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 474 struct cgroup_subsys_state *css; 475 unsigned long usage; 476 477 if (legacy) { 478 /* Add up usage across all nodes for the non-hierarchical total. */ 479 usage = 0; 480 for_each_node_state(nid, N_MEMORY) 481 usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); 482 seq_printf(seq, "total=%lu", usage * PAGE_SIZE); 483 484 /* Simply print the per-node usage for the non-hierarchical total. */ 485 for_each_node_state(nid, N_MEMORY) 486 seq_printf(seq, " N%d=%lu", nid, 487 READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * 488 PAGE_SIZE); 489 seq_putc(seq, '\n'); 490 } 491 492 /* 493 * The hierarchical total is pretty much the value recorded by the 494 * counter, so use that. 495 */ 496 seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", 497 page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); 498 499 /* 500 * For each node, transverse the css tree to obtain the hierarchical 501 * node usage. 502 */ 503 for_each_node_state(nid, N_MEMORY) { 504 usage = 0; 505 rcu_read_lock(); 506 css_for_each_descendant_pre(css, &h_cg->css) { 507 usage += READ_ONCE(hugetlb_cgroup_from_css(css) 508 ->nodeinfo[nid] 509 ->usage[idx]); 510 } 511 rcu_read_unlock(); 512 seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); 513 } 514 515 seq_putc(seq, '\n'); 516 517 return 0; 518 } 519 520 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 521 struct cftype *cft) 522 { 523 struct page_counter *counter; 524 struct page_counter *rsvd_counter; 525 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 526 527 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; 528 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; 529 530 switch (MEMFILE_ATTR(cft->private)) { 531 case RES_USAGE: 532 return (u64)page_counter_read(counter) * PAGE_SIZE; 533 case RES_RSVD_USAGE: 534 return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; 535 case RES_LIMIT: 536 return (u64)counter->max * PAGE_SIZE; 537 case RES_RSVD_LIMIT: 538 return (u64)rsvd_counter->max * PAGE_SIZE; 539 case RES_MAX_USAGE: 540 return (u64)counter->watermark * PAGE_SIZE; 541 case RES_RSVD_MAX_USAGE: 542 return (u64)rsvd_counter->watermark * PAGE_SIZE; 543 case RES_FAILCNT: 544 return counter->failcnt; 545 case RES_RSVD_FAILCNT: 546 return rsvd_counter->failcnt; 547 default: 548 BUG(); 549 } 550 } 551 552 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) 553 { 554 int idx; 555 u64 val; 556 struct cftype *cft = seq_cft(seq); 557 unsigned long limit; 558 struct page_counter *counter; 559 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 560 561 idx = MEMFILE_IDX(cft->private); 562 counter = &h_cg->hugepage[idx]; 563 564 limit = round_down(PAGE_COUNTER_MAX, 565 pages_per_huge_page(&hstates[idx])); 566 567 switch (MEMFILE_ATTR(cft->private)) { 568 case RES_RSVD_USAGE: 569 counter = &h_cg->rsvd_hugepage[idx]; 570 fallthrough; 571 case RES_USAGE: 572 val = (u64)page_counter_read(counter); 573 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 574 break; 575 case RES_RSVD_LIMIT: 576 counter = &h_cg->rsvd_hugepage[idx]; 577 fallthrough; 578 case RES_LIMIT: 579 val = (u64)counter->max; 580 if (val == limit) 581 seq_puts(seq, "max\n"); 582 else 583 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 584 break; 585 default: 586 BUG(); 587 } 588 589 return 0; 590 } 591 592 static DEFINE_MUTEX(hugetlb_limit_mutex); 593 594 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 595 char *buf, size_t nbytes, loff_t off, 596 const char *max) 597 { 598 int ret, idx; 599 unsigned long nr_pages; 600 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 601 bool rsvd = false; 602 603 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ 604 return -EINVAL; 605 606 buf = strstrip(buf); 607 ret = page_counter_memparse(buf, max, &nr_pages); 608 if (ret) 609 return ret; 610 611 idx = MEMFILE_IDX(of_cft(of)->private); 612 nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); 613 614 switch (MEMFILE_ATTR(of_cft(of)->private)) { 615 case RES_RSVD_LIMIT: 616 rsvd = true; 617 fallthrough; 618 case RES_LIMIT: 619 mutex_lock(&hugetlb_limit_mutex); 620 ret = page_counter_set_max( 621 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 622 nr_pages); 623 mutex_unlock(&hugetlb_limit_mutex); 624 break; 625 default: 626 ret = -EINVAL; 627 break; 628 } 629 return ret ?: nbytes; 630 } 631 632 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, 633 char *buf, size_t nbytes, loff_t off) 634 { 635 return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); 636 } 637 638 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, 639 char *buf, size_t nbytes, loff_t off) 640 { 641 return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); 642 } 643 644 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 645 char *buf, size_t nbytes, loff_t off) 646 { 647 int ret = 0; 648 struct page_counter *counter, *rsvd_counter; 649 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 650 651 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; 652 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; 653 654 switch (MEMFILE_ATTR(of_cft(of)->private)) { 655 case RES_MAX_USAGE: 656 page_counter_reset_watermark(counter); 657 break; 658 case RES_RSVD_MAX_USAGE: 659 page_counter_reset_watermark(rsvd_counter); 660 break; 661 case RES_FAILCNT: 662 counter->failcnt = 0; 663 break; 664 case RES_RSVD_FAILCNT: 665 rsvd_counter->failcnt = 0; 666 break; 667 default: 668 ret = -EINVAL; 669 break; 670 } 671 return ret ?: nbytes; 672 } 673 674 static char *mem_fmt(char *buf, int size, unsigned long hsize) 675 { 676 if (hsize >= SZ_1G) 677 snprintf(buf, size, "%luGB", hsize / SZ_1G); 678 else if (hsize >= SZ_1M) 679 snprintf(buf, size, "%luMB", hsize / SZ_1M); 680 else 681 snprintf(buf, size, "%luKB", hsize / SZ_1K); 682 return buf; 683 } 684 685 static int __hugetlb_events_show(struct seq_file *seq, bool local) 686 { 687 int idx; 688 long max; 689 struct cftype *cft = seq_cft(seq); 690 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 691 692 idx = MEMFILE_IDX(cft->private); 693 694 if (local) 695 max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); 696 else 697 max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); 698 699 seq_printf(seq, "max %lu\n", max); 700 701 return 0; 702 } 703 704 static int hugetlb_events_show(struct seq_file *seq, void *v) 705 { 706 return __hugetlb_events_show(seq, false); 707 } 708 709 static int hugetlb_events_local_show(struct seq_file *seq, void *v) 710 { 711 return __hugetlb_events_show(seq, true); 712 } 713 714 static struct cftype hugetlb_dfl_tmpl[] = { 715 { 716 .name = "max", 717 .private = RES_LIMIT, 718 .seq_show = hugetlb_cgroup_read_u64_max, 719 .write = hugetlb_cgroup_write_dfl, 720 .flags = CFTYPE_NOT_ON_ROOT, 721 }, 722 { 723 .name = "rsvd.max", 724 .private = RES_RSVD_LIMIT, 725 .seq_show = hugetlb_cgroup_read_u64_max, 726 .write = hugetlb_cgroup_write_dfl, 727 .flags = CFTYPE_NOT_ON_ROOT, 728 }, 729 { 730 .name = "current", 731 .private = RES_USAGE, 732 .seq_show = hugetlb_cgroup_read_u64_max, 733 .flags = CFTYPE_NOT_ON_ROOT, 734 }, 735 { 736 .name = "rsvd.current", 737 .private = RES_RSVD_USAGE, 738 .seq_show = hugetlb_cgroup_read_u64_max, 739 .flags = CFTYPE_NOT_ON_ROOT, 740 }, 741 { 742 .name = "events", 743 .seq_show = hugetlb_events_show, 744 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), 745 .flags = CFTYPE_NOT_ON_ROOT, 746 }, 747 { 748 .name = "events.local", 749 .seq_show = hugetlb_events_local_show, 750 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), 751 .flags = CFTYPE_NOT_ON_ROOT, 752 }, 753 { 754 .name = "numa_stat", 755 .seq_show = hugetlb_cgroup_read_numa_stat, 756 .flags = CFTYPE_NOT_ON_ROOT, 757 }, 758 /* don't need terminator here */ 759 }; 760 761 static struct cftype hugetlb_legacy_tmpl[] = { 762 { 763 .name = "limit_in_bytes", 764 .private = RES_LIMIT, 765 .read_u64 = hugetlb_cgroup_read_u64, 766 .write = hugetlb_cgroup_write_legacy, 767 }, 768 { 769 .name = "rsvd.limit_in_bytes", 770 .private = RES_RSVD_LIMIT, 771 .read_u64 = hugetlb_cgroup_read_u64, 772 .write = hugetlb_cgroup_write_legacy, 773 }, 774 { 775 .name = "usage_in_bytes", 776 .private = RES_USAGE, 777 .read_u64 = hugetlb_cgroup_read_u64, 778 }, 779 { 780 .name = "rsvd.usage_in_bytes", 781 .private = RES_RSVD_USAGE, 782 .read_u64 = hugetlb_cgroup_read_u64, 783 }, 784 { 785 .name = "max_usage_in_bytes", 786 .private = RES_MAX_USAGE, 787 .write = hugetlb_cgroup_reset, 788 .read_u64 = hugetlb_cgroup_read_u64, 789 }, 790 { 791 .name = "rsvd.max_usage_in_bytes", 792 .private = RES_RSVD_MAX_USAGE, 793 .write = hugetlb_cgroup_reset, 794 .read_u64 = hugetlb_cgroup_read_u64, 795 }, 796 { 797 .name = "failcnt", 798 .private = RES_FAILCNT, 799 .write = hugetlb_cgroup_reset, 800 .read_u64 = hugetlb_cgroup_read_u64, 801 }, 802 { 803 .name = "rsvd.failcnt", 804 .private = RES_RSVD_FAILCNT, 805 .write = hugetlb_cgroup_reset, 806 .read_u64 = hugetlb_cgroup_read_u64, 807 }, 808 { 809 .name = "numa_stat", 810 .seq_show = hugetlb_cgroup_read_numa_stat, 811 }, 812 /* don't need terminator here */ 813 }; 814 815 static void __init 816 hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, 817 struct cftype *tmpl, int tmpl_size) 818 { 819 char buf[32]; 820 int i, idx = hstate_index(h); 821 822 /* format the size */ 823 mem_fmt(buf, sizeof(buf), huge_page_size(h)); 824 825 for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { 826 *cft = *tmpl; 827 /* rebuild the name */ 828 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); 829 /* rebuild the private */ 830 cft->private = MEMFILE_PRIVATE(idx, tmpl->private); 831 /* rebuild the file_offset */ 832 if (tmpl->file_offset) { 833 unsigned int offset = tmpl->file_offset; 834 835 cft->file_offset = MEMFILE_OFFSET0(offset) + 836 MEMFILE_FIELD_SIZE(offset) * idx; 837 } 838 839 lockdep_register_key(&cft->lockdep_key); 840 } 841 } 842 843 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) 844 { 845 int idx = hstate_index(h); 846 847 hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, 848 hugetlb_dfl_tmpl, DFL_TMPL_SIZE); 849 } 850 851 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) 852 { 853 int idx = hstate_index(h); 854 855 hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, 856 hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); 857 } 858 859 static void __init __hugetlb_cgroup_file_init(struct hstate *h) 860 { 861 __hugetlb_cgroup_file_dfl_init(h); 862 __hugetlb_cgroup_file_legacy_init(h); 863 } 864 865 static void __init __hugetlb_cgroup_file_pre_init(void) 866 { 867 int cft_count; 868 869 cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ 870 dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 871 BUG_ON(!dfl_files); 872 cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ 873 legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 874 BUG_ON(!legacy_files); 875 } 876 877 static void __init __hugetlb_cgroup_file_post_init(void) 878 { 879 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, 880 dfl_files)); 881 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, 882 legacy_files)); 883 } 884 885 void __init hugetlb_cgroup_file_init(void) 886 { 887 struct hstate *h; 888 889 __hugetlb_cgroup_file_pre_init(); 890 for_each_hstate(h) 891 __hugetlb_cgroup_file_init(h); 892 __hugetlb_cgroup_file_post_init(); 893 } 894 895 /* 896 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen 897 * when we migrate hugepages 898 */ 899 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) 900 { 901 struct hugetlb_cgroup *h_cg; 902 struct hugetlb_cgroup *h_cg_rsvd; 903 struct hstate *h = folio_hstate(old_folio); 904 905 if (hugetlb_cgroup_disabled()) 906 return; 907 908 spin_lock_irq(&hugetlb_lock); 909 h_cg = hugetlb_cgroup_from_folio(old_folio); 910 h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); 911 set_hugetlb_cgroup(old_folio, NULL); 912 set_hugetlb_cgroup_rsvd(old_folio, NULL); 913 914 /* move the h_cg details to new cgroup */ 915 set_hugetlb_cgroup(new_folio, h_cg); 916 set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); 917 list_move(&new_folio->lru, &h->hugepage_activelist); 918 spin_unlock_irq(&hugetlb_lock); 919 } 920 921 static struct cftype hugetlb_files[] = { 922 {} /* terminate */ 923 }; 924 925 struct cgroup_subsys hugetlb_cgrp_subsys = { 926 .css_alloc = hugetlb_cgroup_css_alloc, 927 .css_offline = hugetlb_cgroup_css_offline, 928 .css_free = hugetlb_cgroup_css_free, 929 .dfl_cftypes = hugetlb_files, 930 .legacy_cftypes = hugetlb_files, 931 }; 932