/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];
};

/*
 * A cftype's private value packs the hstate index into the upper 16 bits
 * and the resource attribute (RES_*) into the lower 16 bits.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(&h_cg->hugepage[idx]))
			return true;
	}
	return false;
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int idx;

	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (parent_h_cgroup) {
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx],
					  &parent_h_cgroup->hugepage[idx]);
	} else {
		root_h_cgroup = h_cgroup;
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx], NULL);
	}
	return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = hugetlb_cgroup_from_css(css);
	kfree(h_cgroup);
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved from the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or test whether the page is active here. This
 * function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list without any cgroup,
	 * i.e., hugepages with less than 3 pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = 1 << compound_order(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx;

	do {
		/* rescan the hstates from index 0 on every pass */
		idx = 0;
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has less
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget_online(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
	css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	set_hugetlb_cgroup(page, h_cg);
	return;
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(page);
	if (unlikely(!h_cg))
		return;
	set_hugetlb_cgroup(page, NULL);
	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	default:
		BUG();
	}
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write;

	/* Add the usage file */
	cft = &h->cgroup_files[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files));
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].lru.next for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages.
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
	spin_lock(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);

	/* move the h_cg details to new cgroup */
	set_hugetlb_cgroup(newhpage, h_cg);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	return;
}

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
};
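
/*
 * Illustrative usage sketch (editor's note, not part of the original file):
 * the legacy (cgroup v1) control files built by __hugetlb_cgroup_file_init()
 * are exposed as "hugetlb.<size>.<attr>". The mount point, cgroup name, and
 * 2MB page size below are assumptions and vary by system.
 *
 *   cd /sys/fs/cgroup/hugetlb/mygroup
 *   echo 1G > hugetlb.2MB.limit_in_bytes     # limit, parsed by page_counter_memparse()
 *   cat hugetlb.2MB.usage_in_bytes           # current usage, reported in bytes
 *   cat hugetlb.2MB.failcnt                  # number of failed charge attempts
 *   echo 0 > hugetlb.2MB.max_usage_in_bytes  # any write resets the watermark
 */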