#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

static bool page_owner_disabled = true;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;

static void init_early_allocated_pages(void);

static int early_page_owner_param(char *buf)
{
	if (!buf)
		return -EINVAL;

	if (strcmp(buf, "on") == 0)
		page_owner_disabled = false;

	return 0;
}
early_param("page_owner", early_page_owner_param);

static bool need_page_owner(void)
{
	if (page_owner_disabled)
		return false;

	return true;
}

static noinline void register_dummy_stack(void)
{
	unsigned long entries[4];
	struct stack_trace dummy;

	dummy.nr_entries = 0;
	dummy.max_entries = ARRAY_SIZE(entries);
	dummy.entries = &entries[0];
	dummy.skip = 0;

	save_stack_trace(&dummy);
	dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
}

static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	struct stack_trace failure;

	failure.nr_entries = 0;
	failure.max_entries = ARRAY_SIZE(entries);
	failure.entries = &entries[0];
	failure.skip = 0;

	save_stack_trace(&failure);
	failure_handle = depot_save_stack(&failure, GFP_KERNEL);
}

static void init_page_owner(void)
{
	if (page_owner_disabled)
		return;

	register_dummy_stack();
	register_failure_stack();
	static_branch_enable(&page_owner_inited);
	init_early_allocated_pages();
}

struct page_ext_operations page_owner_ops = {
	.need = need_page_owner,
	.init = init_page_owner,
};

void __reset_page_owner(struct page *page, unsigned int order)
{
	int i;
	struct page_ext *page_ext;

	for (i = 0; i < (1 << order); i++) {
		page_ext = lookup_page_ext(page + i);
		if (unlikely(!page_ext))
			continue;
		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
	}
}

static inline bool check_recursive_alloc(struct stack_trace *trace,
					unsigned long ip)
{
	int i, count;

	if (!trace->nr_entries)
		return false;

	for (i = 0, count = 0; i < trace->nr_entries; i++) {
		if (trace->entries[i] == ip && ++count == 2)
			return true;
	}

	return false;
}

static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};
	depot_stack_handle_t handle;

	save_stack_trace(&trace);
	if (trace.nr_entries != 0 &&
	    trace.entries[trace.nr_entries-1] == ULONG_MAX)
		trace.nr_entries--;

	/*
	 * We need to check for recursion here because our request to
	 * stackdepot could trigger a memory allocation to save the new
	 * entry. That allocation would reach this point and call
	 * depot_save_stack() again if we didn't catch it. Since there is
	 * still not enough memory in stackdepot, it would try to allocate
	 * memory again and loop forever.
	 */
	if (check_recursive_alloc(&trace, _RET_IP_))
		return dummy_handle;

	handle = depot_save_stack(&trace, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

noinline void __set_page_owner(struct page *page, unsigned int order,
					gfp_t gfp_mask)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->handle = save_stack(gfp_mask);
	page_ext->order = order;
	page_ext->gfp_mask = gfp_mask;
	page_ext->last_migrate_reason = -1;

	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}

void __set_page_owner_migrate_reason(struct page *page, int reason)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->last_migrate_reason = reason;
}

void __split_page_owner(struct page *page, unsigned int order)
{
	int i;
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return;

	page_ext->order = 0;
	for (i = 1; i < (1 << order); i++)
		__copy_page_owner(page, page + i);
}

void __copy_page_owner(struct page *oldpage, struct page *newpage)
{
	struct page_ext *old_ext = lookup_page_ext(oldpage);
	struct page_ext *new_ext = lookup_page_ext(newpage);

	if (unlikely(!old_ext || !new_ext))
		return;

	new_ext->order = old_ext->order;
	new_ext->gfp_mask = old_ext->gfp_mask;
	new_ext->last_migrate_reason = old_ext->last_migrate_reason;
	new_ext->handle = old_ext->handle;

	/*
	 * We don't clear the bit on the oldpage as it's going to be freed
	 * after migration. Until then, the info can be useful in case of
	 * a bug, and the overall stats will only be off a bit temporarily.
	 * Also, migrate_misplaced_transhuge_page() can still fail the
	 * migration and then we want the oldpage to retain the info. But
	 * in that case we also don't need to explicitly clear the info from
	 * the new page, which will be freed.
	 */
	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_ext *page_ext,
		depot_stack_handle_t handle)
{
	int ret;
	int pageblock_mt, page_mt;
	char *kbuf;
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};

	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = snprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg)\n",
			page_ext->order, page_ext->gfp_mask,
			&page_ext->gfp_mask);

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
	ret += snprintf(kbuf + ret, count - ret,
			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			page->flags, &page->flags);

	if (ret >= count)
		goto err;

	depot_fetch_stack(handle, &trace);
	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
	if (ret >= count)
		goto err;

	if (page_ext->last_migrate_reason != -1) {
		ret += snprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_ext->last_migrate_reason]);
		if (ret >= count)
			goto err;
	}

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

void __dump_page_owner(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	struct stack_trace trace = {
		.nr_entries = 0,
		.entries = entries,
		.max_entries = PAGE_OWNER_STACK_DEPTH,
		.skip = 0
	};
	depot_stack_handle_t handle;
	gfp_t gfp_mask;
	int mt;

	if (unlikely(!page_ext)) {
		pr_alert("There is no page extension available.\n");
		return;
	}
	gfp_mask = page_ext->gfp_mask;
	mt = gfpflags_to_migratetype(gfp_mask);

	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

	handle = READ_ONCE(page_ext->handle);
	if (!handle) {
		pr_alert("page_owner info is not active (free page?)\n");
		return;
	}

	depot_fetch_stack(handle, &trace);
	pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
		 page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
	print_stack_trace(&trace, 0);

	if (page_ext->last_migrate_reason != -1)
		pr_alert("page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_ext->last_migrate_reason]);
}

static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;
	depot_stack_handle_t handle;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	page = NULL;
	pfn = min_low_pfn + *ppos;

	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	drain_all_pages(NULL);

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * validate the area as existing, skip it if not
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		/* Check for holes within a MAX_ORDER area */
		if (!pfn_valid_within(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			if (freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = lookup_page_ext(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			continue;

		/*
		 * Access to page_ext->handle isn't synchronized, so we
		 * should be careful when accessing it.
		 */
		handle = READ_ONCE(page_ext->handle);
		if (!handle)
			continue;

		/* Record the next PFN to read in the file offset */
		*ppos = (pfn - min_low_pfn) + 1;

		return print_page_owner(buf, count, pfn, page,
				page_ext, handle);
	}

	return 0;
}

static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
	unsigned long end_pfn = pfn + zone->spanned_pages;
	unsigned long count = 0;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		page = pfn_to_page(pfn);

		for (; pfn < block_end_pfn; pfn++) {
			if (!pfn_valid_within(pfn))
				continue;

			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			/*
			 * It is safe to check the buddy flag and order here
			 * because this is the init stage and only a single
			 * thread runs.
			 */
			if (PageBuddy(page)) {
				pfn += (1UL << page_order(page)) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = lookup_page_ext(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				continue;

			/* Found early allocated page */
			set_page_owner(page, 0, 0);
			count++;
		}
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		pgdat->node_id, zone->name, count);
}

static void init_zones_in_node(pg_data_t *pgdat)
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		init_pages_in_zone(pgdat, zone);
		spin_unlock_irqrestore(&zone->lock, flags);
	}
}

static void init_early_allocated_pages(void)
{
	pg_data_t *pgdat;

	drain_all_pages(NULL);
	for_each_online_pgdat(pgdat)
		init_zones_in_node(pgdat);
}

static const struct file_operations proc_page_owner_operations = {
	.read = read_page_owner,
};

static int __init pageowner_init(void)
{
	struct dentry *dentry;

	if (!static_branch_unlikely(&page_owner_inited)) {
		pr_info("page_owner is disabled\n");
		return 0;
	}

	dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
				     NULL, &proc_page_owner_operations);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	return 0;
}
late_initcall(pageowner_init)
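
/*
 * Usage sketch: with CONFIG_PAGE_OWNER=y, boot with "page_owner=on" (handled
 * by early_page_owner_param() above) to enable tracking, then read the
 * debugfs file registered in pageowner_init() to dump the recorded
 * allocation stacks. Assuming debugfs is mounted at /sys/kernel/debug:
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 */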