// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

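/*
 * Per-page allocation metadata stored in page_ext: the allocating and
 * freeing stack depot handles, gfp mask, timestamps and task identity.
 */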
struct page_owner {
	unsigned short order;
	short last_migrate_reason;
	gfp_t gfp_mask;
	depot_stack_handle_t handle;
	depot_stack_handle_t free_handle;
	u64 ts_nsec;
	u64 free_ts_nsec;
	char comm[TASK_COMM_LEN];
	pid_t pid;
	pid_t tgid;
	pid_t free_pid;
	pid_t free_tgid;
};

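/*
 * Singly linked list of all stack traces that currently own pages.  New
 * entries are prepended under stack_list_lock; readers walk the list
 * locklessly via the acquire/release pairing in stack_start() and
 * add_stack_record_to_list().
 */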
struct stack {
	struct stack_record *stack_record;
	struct stack *next;
};
static struct stack dummy_stack;
static struct stack failure_stack;
static struct stack *stack_list;
static DEFINE_SPINLOCK(stack_list_lock);

#define STACK_PRINT_FLAG_STACK	0x1
#define STACK_PRINT_FLAG_PAGES	0x2
#define STACK_PRINT_FLAG_HANDLE	0x4

struct stack_print_ctx {
	struct stack *stack;
	u8 flags;
};

static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

static inline void set_current_in_page_owner(void)
{
	/*
	 * The page_owner code may itself need to allocate memory, so flag
	 * the current task to avoid recursing back into page_owner.
	 */
	current->in_page_owner = 1;
}

static inline void unset_current_in_page_owner(void)
{
	current->in_page_owner = 0;
}

static int __init early_page_owner_param(char *buf)
{
	int ret = kstrtobool(buf, &page_owner_enabled);

	if (page_owner_enabled)
		stack_depot_request_early_init();

	return ret;
}
early_param("page_owner", early_page_owner_param);

static __init bool need_page_owner(void)
{
	return page_owner_enabled;
}

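/*
 * Three canonical stack depot handles are registered at init time:
 * dummy_handle is returned when save_stack() would recurse,
 * failure_handle when saving a real trace fails, and early_handle is
 * used for pages allocated before page_owner was ready (see
 * init_pages_in_zone()).
 */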
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static noinline void register_dummy_stack(void)
{
	dummy_handle = create_dummy_stack();
}

static noinline void register_failure_stack(void)
{
	failure_handle = create_dummy_stack();
}

static noinline void register_early_stack(void)
{
	early_handle = create_dummy_stack();
}

static __init void init_page_owner(void)
{
	if (!page_owner_enabled)
		return;

	register_dummy_stack();
	register_failure_stack();
	register_early_stack();
	init_early_allocated_pages();
	/* Initialize dummy and failure stacks and link them to stack_list */
	dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
	failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
	if (dummy_stack.stack_record)
		refcount_set(&dummy_stack.stack_record->count, 1);
	if (failure_stack.stack_record)
		refcount_set(&failure_stack.stack_record->count, 1);
	dummy_stack.next = &failure_stack;
	stack_list = &dummy_stack;
	static_branch_enable(&page_owner_inited);
}

struct page_ext_operations page_owner_ops = {
	.size = sizeof(struct page_owner),
	.need = need_page_owner,
	.init = init_page_owner,
	.need_shared_flags = true,
};

static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
	return page_ext_data(page_ext, &page_owner_ops);
}

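/*
 * Save the current stack trace in the stack depot.  The recursion flag
 * set via set_current_in_page_owner() makes nested invocations return
 * dummy_handle, and a failed depot save falls back to failure_handle,
 * so callers always get a usable handle.
 */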
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	if (current->in_page_owner)
		return dummy_handle;

	set_current_in_page_owner();
	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;
	unset_current_in_page_owner();

	return handle;
}

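/*
 * Link a freshly seen stack_record into stack_list so it shows up in the
 * page_owner_stacks debugfs output.  Skipped when the allocation context
 * cannot spin, since both kmalloc() and stack_list_lock may spin.
 */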
static void add_stack_record_to_list(struct stack_record *stack_record,
				     gfp_t gfp_mask)
{
	unsigned long flags;
	struct stack *stack;

	if (!gfpflags_allow_spinning(gfp_mask))
		return;

	set_current_in_page_owner();
	stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask));
	if (!stack) {
		unset_current_in_page_owner();
		return;
	}
	unset_current_in_page_owner();

	stack->stack_record = stack_record;
	stack->next = NULL;

	spin_lock_irqsave(&stack_list_lock, flags);
	stack->next = stack_list;
	/*
	 * This pairs with smp_load_acquire() from function
	 * stack_start(). This guarantees that stack_start()
	 * will see an updated stack_list before starting to
	 * traverse the list.
	 */
	smp_store_release(&stack_list, stack);
	spin_unlock_irqrestore(&stack_list_lock, flags);
}

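/*
 * Charge nr_base_pages to the stack_record behind @handle.  A record
 * still at REFCOUNT_SATURATED has never been accounted before, so its
 * refcount is reset to 1 and it is added to stack_list first.
 */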
static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
				   int nr_base_pages)
{
	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

	if (!stack_record)
		return;

	/*
	 * New stack_records that do not use STACK_DEPOT_FLAG_GET start
	 * with REFCOUNT_SATURATED to catch spurious increments of their
	 * refcount.
	 * Since we do not use the STACK_DEPOT_FLAG_GET API, set a
	 * refcount of 1 ourselves.
	 */
	if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
		int old = REFCOUNT_SATURATED;

		if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
			/* Add the new stack_record to our list */
			add_stack_record_to_list(stack_record, gfp_mask);
	}
	refcount_add(nr_base_pages, &stack_record->count);
}

static void dec_stack_record_count(depot_stack_handle_t handle,
				   int nr_base_pages)
{
	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

	if (!stack_record)
		return;

	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
		pr_warn("%s: refcount went to 0 for handle %u\n", __func__,
			handle);
}

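/*
 * Write the allocation metadata into the page_owner of every page_ext
 * covering the 1 << order pages and mark them as tracked and allocated.
 */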
static inline void __update_page_owner_handle(struct page *page,
					      depot_stack_handle_t handle,
					      unsigned short order,
					      gfp_t gfp_mask,
					      short last_migrate_reason, u64 ts_nsec,
					      pid_t pid, pid_t tgid, char *comm)
{
	struct page_ext_iter iter;
	struct page_ext *page_ext;
	struct page_owner *page_owner;

	rcu_read_lock();
	for_each_page_ext(page, 1 << order, page_ext, iter) {
		page_owner = get_page_owner(page_ext);
		page_owner->handle = handle;
		page_owner->order = order;
		page_owner->gfp_mask = gfp_mask;
		page_owner->last_migrate_reason = last_migrate_reason;
		page_owner->pid = pid;
		page_owner->tgid = tgid;
		page_owner->ts_nsec = ts_nsec;
		strscpy(page_owner->comm, comm, sizeof(page_owner->comm));
		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
	}
	rcu_read_unlock();
}

static inline void __update_page_owner_free_handle(struct page *page,
						   depot_stack_handle_t handle,
						   unsigned short order,
						   pid_t pid, pid_t tgid,
						   u64 free_ts_nsec)
{
	struct page_ext_iter iter;
	struct page_ext *page_ext;
	struct page_owner *page_owner;

	rcu_read_lock();
	for_each_page_ext(page, 1 << order, page_ext, iter) {
		page_owner = get_page_owner(page_ext);
		/* Only __reset_page_owner() wants to clear the bit */
		if (handle) {
			__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
			page_owner->free_handle = handle;
		}
		page_owner->free_ts_nsec = free_ts_nsec;
		page_owner->free_pid = current->pid;
		page_owner->free_tgid = current->tgid;
	}
	rcu_read_unlock();
}

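/*
 * Called when pages are freed: record the freeing stack and timestamp,
 * and drop the pages from the allocating stack's accounting.
 */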
void __reset_page_owner(struct page *page, unsigned short order)
{
	struct page_ext *page_ext;
	depot_stack_handle_t handle;
	depot_stack_handle_t alloc_handle;
	struct page_owner *page_owner;
	u64 free_ts_nsec = local_clock();

	page_ext = page_ext_get(page);
	if (unlikely(!page_ext))
		return;

	page_owner = get_page_owner(page_ext);
	alloc_handle = page_owner->handle;
	page_ext_put(page_ext);

	/*
	 * Do not specify GFP_NOWAIT, so that gfpflags_allow_spinning() is
	 * false and stack_depot_save() avoids taking spinlocks.  This is
	 * similar to the alloc_pages_nolock() gfp flags, but is only used
	 * to tell stack_depot not to spin.
	 */
	handle = save_stack(__GFP_NOWARN);
	__update_page_owner_free_handle(page, handle, order, current->pid,
					current->tgid, free_ts_nsec);

	if (alloc_handle != early_handle)
		/*
		 * early_handle is set as the handle for all early allocated
		 * pages; see init_pages_in_zone().  Their refcount was never
		 * incremented because the machinery was not ready yet, so we
		 * must not decrement it either.
		 */
		dec_stack_record_count(alloc_handle, 1 << order);
}

noinline void __set_page_owner(struct page *page, unsigned short order,
			       gfp_t gfp_mask)
{
	u64 ts_nsec = local_clock();
	depot_stack_handle_t handle;

	handle = save_stack(gfp_mask);
	__update_page_owner_handle(page, handle, order, gfp_mask, -1,
				   ts_nsec, current->pid, current->tgid,
				   current->comm);
	inc_stack_record_count(handle, gfp_mask, 1 << order);
}

void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);
	struct page_owner *page_owner;

	if (unlikely(!page_ext))
		return;

	page_owner = get_page_owner(page_ext);
	page_owner->last_migrate_reason = reason;
	page_ext_put(page_ext);
}

void __split_page_owner(struct page *page, int old_order, int new_order)
{
	struct page_ext_iter iter;
	struct page_ext *page_ext;
	struct page_owner *page_owner;

	rcu_read_lock();
	for_each_page_ext(page, 1 << old_order, page_ext, iter) {
		page_owner = get_page_owner(page_ext);
		page_owner->order = new_order;
	}
	rcu_read_unlock();
}

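/*
 * On folio migration, copy the original allocation metadata to the new
 * folio and point the old folio at the new folio's handle, so the
 * per-stack page counts stay balanced when either folio is freed.
 */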
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
	struct page_ext *page_ext;
	struct page_ext_iter iter;
	struct page_owner *old_page_owner;
	struct page_owner *new_page_owner;
	depot_stack_handle_t migrate_handle;

	page_ext = page_ext_get(&old->page);
	if (unlikely(!page_ext))
		return;

	old_page_owner = get_page_owner(page_ext);
	page_ext_put(page_ext);

	page_ext = page_ext_get(&newfolio->page);
	if (unlikely(!page_ext))
		return;

	new_page_owner = get_page_owner(page_ext);
	page_ext_put(page_ext);

	migrate_handle = new_page_owner->handle;
	__update_page_owner_handle(&newfolio->page, old_page_owner->handle,
				   old_page_owner->order, old_page_owner->gfp_mask,
				   old_page_owner->last_migrate_reason,
				   old_page_owner->ts_nsec, old_page_owner->pid,
				   old_page_owner->tgid, old_page_owner->comm);
	/*
	 * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
	 * will be freed after migration. Keep them until then as they may be
	 * useful.
	 */
	__update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
					old_page_owner->free_pid,
					old_page_owner->free_tgid,
					old_page_owner->free_ts_nsec);
	/*
	 * We linked the original stack to the new folio, so do the same for
	 * the old folio with the new stack; otherwise the per-stack page
	 * counts become unbalanced when those pages are subtracted.
	 */
	rcu_read_lock();
	for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
		old_page_owner = get_page_owner(page_ext);
		old_page_owner->handle = migrate_handle;
	}
	rcu_read_unlock();
}

void pagetypeinfo_showmixedcount_print(struct seq_file *m,
				       pg_data_t *pgdat, struct zone *zone)
{
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	unsigned long pfn, block_end_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };
	int pageblock_mt, page_mt;
	int i;

	/* Scan block by block. First and last block may be incomplete */
	pfn = zone->zone_start_pfn;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		page = pfn_to_online_page(pfn);
		if (!page) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = pageblock_end_pfn(pfn);
		block_end_pfn = min(block_end_pfn, end_pfn);

		pageblock_mt = get_pageblock_migratetype(page);

		for (; pfn < block_end_pfn; pfn++) {
			/* The pageblock is online, no need to recheck. */
			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (PageBuddy(page)) {
				unsigned long freepage_order;

				freepage_order = buddy_order_unsafe(page);
				if (freepage_order <= MAX_PAGE_ORDER)
					pfn += (1UL << freepage_order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = page_ext_get(page);
			if (unlikely(!page_ext))
				continue;

			if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
				goto ext_put_continue;

			page_owner = get_page_owner(page_ext);
			page_mt = gfp_migratetype(page_owner->gfp_mask);
			if (pageblock_mt != page_mt) {
				if (is_migrate_cma(pageblock_mt))
					count[MIGRATE_MOVABLE]++;
				else
					count[pageblock_mt]++;

				pfn = block_end_pfn;
				page_ext_put(page_ext);
				break;
			}
			pfn += (1UL << page_owner->order) - 1;
ext_put_continue:
			page_ext_put(page_ext);
		}
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (i = 0; i < MIGRATE_TYPES; i++)
		seq_printf(m, "%12lu ", count[i]);
	seq_putc(m, '\n');
}

/*
 * Look up any memcg information for the page and print it out.
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
					 struct page *page)
{
#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
	struct mem_cgroup *memcg;
	bool online;
	char name[80];

	rcu_read_lock();
	memcg_data = READ_ONCE(page->memcg_data);
	if (!memcg_data || PageTail(page))
		goto out_unlock;

	if (memcg_data & MEMCG_DATA_OBJEXTS)
		ret += scnprintf(kbuf + ret, count - ret,
				"Slab cache page\n");

	memcg = page_memcg_check(page);
	if (!memcg)
		goto out_unlock;

	online = (memcg->css.flags & CSS_ONLINE);
	cgroup_name(memcg->css.cgroup, name, sizeof(name));
	ret += scnprintf(kbuf + ret, count - ret,
			"Charged %sto %smemcg %s\n",
			PageMemcgKmem(page) ? "(via objcg) " : "",
			online ? "" : "offline ",
			name);
out_unlock:
	rcu_read_unlock();
#endif /* CONFIG_MEMCG */

	return ret;
}

static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
		struct page *page, struct page_owner *page_owner,
		depot_stack_handle_t handle)
{
	int ret, pageblock_mt, page_mt;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	ret = scnprintf(kbuf, count,
			"Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
			page_owner->order, page_owner->gfp_mask,
			&page_owner->gfp_mask, page_owner->pid,
			page_owner->tgid, page_owner->comm,
			page_owner->ts_nsec);

	/* Print information relevant to grouping pages by mobility */
	pageblock_mt = get_pageblock_migratetype(page);
	page_mt = gfp_migratetype(page_owner->gfp_mask);
	ret += scnprintf(kbuf + ret, count - ret,
			"PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
			pfn,
			migratetype_names[page_mt],
			pfn >> pageblock_order,
			migratetype_names[pageblock_mt],
			&page->flags);

	ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
	if (ret >= count)
		goto err;

	if (page_owner->last_migrate_reason != -1) {
		ret += scnprintf(kbuf + ret, count - ret,
			"Page has been migrated, last migrate reason: %s\n",
			migrate_reason_names[page_owner->last_migrate_reason]);
	}

	ret = print_page_owner_memcg(kbuf, count, ret, page);

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}

void __dump_page_owner(const struct page *page)
{
	struct page_ext *page_ext = page_ext_get((void *)page);
	struct page_owner *page_owner;
	depot_stack_handle_t handle;
	gfp_t gfp_mask;
	int mt;

	if (unlikely(!page_ext)) {
		pr_alert("There is no page extension available.\n");
		return;
	}

	page_owner = get_page_owner(page_ext);
	gfp_mask = page_owner->gfp_mask;
	mt = gfp_migratetype(gfp_mask);

	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
		pr_alert("page_owner info is not present (never set?)\n");
		page_ext_put(page_ext);
		return;
	}

	if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
		pr_alert("page_owner tracks the page as allocated\n");
	else
		pr_alert("page_owner tracks the page as freed\n");

	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
		 page_owner->pid, page_owner->tgid, page_owner->comm,
		 page_owner->ts_nsec, page_owner->free_ts_nsec);

	handle = READ_ONCE(page_owner->handle);
	if (!handle)
		pr_alert("page_owner allocation stack trace missing\n");
	else
		stack_depot_print(handle);

	handle = READ_ONCE(page_owner->free_handle);
	if (!handle) {
		pr_alert("page_owner free stack trace missing\n");
	} else {
		pr_alert("page last free pid %d tgid %d stack trace:\n",
			 page_owner->free_pid, page_owner->free_tgid);
		stack_depot_print(handle);
	}

	if (page_owner->last_migrate_reason != -1)
		pr_alert("page has been migrated, last migrate reason: %s\n",
			 migrate_reason_names[page_owner->last_migrate_reason]);
	page_ext_put(page_ext);
}

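/*
 * debugfs read handler for the page_owner file.  The file offset is used
 * as a PFN cursor: each read scans forward from *ppos for the next
 * allocated, owner-tracked page and emits one record for it.
 */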
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long pfn;
	struct page *page;
	struct page_ext *page_ext;
	struct page_owner *page_owner;
	depot_stack_handle_t handle;

	if (!static_branch_unlikely(&page_owner_inited))
		return -EINVAL;

	page = NULL;
	if (*ppos == 0)
		pfn = min_low_pfn;
	else
		pfn = *ppos;
	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
		pfn++;

	/* Find an allocated page */
	for (; pfn < max_pfn; pfn++) {
		/*
		 * This temporary page_owner copy lets us avoid context
		 * switches while holding the RCU lock: the copy, not the
		 * live page_ext data, is what is handed to copy_to_user()
		 * and the GFP_KERNEL allocations in print_page_owner().
		 */
		struct page_owner page_owner_tmp;

		/*
		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
		 * validate the area as existing, skip it if not
		 */
		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
			pfn += MAX_ORDER_NR_PAGES - 1;
			continue;
		}

		page = pfn_to_page(pfn);
		if (PageBuddy(page)) {
			unsigned long freepage_order = buddy_order_unsafe(page);

			if (freepage_order <= MAX_PAGE_ORDER)
				pfn += (1UL << freepage_order) - 1;
			continue;
		}

		page_ext = page_ext_get(page);
		if (unlikely(!page_ext))
			continue;

		/*
		 * Some pages could be missed by concurrent allocation or free,
		 * because we don't hold the zone lock.
		 */
		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
			goto ext_put_continue;

		/*
		 * Although we do have the info about past allocation of free
		 * pages, it's not relevant for current memory usage.
		 */
		if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
			goto ext_put_continue;

		page_owner = get_page_owner(page_ext);

		/*
		 * Don't print "tail" pages of high-order allocations as that
		 * would inflate the stats.
		 */
		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
			goto ext_put_continue;

		/*
		 * Access to page_owner->handle isn't synchronized, so be
		 * careful when reading it.
		 */
		handle = READ_ONCE(page_owner->handle);
		if (!handle)
			goto ext_put_continue;

		/* Record the next PFN to read in the file offset */
		*ppos = pfn + 1;

		page_owner_tmp = *page_owner;
		page_ext_put(page_ext);
		return print_page_owner(buf, count, pfn, page,
				&page_owner_tmp, handle);
ext_put_continue:
		page_ext_put(page_ext);
	}

	return 0;
}

static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
{
	switch (orig) {
	case SEEK_SET:
		file->f_pos = offset;
		break;
	case SEEK_CUR:
		file->f_pos += offset;
		break;
	default:
		return -EINVAL;
	}
	return file->f_pos;
}

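/*
 * At boot, stamp pages that were allocated before page_owner was ready
 * with early_handle so they are still recognised as tracked later.
 * init_early_allocated_pages() runs this for every populated zone.
 */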
static void init_pages_in_zone(struct zone *zone)
{
	unsigned long pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count = 0;

	/*
	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
	 * a zone boundary, it will be double counted between zones. This does
	 * not matter as the mixed block count will still be correct
	 */
	for (; pfn < end_pfn; ) {
		unsigned long block_end_pfn;

		if (!pfn_valid(pfn)) {
			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
			continue;
		}

		block_end_pfn = pageblock_end_pfn(pfn);
		block_end_pfn = min(block_end_pfn, end_pfn);

		for (; pfn < block_end_pfn; pfn++) {
			struct page *page = pfn_to_page(pfn);
			struct page_ext *page_ext;

			if (page_zone(page) != zone)
				continue;

			/*
			 * To avoid having to grab zone->lock, be a little
			 * careful when reading buddy page order. The only
			 * danger is that we skip too much and potentially miss
			 * some early allocated pages, which is better than
			 * heavy lock contention.
			 */
			if (PageBuddy(page)) {
				unsigned long order = buddy_order_unsafe(page);

				if (order > 0 && order <= MAX_PAGE_ORDER)
					pfn += (1UL << order) - 1;
				continue;
			}

			if (PageReserved(page))
				continue;

			page_ext = page_ext_get(page);
			if (unlikely(!page_ext))
				continue;

			/* Maybe overlapping zone */
			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
				goto ext_put_continue;

			/* Found early allocated page */
			__update_page_owner_handle(page, early_handle, 0, 0,
						   -1, local_clock(), current->pid,
						   current->tgid, current->comm);
			count++;
ext_put_continue:
			page_ext_put(page_ext);
		}
		cond_resched();
	}

	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
		zone->zone_pgdat->node_id, zone->name, count);
}

static void init_early_allocated_pages(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		init_pages_in_zone(zone);
}

static const struct file_operations page_owner_fops = {
	.read = read_page_owner,
	.llseek = lseek_page_owner,
};

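/*
 * seq_file iterator over stack_list for the page_owner_stacks debugfs
 * files.  stack_start() re-reads the list head with smp_load_acquire()
 * on a fresh traversal; stack_next() simply follows the next pointers.
 */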
static void *stack_start(struct seq_file *m, loff_t *ppos)
{
	struct stack *stack;
	struct stack_print_ctx *ctx = m->private;

	if (*ppos == -1UL)
		return NULL;

	if (!*ppos) {
		/*
		 * This pairs with smp_store_release() from function
		 * add_stack_record_to_list(), so we get a consistent
		 * value of stack_list.
		 */
		stack = smp_load_acquire(&stack_list);
		ctx->stack = stack;
	} else {
		stack = ctx->stack;
	}

	return stack;
}

static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
{
	struct stack *stack = v;
	struct stack_print_ctx *ctx = m->private;

	stack = stack->next;
	*ppos = stack ? *ppos + 1 : -1UL;
	ctx->stack = stack;

	return stack;
}

static unsigned long page_owner_pages_threshold;

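/*
 * Emit one stack according to ctx->flags.  The stored refcount is
 * nr_base_pages + 1 (one reference held by the list itself), hence the
 * "- 1" below; when page counts are requested, stacks below the
 * count_threshold are skipped.
 */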
static int stack_print(struct seq_file *m, void *v)
{
	int i, nr_base_pages;
	struct stack *stack = v;
	unsigned long *entries;
	unsigned long nr_entries;
	struct stack_record *stack_record = stack->stack_record;
	struct stack_print_ctx *ctx = m->private;

	if (!stack->stack_record)
		return 0;

	nr_base_pages = refcount_read(&stack_record->count) - 1;

	if (ctx->flags & STACK_PRINT_FLAG_PAGES &&
	    (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold))
		return 0;

	if (ctx->flags & STACK_PRINT_FLAG_STACK) {
		nr_entries = stack_record->size;
		entries = stack_record->entries;
		for (i = 0; i < nr_entries; i++)
			seq_printf(m, " %pS\n", (void *)entries[i]);
	}
	if (ctx->flags & STACK_PRINT_FLAG_HANDLE)
		seq_printf(m, "handle: %d\n", stack_record->handle.handle);
	if (ctx->flags & STACK_PRINT_FLAG_PAGES)
		seq_printf(m, "nr_base_pages: %d\n", nr_base_pages);
	seq_putc(m, '\n');

	return 0;
}

static void stack_stop(struct seq_file *m, void *v)
{
}

static const struct seq_operations page_owner_stack_op = {
	.start = stack_start,
	.next = stack_next,
	.stop = stack_stop,
	.show = stack_print
};

static int page_owner_stack_open(struct inode *inode, struct file *file)
{
	int ret = seq_open_private(file, &page_owner_stack_op,
				   sizeof(struct stack_print_ctx));

	if (!ret) {
		struct seq_file *m = file->private_data;
		struct stack_print_ctx *ctx = m->private;

		ctx->flags = (uintptr_t) inode->i_private;
	}

	return ret;
}

static const struct file_operations page_owner_stack_fops = {
	.open = page_owner_stack_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

static int page_owner_threshold_get(void *data, u64 *val)
{
	*val = READ_ONCE(page_owner_pages_threshold);
	return 0;
}

static int page_owner_threshold_set(void *data, u64 val)
{
	WRITE_ONCE(page_owner_pages_threshold, val);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
			&page_owner_threshold_set, "%llu");

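/*
 * Create the debugfs interface: the flat "page_owner" dump plus a
 * "page_owner_stacks" directory whose show_* files print the stacks
 * (optionally with handles and per-stack page counts) and whose
 * count_threshold file sets the minimum page count to report.
 */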
static int __init pageowner_init(void)
{
	struct dentry *dir;

	if (!static_branch_unlikely(&page_owner_inited)) {
		pr_info("page_owner is disabled\n");
		return 0;
	}

	debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
	dir = debugfs_create_dir("page_owner_stacks", NULL);
	debugfs_create_file("show_stacks", 0400, dir,
			    (void *)(STACK_PRINT_FLAG_STACK |
				     STACK_PRINT_FLAG_PAGES),
			    &page_owner_stack_fops);
	debugfs_create_file("show_handles", 0400, dir,
			    (void *)(STACK_PRINT_FLAG_HANDLE |
				     STACK_PRINT_FLAG_PAGES),
			    &page_owner_stack_fops);
	debugfs_create_file("show_stacks_handles", 0400, dir,
			    (void *)(STACK_PRINT_FLAG_STACK |
				     STACK_PRINT_FLAG_HANDLE),
			    &page_owner_stack_fops);
	debugfs_create_file("count_threshold", 0600, dir, NULL,
			    &page_owner_threshold_fops);
	return 0;
}
late_initcall(pageowner_init)