xref: /linux/mm/page_owner.c (revision 7203ca412fc8e8a0588e9adc0f777d3163f8dff3)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/debugfs.h>
3 #include <linux/mm.h>
4 #include <linux/slab.h>
5 #include <linux/uaccess.h>
6 #include <linux/memblock.h>
7 #include <linux/stacktrace.h>
8 #include <linux/page_owner.h>
9 #include <linux/jump_label.h>
10 #include <linux/migrate.h>
11 #include <linux/stackdepot.h>
12 #include <linux/seq_file.h>
13 #include <linux/memcontrol.h>
14 #include <linux/sched/clock.h>
15 
16 #include "internal.h"
17 
18 /*
19  * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
20  * to use off-stack temporary storage
21  */
22 #define PAGE_OWNER_STACK_DEPTH (16)
23 
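/*
 * Per-page metadata recorded by page_owner: the allocation order and gfp
 * mask, the allocation and free stack depot handles, timestamps, the last
 * migrate reason, and the pid/tgid/comm of the allocating and freeing tasks.
 * One instance lives in the page_ext area of every base page (see
 * page_owner_ops below).
 */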
24 struct page_owner {
25 	unsigned short order;
26 	short last_migrate_reason;
27 	gfp_t gfp_mask;
28 	depot_stack_handle_t handle;
29 	depot_stack_handle_t free_handle;
30 	u64 ts_nsec;
31 	u64 free_ts_nsec;
32 	char comm[TASK_COMM_LEN];
33 	pid_t pid;
34 	pid_t tgid;
35 	pid_t free_pid;
36 	pid_t free_tgid;
37 };
38 
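/*
 * Each unique allocation stack gets a struct stack node. Nodes are linked
 * into the singly-linked stack_list (stack_list_lock serializes writers) and
 * are what the page_owner_stacks debugfs files iterate over.
 */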
39 struct stack {
40 	struct stack_record *stack_record;
41 	struct stack *next;
42 };
43 static struct stack dummy_stack;
44 static struct stack failure_stack;
45 static struct stack *stack_list;
46 static DEFINE_SPINLOCK(stack_list_lock);
47 
48 #define STACK_PRINT_FLAG_STACK		0x1
49 #define STACK_PRINT_FLAG_PAGES		0x2
50 #define STACK_PRINT_FLAG_HANDLE		0x4
51 
52 struct stack_print_ctx {
53 	struct stack *stack;
54 	u8 flags;
55 };
56 
57 static bool page_owner_enabled __initdata;
58 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
59 
60 static depot_stack_handle_t dummy_handle;
61 static depot_stack_handle_t failure_handle;
62 static depot_stack_handle_t early_handle;
63 
64 static void init_early_allocated_pages(void);
65 
66 static inline void set_current_in_page_owner(void)
67 {
68 	/*
69 	 * Avoid recursion.
70 	 *
71 	 * The page_owner code may itself need to allocate memory, so flag
72 	 * the current task to keep those allocations from recursing back here.
73 	 */
74 	current->in_page_owner = 1;
75 }
76 
77 static inline void unset_current_in_page_owner(void)
78 {
79 	current->in_page_owner = 0;
80 }
81 
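/*
 * page_owner must be requested at boot, e.g. by adding "page_owner=on" to
 * the kernel command line; it cannot be enabled later because the page_ext
 * storage it relies on is sized during early init.
 */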
82 static int __init early_page_owner_param(char *buf)
83 {
84 	int ret = kstrtobool(buf, &page_owner_enabled);
85 
86 	if (page_owner_enabled)
87 		stack_depot_request_early_init();
88 
89 	return ret;
90 }
91 early_param("page_owner", early_page_owner_param);
92 
93 static __init bool need_page_owner(void)
94 {
95 	return page_owner_enabled;
96 }
97 
98 static __always_inline depot_stack_handle_t create_dummy_stack(void)
99 {
100 	unsigned long entries[4];
101 	unsigned int nr_entries;
102 
103 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
104 	return stack_depot_save(entries, nr_entries, GFP_KERNEL);
105 }
106 
107 static noinline void register_dummy_stack(void)
108 {
109 	dummy_handle = create_dummy_stack();
110 }
111 
112 static noinline void register_failure_stack(void)
113 {
114 	failure_handle = create_dummy_stack();
115 }
116 
117 static noinline void register_early_stack(void)
118 {
119 	early_handle = create_dummy_stack();
120 }
121 
122 static __init void init_page_owner(void)
123 {
124 	if (!page_owner_enabled)
125 		return;
126 
127 	register_dummy_stack();
128 	register_failure_stack();
129 	register_early_stack();
130 	init_early_allocated_pages();
131 	/* Initialize dummy and failure stacks and link them to stack_list */
132 	dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
133 	failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
134 	if (dummy_stack.stack_record)
135 		refcount_set(&dummy_stack.stack_record->count, 1);
136 	if (failure_stack.stack_record)
137 		refcount_set(&failure_stack.stack_record->count, 1);
138 	dummy_stack.next = &failure_stack;
139 	stack_list = &dummy_stack;
140 	static_branch_enable(&page_owner_inited);
141 }
142 
143 struct page_ext_operations page_owner_ops = {
144 	.size = sizeof(struct page_owner),
145 	.need = need_page_owner,
146 	.init = init_page_owner,
147 	.need_shared_flags = true,
148 };
149 
150 static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
151 {
152 	return page_ext_data(page_ext, &page_owner_ops);
153 }
154 
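/*
 * Capture the current stack trace into the stack depot, skipping the two
 * page_owner frames. Falls back to dummy_handle when called recursively from
 * within page_owner itself, and to failure_handle when the depot cannot
 * store the trace.
 */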
155 static noinline depot_stack_handle_t save_stack(gfp_t flags)
156 {
157 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
158 	depot_stack_handle_t handle;
159 	unsigned int nr_entries;
160 
161 	if (current->in_page_owner)
162 		return dummy_handle;
163 
164 	set_current_in_page_owner();
165 	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
166 	handle = stack_depot_save(entries, nr_entries, flags);
167 	if (!handle)
168 		handle = failure_handle;
169 	unset_current_in_page_owner();
170 
171 	return handle;
172 }
173 
174 static void add_stack_record_to_list(struct stack_record *stack_record,
175 				     gfp_t gfp_mask)
176 {
177 	unsigned long flags;
178 	struct stack *stack;
179 
180 	if (!gfpflags_allow_spinning(gfp_mask))
181 		return;
182 
183 	set_current_in_page_owner();
184 	stack = kmalloc(sizeof(*stack), gfp_nested_mask(gfp_mask));
185 	if (!stack) {
186 		unset_current_in_page_owner();
187 		return;
188 	}
189 	unset_current_in_page_owner();
190 
191 	stack->stack_record = stack_record;
192 	stack->next = NULL;
193 
194 	spin_lock_irqsave(&stack_list_lock, flags);
195 	stack->next = stack_list;
196 	/*
197 	 * This pairs with smp_load_acquire() from function
198 	 * stack_start(). This guarantees that stack_start()
199 	 * will see an updated stack_list before starting to
200 	 * traverse the list.
201 	 */
202 	smp_store_release(&stack_list, stack);
203 	spin_unlock_irqrestore(&stack_list_lock, flags);
204 }
205 
206 static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
207 				   int nr_base_pages)
208 {
209 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
210 
211 	if (!stack_record)
212 		return;
213 
214 	/*
215 	 * New stack_records that do not use STACK_DEPOT_FLAG_GET start
216 	 * with REFCOUNT_SATURATED to catch spurious increments of their
217 	 * refcount.
218 	 * Since we do not use the STACK_DEPOT_FLAG_GET API, set a
219 	 * refcount of 1 ourselves.
220 	 */
221 	if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
222 		int old = REFCOUNT_SATURATED;
223 
224 		if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
225 			/* Add the new stack_record to our list */
226 			add_stack_record_to_list(stack_record, gfp_mask);
227 	}
228 	refcount_add(nr_base_pages, &stack_record->count);
229 }
230 
231 static void dec_stack_record_count(depot_stack_handle_t handle,
232 				   int nr_base_pages)
233 {
234 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
235 
236 	if (!stack_record)
237 		return;
238 
239 	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
240 		pr_warn("%s: refcount went to 0 for handle %u\n", __func__,
241 			handle);
242 }
243 
244 static inline void __update_page_owner_handle(struct page *page,
245 					      depot_stack_handle_t handle,
246 					      unsigned short order,
247 					      gfp_t gfp_mask,
248 					      short last_migrate_reason, u64 ts_nsec,
249 					      pid_t pid, pid_t tgid, char *comm)
250 {
251 	struct page_ext_iter iter;
252 	struct page_ext *page_ext;
253 	struct page_owner *page_owner;
254 
255 	rcu_read_lock();
256 	for_each_page_ext(page, 1 << order, page_ext, iter) {
257 		page_owner = get_page_owner(page_ext);
258 		page_owner->handle = handle;
259 		page_owner->order = order;
260 		page_owner->gfp_mask = gfp_mask;
261 		page_owner->last_migrate_reason = last_migrate_reason;
262 		page_owner->pid = pid;
263 		page_owner->tgid = tgid;
264 		page_owner->ts_nsec = ts_nsec;
265 		strscpy(page_owner->comm, comm,
266 			sizeof(page_owner->comm));
267 		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
268 		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
269 	}
270 	rcu_read_unlock();
271 }
272 
273 static inline void __update_page_owner_free_handle(struct page *page,
274 						   depot_stack_handle_t handle,
275 						   unsigned short order,
276 						   pid_t pid, pid_t tgid,
277 						   u64 free_ts_nsec)
278 {
279 	struct page_ext_iter iter;
280 	struct page_ext *page_ext;
281 	struct page_owner *page_owner;
282 
283 	rcu_read_lock();
284 	for_each_page_ext(page, 1 << order, page_ext, iter) {
285 		page_owner = get_page_owner(page_ext);
286 		/* Only __reset_page_owner() wants to clear the bit */
287 		if (handle) {
288 			__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
289 			page_owner->free_handle = handle;
290 		}
291 		page_owner->free_ts_nsec = free_ts_nsec;
292 		page_owner->free_pid = current->pid;
293 		page_owner->free_tgid = current->tgid;
294 	}
295 	rcu_read_unlock();
296 }
297 
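/*
 * Hook called from the page free path: record the freeing stack, timestamp
 * and pid/tgid for every base page, and drop the freed pages from the
 * allocation stack's refcount so the page_owner_stacks accounting stays
 * balanced.
 */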
298 void __reset_page_owner(struct page *page, unsigned short order)
299 {
300 	struct page_ext *page_ext;
301 	depot_stack_handle_t handle;
302 	depot_stack_handle_t alloc_handle;
303 	struct page_owner *page_owner;
304 	u64 free_ts_nsec = local_clock();
305 
306 	page_ext = page_ext_get(page);
307 	if (unlikely(!page_ext))
308 		return;
309 
310 	page_owner = get_page_owner(page_ext);
311 	alloc_handle = page_owner->handle;
312 	page_ext_put(page_ext);
313 
314 	/*
315 	 * Do not specify GFP_NOWAIT, so that gfpflags_allow_spinning() returns
316 	 * false and stack_depot_save() avoids taking spinlocks.
317 	 * This is similar to the alloc_pages_nolock() gfp flags, but is only
318 	 * used to signal stack_depot to avoid spin_locks.
319 	 */
320 	handle = save_stack(__GFP_NOWARN);
321 	__update_page_owner_free_handle(page, handle, order, current->pid,
322 					current->tgid, free_ts_nsec);
323 
324 	if (alloc_handle != early_handle)
325 		/*
326 		 * early_handle is used as the handle for all early
327 		 * allocated pages, see init_pages_in_zone().
328 		 * Their refcount was never incremented because the
329 		 * machinery was not ready yet, so it must not be
330 		 * decremented here either.
331 		 */
332 		dec_stack_record_count(alloc_handle, 1 << order);
333 }
334 
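/*
 * Hook called from the page allocation path: save the allocation stack and
 * context for every base page of the new allocation and account the pages
 * to that stack's refcount.
 */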
335 noinline void __set_page_owner(struct page *page, unsigned short order,
336 					gfp_t gfp_mask)
337 {
338 	u64 ts_nsec = local_clock();
339 	depot_stack_handle_t handle;
340 
341 	handle = save_stack(gfp_mask);
342 	__update_page_owner_handle(page, handle, order, gfp_mask, -1,
343 				   ts_nsec, current->pid, current->tgid,
344 				   current->comm);
345 	inc_stack_record_count(handle, gfp_mask, 1 << order);
346 }
347 
348 void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
349 {
350 	struct page_ext *page_ext = page_ext_get(&folio->page);
351 	struct page_owner *page_owner;
352 
353 	if (unlikely(!page_ext))
354 		return;
355 
356 	page_owner = get_page_owner(page_ext);
357 	page_owner->last_migrate_reason = reason;
358 	page_ext_put(page_ext);
359 }
360 
361 void __split_page_owner(struct page *page, int old_order, int new_order)
362 {
363 	struct page_ext_iter iter;
364 	struct page_ext *page_ext;
365 	struct page_owner *page_owner;
366 
367 	rcu_read_lock();
368 	for_each_page_ext(page, 1 << old_order, page_ext, iter) {
369 		page_owner = get_page_owner(page_ext);
370 		page_owner->order = new_order;
371 	}
372 	rcu_read_unlock();
373 }
374 
375 void __folio_copy_owner(struct folio *newfolio, struct folio *old)
376 {
377 	struct page_ext *page_ext;
378 	struct page_ext_iter iter;
379 	struct page_owner *old_page_owner;
380 	struct page_owner *new_page_owner;
381 	depot_stack_handle_t migrate_handle;
382 
383 	page_ext = page_ext_get(&old->page);
384 	if (unlikely(!page_ext))
385 		return;
386 
387 	old_page_owner = get_page_owner(page_ext);
388 	page_ext_put(page_ext);
389 
390 	page_ext = page_ext_get(&newfolio->page);
391 	if (unlikely(!page_ext))
392 		return;
393 
394 	new_page_owner = get_page_owner(page_ext);
395 	page_ext_put(page_ext);
396 
397 	migrate_handle = new_page_owner->handle;
398 	__update_page_owner_handle(&newfolio->page, old_page_owner->handle,
399 				   old_page_owner->order, old_page_owner->gfp_mask,
400 				   old_page_owner->last_migrate_reason,
401 				   old_page_owner->ts_nsec, old_page_owner->pid,
402 				   old_page_owner->tgid, old_page_owner->comm);
403 	/*
404 	 * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
405 	 * will be freed after migration. Keep them until then as they may be
406 	 * useful.
407 	 */
408 	__update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
409 					old_page_owner->free_pid,
410 					old_page_owner->free_tgid,
411 					old_page_owner->free_ts_nsec);
412 	/*
413 	 * We linked the original stack to the new folio; do the same for the
414 	 * new stack and the old folio, otherwise there will be an imbalance
415 	 * when subtracting those pages from the stack.
416 	 */
417 	rcu_read_lock();
418 	for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
419 		old_page_owner = get_page_owner(page_ext);
420 		old_page_owner->handle = migrate_handle;
421 	}
422 	rcu_read_unlock();
423 }
424 
425 void pagetypeinfo_showmixedcount_print(struct seq_file *m,
426 				       pg_data_t *pgdat, struct zone *zone)
427 {
428 	struct page *page;
429 	struct page_ext *page_ext;
430 	struct page_owner *page_owner;
431 	unsigned long pfn, block_end_pfn;
432 	unsigned long end_pfn = zone_end_pfn(zone);
433 	unsigned long count[MIGRATE_TYPES] = { 0, };
434 	int pageblock_mt, page_mt;
435 	int i;
436 
437 	/* Scan block by block. First and last block may be incomplete */
438 	pfn = zone->zone_start_pfn;
439 
440 	/*
441 	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
442 	 * a zone boundary, it will be double counted between zones. This does
443 	 * not matter as the mixed block count will still be correct
444 	 */
445 	for (; pfn < end_pfn; ) {
446 		page = pfn_to_online_page(pfn);
447 		if (!page) {
448 			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
449 			continue;
450 		}
451 
452 		block_end_pfn = pageblock_end_pfn(pfn);
453 		block_end_pfn = min(block_end_pfn, end_pfn);
454 
455 		pageblock_mt = get_pageblock_migratetype(page);
456 
457 		for (; pfn < block_end_pfn; pfn++) {
458 			/* The pageblock is online, no need to recheck. */
459 			page = pfn_to_page(pfn);
460 
461 			if (page_zone(page) != zone)
462 				continue;
463 
464 			if (PageBuddy(page)) {
465 				unsigned long freepage_order;
466 
467 				freepage_order = buddy_order_unsafe(page);
468 				if (freepage_order <= MAX_PAGE_ORDER)
469 					pfn += (1UL << freepage_order) - 1;
470 				continue;
471 			}
472 
473 			if (PageReserved(page))
474 				continue;
475 
476 			page_ext = page_ext_get(page);
477 			if (unlikely(!page_ext))
478 				continue;
479 
480 			if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
481 				goto ext_put_continue;
482 
483 			page_owner = get_page_owner(page_ext);
484 			page_mt = gfp_migratetype(page_owner->gfp_mask);
485 			if (pageblock_mt != page_mt) {
486 				if (is_migrate_cma(pageblock_mt))
487 					count[MIGRATE_MOVABLE]++;
488 				else
489 					count[pageblock_mt]++;
490 
491 				pfn = block_end_pfn;
492 				page_ext_put(page_ext);
493 				break;
494 			}
495 			pfn += (1UL << page_owner->order) - 1;
496 ext_put_continue:
497 			page_ext_put(page_ext);
498 		}
499 	}
500 
501 	/* Print counts */
502 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
503 	for (i = 0; i < MIGRATE_TYPES; i++)
504 		seq_printf(m, "%12lu ", count[i]);
505 	seq_putc(m, '\n');
506 }
507 
508 /*
509  * Look up the memcg information for the page and print it out
510  */
511 static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
512 					 struct page *page)
513 {
514 #ifdef CONFIG_MEMCG
515 	unsigned long memcg_data;
516 	struct mem_cgroup *memcg;
517 	bool online;
518 	char name[80];
519 
520 	rcu_read_lock();
521 	memcg_data = READ_ONCE(page->memcg_data);
522 	if (!memcg_data || PageTail(page))
523 		goto out_unlock;
524 
525 	if (memcg_data & MEMCG_DATA_OBJEXTS)
526 		ret += scnprintf(kbuf + ret, count - ret,
527 				"Slab cache page\n");
528 
529 	memcg = page_memcg_check(page);
530 	if (!memcg)
531 		goto out_unlock;
532 
533 	online = (memcg->css.flags & CSS_ONLINE);
534 	cgroup_name(memcg->css.cgroup, name, sizeof(name));
535 	ret += scnprintf(kbuf + ret, count - ret,
536 			"Charged %sto %smemcg %s\n",
537 			PageMemcgKmem(page) ? "(via objcg) " : "",
538 			online ? "" : "offline ",
539 			name);
540 out_unlock:
541 	rcu_read_unlock();
542 #endif /* CONFIG_MEMCG */
543 
544 	return ret;
545 }
546 
547 static ssize_t
548 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
549 		struct page *page, struct page_owner *page_owner,
550 		depot_stack_handle_t handle)
551 {
552 	int ret, pageblock_mt, page_mt;
553 	char *kbuf;
554 
555 	count = min_t(size_t, count, PAGE_SIZE);
556 	kbuf = kmalloc(count, GFP_KERNEL);
557 	if (!kbuf)
558 		return -ENOMEM;
559 
560 	ret = scnprintf(kbuf, count,
561 			"Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
562 			page_owner->order, page_owner->gfp_mask,
563 			&page_owner->gfp_mask, page_owner->pid,
564 			page_owner->tgid, page_owner->comm,
565 			page_owner->ts_nsec);
566 
567 	/* Print information relevant to grouping pages by mobility */
568 	pageblock_mt = get_pageblock_migratetype(page);
569 	page_mt  = gfp_migratetype(page_owner->gfp_mask);
570 	ret += scnprintf(kbuf + ret, count - ret,
571 			"PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
572 			pfn,
573 			migratetype_names[page_mt],
574 			pfn >> pageblock_order,
575 			migratetype_names[pageblock_mt],
576 			&page->flags);
577 
578 	ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
579 	if (ret >= count)
580 		goto err;
581 
582 	if (page_owner->last_migrate_reason != -1) {
583 		ret += scnprintf(kbuf + ret, count - ret,
584 			"Page has been migrated, last migrate reason: %s\n",
585 			migrate_reason_names[page_owner->last_migrate_reason]);
586 	}
587 
588 	ret = print_page_owner_memcg(kbuf, count, ret, page);
589 
590 	ret += snprintf(kbuf + ret, count - ret, "\n");
591 	if (ret >= count)
592 		goto err;
593 
594 	if (copy_to_user(buf, kbuf, ret))
595 		ret = -EFAULT;
596 
597 	kfree(kbuf);
598 	return ret;
599 
600 err:
601 	kfree(kbuf);
602 	return -ENOMEM;
603 }
604 
605 void __dump_page_owner(const struct page *page)
606 {
607 	struct page_ext *page_ext = page_ext_get((void *)page);
608 	struct page_owner *page_owner;
609 	depot_stack_handle_t handle;
610 	gfp_t gfp_mask;
611 	int mt;
612 
613 	if (unlikely(!page_ext)) {
614 		pr_alert("There is no page extension available.\n");
615 		return;
616 	}
617 
618 	page_owner = get_page_owner(page_ext);
619 	gfp_mask = page_owner->gfp_mask;
620 	mt = gfp_migratetype(gfp_mask);
621 
622 	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
623 		pr_alert("page_owner info is not present (never set?)\n");
624 		page_ext_put(page_ext);
625 		return;
626 	}
627 
628 	if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
629 		pr_alert("page_owner tracks the page as allocated\n");
630 	else
631 		pr_alert("page_owner tracks the page as freed\n");
632 
633 	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
634 		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
635 		 page_owner->pid, page_owner->tgid, page_owner->comm,
636 		 page_owner->ts_nsec, page_owner->free_ts_nsec);
637 
638 	handle = READ_ONCE(page_owner->handle);
639 	if (!handle)
640 		pr_alert("page_owner allocation stack trace missing\n");
641 	else
642 		stack_depot_print(handle);
643 
644 	handle = READ_ONCE(page_owner->free_handle);
645 	if (!handle) {
646 		pr_alert("page_owner free stack trace missing\n");
647 	} else {
648 		pr_alert("page last free pid %d tgid %d stack trace:\n",
649 			  page_owner->free_pid, page_owner->free_tgid);
650 		stack_depot_print(handle);
651 	}
652 
653 	if (page_owner->last_migrate_reason != -1)
654 		pr_alert("page has been migrated, last migrate reason: %s\n",
655 			migrate_reason_names[page_owner->last_migrate_reason]);
656 	page_ext_put(page_ext);
657 }
658 
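/*
 * Read handler for the debugfs "page_owner" file (normally
 * /sys/kernel/debug/page_owner, assuming debugfs is mounted there). Every
 * read() returns the record of one currently allocated page; *ppos tracks
 * the next PFN to scan, so the whole report can be collected with e.g.:
 *
 *	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 */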
659 static ssize_t
660 read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
661 {
662 	unsigned long pfn;
663 	struct page *page;
664 	struct page_ext *page_ext;
665 	struct page_owner *page_owner;
666 	depot_stack_handle_t handle;
667 
668 	if (!static_branch_unlikely(&page_owner_inited))
669 		return -EINVAL;
670 
671 	page = NULL;
672 	if (*ppos == 0)
673 		pfn = min_low_pfn;
674 	else
675 		pfn = *ppos;
676 	/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
677 	while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
678 		pfn++;
679 
680 	/* Find an allocated page */
681 	for (; pfn < max_pfn; pfn++) {
682 		/*
683 		 * This temporary copy of the page_owner data is required so
684 		 * that we do not risk a context switch while still holding
685 		 * the rcu lock, when copying the information to userspace
686 		 * via copy_to_user() or doing GFP_KERNEL allocations.
687 		 */
688 		struct page_owner page_owner_tmp;
689 
690 		/*
691 		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
692 		 * validate the area as existing, skip it if not
693 		 */
694 		if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
695 			pfn += MAX_ORDER_NR_PAGES - 1;
696 			continue;
697 		}
698 
699 		page = pfn_to_page(pfn);
700 		if (PageBuddy(page)) {
701 			unsigned long freepage_order = buddy_order_unsafe(page);
702 
703 			if (freepage_order <= MAX_PAGE_ORDER)
704 				pfn += (1UL << freepage_order) - 1;
705 			continue;
706 		}
707 
708 		page_ext = page_ext_get(page);
709 		if (unlikely(!page_ext))
710 			continue;
711 
712 		/*
713 		 * Some pages could be missed by concurrent allocation or free,
714 		 * because we don't hold the zone lock.
715 		 */
716 		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
717 			goto ext_put_continue;
718 
719 		/*
720 		 * Although we do have the info about past allocation of free
721 		 * pages, it's not relevant for current memory usage.
722 		 */
723 		if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
724 			goto ext_put_continue;
725 
726 		page_owner = get_page_owner(page_ext);
727 
728 		/*
729 		 * Don't print "tail" pages of high-order allocations as that
730 		 * would inflate the stats.
731 		 */
732 		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
733 			goto ext_put_continue;
734 
735 		/*
736 		 * Access to page_owner->handle isn't synchronized, so be
737 		 * careful when reading it.
738 		 */
739 		handle = READ_ONCE(page_owner->handle);
740 		if (!handle)
741 			goto ext_put_continue;
742 
743 		/* Record the next PFN to read in the file offset */
744 		*ppos = pfn + 1;
745 
746 		page_owner_tmp = *page_owner;
747 		page_ext_put(page_ext);
748 		return print_page_owner(buf, count, pfn, page,
749 				&page_owner_tmp, handle);
750 ext_put_continue:
751 		page_ext_put(page_ext);
752 	}
753 
754 	return 0;
755 }
756 
757 static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
758 {
759 	switch (orig) {
760 	case SEEK_SET:
761 		file->f_pos = offset;
762 		break;
763 	case SEEK_CUR:
764 		file->f_pos += offset;
765 		break;
766 	default:
767 		return -EINVAL;
768 	}
769 	return file->f_pos;
770 }
771 
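/*
 * Walk one zone at boot and tag every page that was allocated before
 * page_owner was ready with early_handle, so such pages are at least
 * reported even though their true allocation stack is unknown.
 */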
772 static void init_pages_in_zone(struct zone *zone)
773 {
774 	unsigned long pfn = zone->zone_start_pfn;
775 	unsigned long end_pfn = zone_end_pfn(zone);
776 	unsigned long count = 0;
777 
778 	/*
779 	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
780 	 * a zone boundary, it will be double counted between zones. This does
781 	 * not matter as the mixed block count will still be correct
782 	 */
783 	for (; pfn < end_pfn; ) {
784 		unsigned long block_end_pfn;
785 
786 		if (!pfn_valid(pfn)) {
787 			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
788 			continue;
789 		}
790 
791 		block_end_pfn = pageblock_end_pfn(pfn);
792 		block_end_pfn = min(block_end_pfn, end_pfn);
793 
794 		for (; pfn < block_end_pfn; pfn++) {
795 			struct page *page = pfn_to_page(pfn);
796 			struct page_ext *page_ext;
797 
798 			if (page_zone(page) != zone)
799 				continue;
800 
801 			/*
802 			 * To avoid having to grab zone->lock, be a little
803 			 * careful when reading buddy page order. The only
804 			 * danger is that we skip too much and potentially miss
805 			 * some early allocated pages, which is better than
806 			 * heavy lock contention.
807 			 */
808 			if (PageBuddy(page)) {
809 				unsigned long order = buddy_order_unsafe(page);
810 
811 				if (order > 0 && order <= MAX_PAGE_ORDER)
812 					pfn += (1UL << order) - 1;
813 				continue;
814 			}
815 
816 			if (PageReserved(page))
817 				continue;
818 
819 			page_ext = page_ext_get(page);
820 			if (unlikely(!page_ext))
821 				continue;
822 
823 			/* Maybe overlapping zone */
824 			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
825 				goto ext_put_continue;
826 
827 			/* Found early allocated page */
828 			__update_page_owner_handle(page, early_handle, 0, 0,
829 						   -1, local_clock(), current->pid,
830 						   current->tgid, current->comm);
831 			count++;
832 ext_put_continue:
833 			page_ext_put(page_ext);
834 		}
835 		cond_resched();
836 	}
837 
838 	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
839 		zone->zone_pgdat->node_id, zone->name, count);
840 }
841 
842 static void init_early_allocated_pages(void)
843 {
844 	struct zone *zone;
845 
846 	for_each_populated_zone(zone)
847 		init_pages_in_zone(zone);
848 }
849 
850 static const struct file_operations page_owner_fops = {
851 	.read		= read_page_owner,
852 	.llseek		= lseek_page_owner,
853 };
854 
855 static void *stack_start(struct seq_file *m, loff_t *ppos)
856 {
857 	struct stack *stack;
858 	struct stack_print_ctx *ctx = m->private;
859 
860 	if (*ppos == -1UL)
861 		return NULL;
862 
863 	if (!*ppos) {
864 		/*
865 		 * This pairs with smp_store_release() from function
866 		 * add_stack_record_to_list(), so we get a consistent
867 		 * value of stack_list.
868 		 */
869 		stack = smp_load_acquire(&stack_list);
870 		ctx->stack = stack;
871 	} else {
872 		stack = ctx->stack;
873 	}
874 
875 	return stack;
876 }
877 
878 static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
879 {
880 	struct stack *stack = v;
881 	struct stack_print_ctx *ctx = m->private;
882 
883 	stack = stack->next;
884 	*ppos = stack ? *ppos + 1 : -1UL;
885 	ctx->stack = stack;
886 
887 	return stack;
888 }
889 
890 static unsigned long page_owner_pages_threshold;
891 
892 static int stack_print(struct seq_file *m, void *v)
893 {
894 	int i, nr_base_pages;
895 	struct stack *stack = v;
896 	unsigned long *entries;
897 	unsigned long nr_entries;
898 	struct stack_record *stack_record = stack->stack_record;
899 	struct stack_print_ctx *ctx = m->private;
900 
901 	if (!stack->stack_record)
902 		return 0;
903 
904 	nr_base_pages = refcount_read(&stack_record->count) - 1;
905 
906 	if (ctx->flags & STACK_PRINT_FLAG_PAGES &&
907 	    (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold))
908 		return 0;
909 
910 	if (ctx->flags & STACK_PRINT_FLAG_STACK) {
911 		nr_entries = stack_record->size;
912 		entries = stack_record->entries;
913 		for (i = 0; i < nr_entries; i++)
914 			seq_printf(m, " %pS\n", (void *)entries[i]);
915 	}
916 	if (ctx->flags & STACK_PRINT_FLAG_HANDLE)
917 		seq_printf(m, "handle: %d\n", stack_record->handle.handle);
918 	if (ctx->flags & STACK_PRINT_FLAG_PAGES)
919 		seq_printf(m, "nr_base_pages: %d\n", nr_base_pages);
920 	seq_putc(m, '\n');
921 
922 	return 0;
923 }
924 
925 static void stack_stop(struct seq_file *m, void *v)
926 {
927 }
928 
929 static const struct seq_operations page_owner_stack_op = {
930 	.start	= stack_start,
931 	.next	= stack_next,
932 	.stop	= stack_stop,
933 	.show	= stack_print
934 };
935 
936 static int page_owner_stack_open(struct inode *inode, struct file *file)
937 {
938 	int ret = seq_open_private(file, &page_owner_stack_op,
939 				   sizeof(struct stack_print_ctx));
940 
941 	if (!ret) {
942 		struct seq_file *m = file->private_data;
943 		struct stack_print_ctx *ctx = m->private;
944 
945 		ctx->flags = (uintptr_t) inode->i_private;
946 	}
947 
948 	return ret;
949 }
950 
951 static const struct file_operations page_owner_stack_fops = {
952 	.open		= page_owner_stack_open,
953 	.read		= seq_read,
954 	.llseek		= seq_lseek,
955 	.release	= seq_release,
956 };
957 
958 static int page_owner_threshold_get(void *data, u64 *val)
959 {
960 	*val = READ_ONCE(page_owner_pages_threshold);
961 	return 0;
962 }
963 
964 static int page_owner_threshold_set(void *data, u64 val)
965 {
966 	WRITE_ONCE(page_owner_pages_threshold, val);
967 	return 0;
968 }
969 
970 DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
971 			&page_owner_threshold_set, "%llu");
972 
973 
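/*
 * Create the debugfs interface: "page_owner" with the per-page records, and
 * the "page_owner_stacks" directory whose show_stacks/show_handles files
 * list outstanding allocation stacks, filtered by the page count written to
 * "count_threshold". Assuming debugfs is mounted at /sys/kernel/debug, a
 * typical use is:
 *
 *	echo 1000 > /sys/kernel/debug/page_owner_stacks/count_threshold
 *	cat /sys/kernel/debug/page_owner_stacks/show_stacks
 */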
974 static int __init pageowner_init(void)
975 {
976 	struct dentry *dir;
977 
978 	if (!static_branch_unlikely(&page_owner_inited)) {
979 		pr_info("page_owner is disabled\n");
980 		return 0;
981 	}
982 
983 	debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
984 	dir = debugfs_create_dir("page_owner_stacks", NULL);
985 	debugfs_create_file("show_stacks", 0400, dir,
986 			    (void *)(STACK_PRINT_FLAG_STACK |
987 				     STACK_PRINT_FLAG_PAGES),
988 			     &page_owner_stack_fops);
989 	debugfs_create_file("show_handles", 0400, dir,
990 			    (void *)(STACK_PRINT_FLAG_HANDLE |
991 				     STACK_PRINT_FLAG_PAGES),
992 			    &page_owner_stack_fops);
993 	debugfs_create_file("show_stacks_handles", 0400, dir,
994 			    (void *)(STACK_PRINT_FLAG_STACK |
995 				     STACK_PRINT_FLAG_HANDLE),
996 			    &page_owner_stack_fops);
997 	debugfs_create_file("count_threshold", 0600, dir, NULL,
998 			    &page_owner_threshold_fops);
999 	return 0;
1000 }
1001 late_initcall(pageowner_init)
1002