xref: /linux/lib/alloc_tag.c (revision e2683c8868d03382da7e1ce8453b543a043066d1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/alloc_tag.h>
3 #include <linux/execmem.h>
4 #include <linux/fs.h>
5 #include <linux/gfp.h>
6 #include <linux/kallsyms.h>
7 #include <linux/module.h>
8 #include <linux/page_ext.h>
9 #include <linux/pgalloc_tag.h>
10 #include <linux/proc_fs.h>
11 #include <linux/rcupdate.h>
12 #include <linux/seq_buf.h>
13 #include <linux/seq_file.h>
14 #include <linux/string_choices.h>
15 #include <linux/vmalloc.h>
16 #include <linux/kmemleak.h>
17 
18 #define ALLOCINFO_FILE_NAME		"allocinfo"
19 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
20 #define SECTION_START(NAME)		(CODETAG_SECTION_START_PREFIX NAME)
21 #define SECTION_STOP(NAME)		(CODETAG_SECTION_STOP_PREFIX NAME)
22 
23 #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
24 static bool mem_profiling_support = true;
25 #else
26 static bool mem_profiling_support;
27 #endif
28 
29 static struct codetag_type *alloc_tag_cttype;
30 
31 #ifdef CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
32 DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
33 EXPORT_SYMBOL(_shared_alloc_tag);
34 #endif
35 
36 DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
37 			mem_alloc_profiling_key);
38 EXPORT_SYMBOL(mem_alloc_profiling_key);
39 
40 DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed);
41 
42 struct alloc_tag_kernel_section kernel_tags = { NULL, 0 };
43 unsigned long alloc_tag_ref_mask;
44 int alloc_tag_ref_offs;
45 
46 struct allocinfo_private {
47 	struct codetag_iterator iter;
48 	bool print_header;
49 };
50 
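/*
 * seq_file iterator for /proc/allocinfo. The codetag module list is held
 * locked from ->start to ->stop so tags cannot disappear while being printed.
 */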
51 static void *allocinfo_start(struct seq_file *m, loff_t *pos)
52 {
53 	struct allocinfo_private *priv;
54 	loff_t node = *pos;
55 
56 	priv = (struct allocinfo_private *)m->private;
57 	codetag_lock_module_list(alloc_tag_cttype, true);
58 	if (node == 0) {
59 		priv->print_header = true;
60 		priv->iter = codetag_get_ct_iter(alloc_tag_cttype);
61 		codetag_next_ct(&priv->iter);
62 	}
63 	return priv->iter.ct ? priv : NULL;
64 }
65 
66 static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
67 {
68 	struct allocinfo_private *priv = (struct allocinfo_private *)arg;
69 	struct codetag *ct = codetag_next_ct(&priv->iter);
70 
71 	(*pos)++;
72 	if (!ct)
73 		return NULL;
74 
75 	return priv;
76 }
77 
78 static void allocinfo_stop(struct seq_file *m, void *arg)
79 {
80 	codetag_lock_module_list(alloc_tag_cttype, false);
81 }
82 
83 static void print_allocinfo_header(struct seq_buf *buf)
84 {
85 	/* Output format version, so we can change it. */
86 	seq_buf_printf(buf, "allocinfo - version: 2.0\n");
87 	seq_buf_printf(buf, "#     <size>  <calls> <tag info>\n");
88 }
89 
90 static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
91 {
92 	struct alloc_tag *tag = ct_to_alloc_tag(ct);
93 	struct alloc_tag_counters counter = alloc_tag_read(tag);
94 	s64 bytes = counter.bytes;
95 
96 	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
97 	codetag_to_text(out, ct);
98 	if (unlikely(alloc_tag_is_inaccurate(tag)))
99 		seq_buf_printf(out, " accurate:no");
100 	seq_buf_putc(out, ' ');
101 	seq_buf_putc(out, '\n');
102 }
103 
104 static int allocinfo_show(struct seq_file *m, void *arg)
105 {
106 	struct allocinfo_private *priv = (struct allocinfo_private *)arg;
107 	char *bufp;
108 	size_t n = seq_get_buf(m, &bufp);
109 	struct seq_buf buf;
110 
111 	seq_buf_init(&buf, bufp, n);
112 	if (priv->print_header) {
113 		print_allocinfo_header(&buf);
114 		priv->print_header = false;
115 	}
116 	alloc_tag_to_text(&buf, priv->iter.ct);
117 	seq_commit(m, seq_buf_used(&buf));
118 	return 0;
119 }
120 
121 static const struct seq_operations allocinfo_seq_op = {
122 	.start	= allocinfo_start,
123 	.next	= allocinfo_next,
124 	.stop	= allocinfo_stop,
125 	.show	= allocinfo_show,
126 };
127 
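/*
 * Fill @tags with up to @count entries describing the tags with the largest
 * outstanding byte counts, kept sorted in descending order by a simple
 * insertion pass over all tags. Returns the number of entries filled in, or
 * 0 if the codetag lock cannot be taken when @can_sleep is false.
 */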
128 size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
129 {
130 	struct codetag_iterator iter;
131 	struct codetag *ct;
132 	struct codetag_bytes n;
133 	unsigned int i, nr = 0;
134 
135 	if (IS_ERR_OR_NULL(alloc_tag_cttype))
136 		return 0;
137 
138 	if (can_sleep)
139 		codetag_lock_module_list(alloc_tag_cttype, true);
140 	else if (!codetag_trylock_module_list(alloc_tag_cttype))
141 		return 0;
142 
143 	iter = codetag_get_ct_iter(alloc_tag_cttype);
144 	while ((ct = codetag_next_ct(&iter))) {
145 		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
146 
147 		n.ct	= ct;
148 		n.bytes = counter.bytes;
149 
150 		for (i = 0; i < nr; i++)
151 			if (n.bytes > tags[i].bytes)
152 				break;
153 
154 		if (i < count) {
155 			nr -= nr == count;
156 			memmove(&tags[i + 1],
157 				&tags[i],
158 				sizeof(tags[0]) * (nr - i));
159 			nr++;
160 			tags[i] = n;
161 		}
162 	}
163 
164 	codetag_lock_module_list(alloc_tag_cttype, false);
165 
166 	return nr;
167 }
168 
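/*
 * When a folio of @old_order is split into chunks of @new_order, point the
 * first page of every additional chunk at the original allocation tag so the
 * whole allocation stays attributed to the same call site.
 */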
169 void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
170 {
171 	int i;
172 	struct alloc_tag *tag;
173 	unsigned int nr_pages = 1 << new_order;
174 
175 	if (!mem_alloc_profiling_enabled())
176 		return;
177 
178 	tag = __pgalloc_tag_get(&folio->page);
179 	if (!tag)
180 		return;
181 
182 	for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
183 		union pgtag_ref_handle handle;
184 		union codetag_ref ref;
185 
186 		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
187 			/* Set new reference to point to the original tag */
188 			alloc_tag_ref_set(&ref, tag);
189 			update_page_tag_ref(handle, &ref);
190 			put_page_tag_ref(handle);
191 		}
192 	}
193 }
194 
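/*
 * Exchange the allocation tag references of @new and @old (e.g. when one
 * folio replaces the other) so each tag's bytes/calls accounting stays
 * balanced when the folios are eventually freed.
 */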
195 void pgalloc_tag_swap(struct folio *new, struct folio *old)
196 {
197 	union pgtag_ref_handle handle_old, handle_new;
198 	union codetag_ref ref_old, ref_new;
199 	struct alloc_tag *tag_old, *tag_new;
200 
201 	if (!mem_alloc_profiling_enabled())
202 		return;
203 
204 	tag_old = __pgalloc_tag_get(&old->page);
205 	if (!tag_old)
206 		return;
207 	tag_new = __pgalloc_tag_get(&new->page);
208 	if (!tag_new)
209 		return;
210 
211 	if (!get_page_tag_ref(&old->page, &ref_old, &handle_old))
212 		return;
213 	if (!get_page_tag_ref(&new->page, &ref_new, &handle_new)) {
214 		put_page_tag_ref(handle_old);
215 		return;
216 	}
217 
218 	/*
219 	 * Clear tag references to avoid a debug warning when using
220 	 * __alloc_tag_ref_set() with a non-empty reference.
221 	 */
222 	set_codetag_empty(&ref_old);
223 	set_codetag_empty(&ref_new);
224 
225 	/* swap tags */
226 	__alloc_tag_ref_set(&ref_old, tag_new);
227 	update_page_tag_ref(handle_old, &ref_old);
228 	__alloc_tag_ref_set(&ref_new, tag_old);
229 	update_page_tag_ref(handle_new, &ref_new);
230 
231 	put_page_tag_ref(handle_old);
232 	put_page_tag_ref(handle_new);
233 }
234 
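/*
 * Permanently disable memory allocation profiling: turn off the static key,
 * optionally remove /proc/allocinfo, and clear mem_profiling_support so the
 * feature cannot be re-enabled later.
 */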
235 static void shutdown_mem_profiling(bool remove_file)
236 {
237 	if (mem_alloc_profiling_enabled())
238 		static_branch_disable(&mem_alloc_profiling_key);
239 
240 	if (!mem_profiling_support)
241 		return;
242 
243 	if (remove_file)
244 		remove_proc_entry(ALLOCINFO_FILE_NAME, NULL);
245 	mem_profiling_support = false;
246 }
247 
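/*
 * With compressed tag references, locate the kernel's alloc_tag section via
 * kallsyms, verify that all kernel tags can be indexed with the unused page
 * flag bits, and compute the offset/mask used to pack tag indices into page
 * flags. Does nothing when profiling is unsupported or compression is off.
 */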
248 void __init alloc_tag_sec_init(void)
249 {
250 	struct alloc_tag *last_codetag;
251 
252 	if (!mem_profiling_support)
253 		return;
254 
255 	if (!static_key_enabled(&mem_profiling_compressed))
256 		return;
257 
258 	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
259 					SECTION_START(ALLOC_TAG_SECTION_NAME));
260 	last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
261 					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
262 	kernel_tags.count = last_codetag - kernel_tags.first_tag;
263 
264 	/* Check if kernel tags fit into page flags */
265 	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
266 		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
267 		pr_err("%lu allocation tags cannot be referenced using %d available page flag bits. Memory allocation profiling is disabled!\n",
268 			kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS);
269 		return;
270 	}
271 
272 	alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS);
273 	alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1);
274 	pr_debug("Memory allocation profiling compression is using %d page flag bits!\n",
275 		 NR_UNUSED_PAGEFLAG_BITS);
276 }
277 
278 #ifdef CONFIG_MODULES
279 
280 static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE);
281 static struct vm_struct *vm_module_tags;
282 /* A dummy object used to indicate an unloaded module */
283 static struct module unloaded_mod;
284 /* A dummy object used to indicate a module prepended area */
285 static struct module prepend_mod;
286 
287 struct alloc_tag_module_section module_tags;
288 
289 static inline unsigned long alloc_tag_align(unsigned long val)
290 {
291 	if (!static_key_enabled(&mem_profiling_compressed)) {
292 		/* No alignment requirements when we are not indexing the tags */
293 		return val;
294 	}
295 
296 	if (val % sizeof(struct alloc_tag) == 0)
297 		return val;
298 	return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag);
299 }
300 
301 static bool ensure_alignment(unsigned long align, unsigned int *prepend)
302 {
303 	if (!static_key_enabled(&mem_profiling_compressed)) {
304 		/* No alignment requirements when we are not indexing the tags */
305 		return true;
306 	}
307 
308 	/*
309 	 * If alloc_tag size is not a multiple of required alignment, tag
310 	 * indexing does not work.
311 	 */
312 	if (!IS_ALIGNED(sizeof(struct alloc_tag), align))
313 		return false;
314 
315 	/* Ensure prepend consumes a multiple of alloc_tag-sized blocks */
316 	if (*prepend)
317 		*prepend = alloc_tag_align(*prepend);
318 
319 	return true;
320 }
321 
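/*
 * With compressed references, check that every possible tag index (kernel
 * tags plus the module tag area used so far) still fits in the unused page
 * flag bits. Always true when tags are stored in page_ext instead.
 */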
322 static inline bool tags_addressable(void)
323 {
324 	unsigned long tag_idx_count;
325 
326 		return true; /* with page_ext, tags are always addressable */
327 		return true; /* with page_ext tags are always addressable */
328 
329 	tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count +
330 			module_tags.size / sizeof(struct alloc_tag);
331 
332 	return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS);
333 }
334 
335 static bool needs_section_mem(struct module *mod, unsigned long size)
336 {
337 	if (!mem_profiling_support)
338 		return false;
339 
340 	return size >= sizeof(struct alloc_tag);
341 }
342 
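/*
 * Free the per-cpu counters of every tag in [start_tag, end_tag] that has no
 * outstanding bytes. Returns true only if all counters in the range could be
 * freed, i.e. the range is safe to release.
 */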
343 static bool clean_unused_counters(struct alloc_tag *start_tag,
344 				  struct alloc_tag *end_tag)
345 {
346 	struct alloc_tag *tag;
347 	bool ret = true;
348 
349 	for (tag = start_tag; tag <= end_tag; tag++) {
350 		struct alloc_tag_counters counter;
351 
352 		if (!tag->counters)
353 			continue;
354 
355 		counter = alloc_tag_read(tag);
356 		if (!counter.bytes) {
357 			free_percpu(tag->counters);
358 			tag->counters = NULL;
359 		} else {
360 			ret = false;
361 		}
362 	}
363 
364 	return ret;
365 }
366 
367 /* Called with mod_area_mt locked */
368 static void clean_unused_module_areas_locked(void)
369 {
370 	MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
371 	struct module *val;
372 
373 	mas_for_each(&mas, val, module_tags.size) {
374 		struct alloc_tag *start_tag;
375 		struct alloc_tag *end_tag;
376 
377 		if (val != &unloaded_mod)
378 			continue;
379 
380 		/* Release area if all tags are unused */
381 		start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
382 		end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
383 		if (clean_unused_counters(start_tag, end_tag))
384 			mas_erase(&mas);
385 	}
386 }
387 
388 /* Called with mod_area_mt locked */
389 static bool find_aligned_area(struct ma_state *mas, unsigned long section_size,
390 			      unsigned long size, unsigned int prepend, unsigned long align)
391 {
392 	bool cleanup_done = false;
393 
394 repeat:
395 	/* Try finding exact size and hope the start is aligned */
396 	if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) {
397 		if (IS_ALIGNED(mas->index + prepend, align))
398 			return true;
399 
400 		/* Try finding larger area to align later */
401 		mas_reset(mas);
402 		if (!mas_empty_area(mas, 0, section_size - 1,
403 				    size + prepend + align - 1))
404 			return true;
405 	}
406 
407 	/* No free area; try cleaning up stale data and repeat the search once */
408 	if (!cleanup_done) {
409 		clean_unused_module_areas_locked();
410 		cleanup_done = true;
411 		mas_reset(mas);
412 		goto repeat;
413 	}
414 
415 	return false;
416 }
417 
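/*
 * Allocate and map enough pages to back the module tag area up to the
 * current module_tags.size, extend the KASAN shadow for the newly mapped
 * range, and unpoison the whole range in use.
 */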
418 static int vm_module_tags_populate(void)
419 {
420 	unsigned long phys_end = ALIGN_DOWN(module_tags.start_addr, PAGE_SIZE) +
421 				 (vm_module_tags->nr_pages << PAGE_SHIFT);
422 	unsigned long new_end = module_tags.start_addr + module_tags.size;
423 
424 	if (phys_end < new_end) {
425 		struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages;
426 		unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
427 		unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
428 		unsigned long more_pages;
429 		unsigned long nr = 0;
430 
431 		more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
432 		while (nr < more_pages) {
433 			unsigned long allocated;
434 
435 			allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
436 				NUMA_NO_NODE, more_pages - nr, next_page + nr);
437 
438 			if (!allocated)
439 				break;
440 			nr += allocated;
441 		}
442 
443 		if (nr < more_pages ||
444 		    vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
445 				     next_page, PAGE_SHIFT) < 0) {
446 			release_pages_arg arg = { .pages = next_page };
447 
448 			/* Clean up and error out */
449 			release_pages(arg, nr);
450 			return -ENOMEM;
451 		}
452 
453 		vm_module_tags->nr_pages += nr;
454 
455 		/*
456 		 * KASAN allocates 1 byte of shadow for every 8 bytes of data,
457 		 * and kasan_alloc_module_shadow() allocates shadow memory in
458 		 * whole pages, so the range must be aligned to MODULE_ALIGN
459 		 * here for its shadow to be page-aligned.
460 		 */
461 		if (old_shadow_end < new_shadow_end)
462 			kasan_alloc_module_shadow((void *)old_shadow_end,
463 						  new_shadow_end - old_shadow_end,
464 						  GFP_KERNEL);
465 	}
466 
467 	/*
468 	 * Mark the pages as accessible, now that they are mapped.
469 	 * With hardware tag-based KASAN, marking is skipped for
470 	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
471 	 */
472 	kasan_unpoison_vmalloc((void *)module_tags.start_addr,
473 				new_end - module_tags.start_addr,
474 				KASAN_VMALLOC_PROT_NORMAL);
475 
476 	return 0;
477 }
478 
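/*
 * Reserve a range of the module tag area for @mod's alloc_tag section.
 * Occupied ranges are tracked in mod_area_mt; padding inserted to satisfy
 * @align is recorded as a prepend_mod entry. The populated backing pages are
 * grown when the reservation extends past the current end of the area.
 */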
479 static void *reserve_module_tags(struct module *mod, unsigned long size,
480 				 unsigned int prepend, unsigned long align)
481 {
482 	unsigned long section_size = module_tags.end_addr - module_tags.start_addr;
483 	MA_STATE(mas, &mod_area_mt, 0, section_size - 1);
484 	unsigned long offset;
485 	void *ret = NULL;
486 
487 	/* If the requested size cannot hold even one tag, return an error */
488 	if (size < sizeof(struct alloc_tag))
489 		return ERR_PTR(-EINVAL);
490 
491 	/*
492 	 * align is always a power of 2, so we can use IS_ALIGNED() and ALIGN().
493 	 * align of 0 or 1 means no alignment; to simplify, set it to 1.
494 	 */
495 	if (!align)
496 		align = 1;
497 
498 	if (!ensure_alignment(align, &prepend)) {
499 		shutdown_mem_profiling(true);
500 		pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n",
501 			mod->name, align);
502 		return ERR_PTR(-EINVAL);
503 	}
504 
505 	mas_lock(&mas);
506 	if (!find_aligned_area(&mas, section_size, size, prepend, align)) {
507 		ret = ERR_PTR(-ENOMEM);
508 		goto unlock;
509 	}
510 
511 	/* Mark found area as reserved */
512 	offset = mas.index;
513 	offset += prepend;
514 	offset = ALIGN(offset, align);
515 	if (offset != mas.index) {
516 		unsigned long pad_start = mas.index;
517 
518 		mas.last = offset - 1;
519 		mas_store(&mas, &prepend_mod);
520 		if (mas_is_err(&mas)) {
521 			ret = ERR_PTR(xa_err(mas.node));
522 			goto unlock;
523 		}
524 		mas.index = offset;
525 		mas.last = offset + size - 1;
526 		mas_store(&mas, mod);
527 		if (mas_is_err(&mas)) {
528 			mas.index = pad_start;
529 			mas_erase(&mas);
530 			ret = ERR_PTR(xa_err(mas.node));
531 		}
532 	} else {
533 		mas.last = offset + size - 1;
534 		mas_store(&mas, mod);
535 		if (mas_is_err(&mas))
536 			ret = ERR_PTR(xa_err(mas.node));
537 	}
538 unlock:
539 	mas_unlock(&mas);
540 
541 	if (IS_ERR(ret))
542 		return ret;
543 
544 	if (module_tags.size < offset + size) {
545 		int grow_res;
546 
547 		module_tags.size = offset + size;
548 		if (mem_alloc_profiling_enabled() && !tags_addressable()) {
549 			shutdown_mem_profiling(true);
550 			pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n",
551 				mod->name, NR_UNUSED_PAGEFLAG_BITS);
552 		}
553 
554 		grow_res = vm_module_tags_populate();
555 		if (grow_res) {
556 			shutdown_mem_profiling(true);
557 			pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n",
558 			       mod->name);
559 			return ERR_PTR(grow_res);
560 		}
561 	}
562 
563 	return (struct alloc_tag *)(module_tags.start_addr + offset);
564 }
565 
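/*
 * Called when @mod's tag section is being freed. If some counters still have
 * outstanding allocations, report them and keep the range, marked with
 * unloaded_mod, so the tags stay valid; otherwise release the range (and any
 * preceding prepend_mod padding) for reuse.
 */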
566 static void release_module_tags(struct module *mod, bool used)
567 {
568 	MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
569 	struct alloc_tag *start_tag;
570 	struct alloc_tag *end_tag;
571 	struct module *val;
572 
573 	mas_lock(&mas);
574 	mas_for_each_rev(&mas, val, 0)
575 		if (val == mod)
576 			break;
577 
578 	if (!val) /* module not found */
579 		goto out;
580 
581 	if (!used)
582 		goto release_area;
583 
584 	start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
585 	end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
586 	if (!clean_unused_counters(start_tag, end_tag)) {
587 		struct alloc_tag *tag;
588 
589 		for (tag = start_tag; tag <= end_tag; tag++) {
590 			struct alloc_tag_counters counter;
591 
592 			if (!tag->counters)
593 				continue;
594 
595 			counter = alloc_tag_read(tag);
596 			pr_info("%s:%u module %s func:%s has %llu bytes allocated at module unload\n",
597 				tag->ct.filename, tag->ct.lineno, tag->ct.modname,
598 				tag->ct.function, counter.bytes);
599 		}
600 	} else {
601 		used = false;
602 	}
603 release_area:
604 	mas_store(&mas, used ? &unloaded_mod : NULL);
605 	val = mas_prev_range(&mas, 0);
606 	if (val == &prepend_mod)
607 		mas_store(&mas, NULL);
608 out:
609 	mas_unlock(&mas);
610 }
611 
612 static int load_module(struct module *mod, struct codetag *start, struct codetag *stop)
613 {
614 	/* Allocate module alloc_tag percpu counters */
615 	struct alloc_tag *start_tag;
616 	struct alloc_tag *stop_tag;
617 	struct alloc_tag *tag;
618 
619 	/* percpu counters for core allocations are already statically allocated */
620 	if (!mod)
621 		return 0;
622 
623 	start_tag = ct_to_alloc_tag(start);
624 	stop_tag = ct_to_alloc_tag(stop);
625 	for (tag = start_tag; tag < stop_tag; tag++) {
626 		WARN_ON(tag->counters);
627 		tag->counters = alloc_percpu(struct alloc_tag_counters);
628 		if (!tag->counters) {
629 			while (--tag >= start_tag) {
630 				free_percpu(tag->counters);
631 				tag->counters = NULL;
632 			}
633 			pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n",
634 			       mod->name);
635 			return -ENOMEM;
636 		}
637 
638 		/*
639 		 * Avoid a kmemleak false positive. The pointer to the counters is stored
640 		 * in the alloc_tag section of the module and cannot be directly accessed.
641 		 */
642 		kmemleak_ignore_percpu(tag->counters);
643 	}
644 	return 0;
645 }
646 
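/*
 * Re-point the mod_area_mt entry for @mod at @new_mod when the struct module
 * backing an already-reserved tag range is replaced.
 */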
647 static void replace_module(struct module *mod, struct module *new_mod)
648 {
649 	MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
650 	struct module *val;
651 
652 	mas_lock(&mas);
653 	mas_for_each(&mas, val, module_tags.size) {
654 		if (val != mod)
655 			continue;
656 
657 		mas_store_gfp(&mas, new_mod, GFP_KERNEL);
658 		break;
659 	}
660 	mas_unlock(&mas);
661 }
662 
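/*
 * Reserve MODULE_ALLOC_TAG_VMAP_SIZE bytes of vmalloc address space for
 * module allocation tags. Backing pages are allocated lazily by
 * vm_module_tags_populate() as modules are loaded.
 */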
663 static int __init alloc_mod_tags_mem(void)
664 {
665 	/* Map space to copy allocation tags */
666 	vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE);
667 	if (!vm_module_tags) {
668 		pr_err("Failed to map %lu bytes for module allocation tags\n",
669 			MODULE_ALLOC_TAG_VMAP_SIZE);
670 		module_tags.start_addr = 0;
671 		return -ENOMEM;
672 	}
673 
674 	vm_module_tags->pages = kmalloc_objs(struct page *,
675 					     get_vm_area_size(vm_module_tags) >> PAGE_SHIFT,
676 					     GFP_KERNEL | __GFP_ZERO);
677 	if (!vm_module_tags->pages) {
678 		free_vm_area(vm_module_tags);
679 		return -ENOMEM;
680 	}
681 
682 	module_tags.start_addr = (unsigned long)vm_module_tags->addr;
683 	module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE;
684 	/* Ensure the base is alloc_tag aligned when required for indexing */
685 	module_tags.start_addr = alloc_tag_align(module_tags.start_addr);
686 
687 	return 0;
688 }
689 
690 static void __init free_mod_tags_mem(void)
691 {
692 	release_pages_arg arg = { .pages = vm_module_tags->pages };
693 
694 	module_tags.start_addr = 0;
695 	release_pages(arg, vm_module_tags->nr_pages);
696 	kfree(vm_module_tags->pages);
697 	free_vm_area(vm_module_tags);
698 }
699 
700 #else /* CONFIG_MODULES */
701 
702 static inline int alloc_mod_tags_mem(void) { return 0; }
703 static inline void free_mod_tags_mem(void) {}
704 
705 #endif /* CONFIG_MODULES */
706 
707 /* See: Documentation/mm/allocation-profiling.rst */
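/*
 * Accepted forms, derived from the parsing below:
 *   sysctl.vm.mem_profiling=never             - disable and withdraw support
 *   sysctl.vm.mem_profiling=<bool>            - keep support, start enabled/disabled
 *   sysctl.vm.mem_profiling=<bool>,compressed - additionally compress tag references
 */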
708 static int __init setup_early_mem_profiling(char *str)
709 {
710 	bool compressed = false;
711 	bool enable;
712 
713 	if (!str || !str[0])
714 		return -EINVAL;
715 
716 	if (!strncmp(str, "never", 5)) {
717 		enable = false;
718 		mem_profiling_support = false;
719 		pr_info("Memory allocation profiling is disabled!\n");
720 	} else {
721 		char *token = strsep(&str, ",");
722 
723 		if (kstrtobool(token, &enable))
724 			return -EINVAL;
725 
726 		if (str) {
727 
728 			if (strcmp(str, "compressed"))
729 				return -EINVAL;
730 
731 			compressed = true;
732 		}
733 		mem_profiling_support = true;
734 		pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n",
735 			compressed ? "with" : "without", str_on_off(enable));
736 	}
737 
738 	if (enable != mem_alloc_profiling_enabled()) {
739 		if (enable)
740 			static_branch_enable(&mem_alloc_profiling_key);
741 		else
742 			static_branch_disable(&mem_alloc_profiling_key);
743 	}
744 	if (compressed != static_key_enabled(&mem_profiling_compressed)) {
745 		if (compressed)
746 			static_branch_enable(&mem_profiling_compressed);
747 		else
748 			static_branch_disable(&mem_profiling_compressed);
749 	}
750 
751 	return 0;
752 }
753 early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling);
754 
755 static __init bool need_page_alloc_tagging(void)
756 {
757 	if (static_key_enabled(&mem_profiling_compressed))
758 		return false;
759 
760 	return mem_profiling_support;
761 }
762 
763 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
764 /*
765  * Track page allocations before page_ext is initialized.
766  * Some pages are allocated before page_ext becomes available, leaving
767  * their codetag uninitialized. Track these early PFNs so we can clear
768  * their codetag refs later to avoid warnings when they are freed.
769  *
770  * Early allocations include:
771  *   - Base allocations independent of CPU count
772  *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
773  *     such as trace ring buffers, scheduler per-cpu data)
774  *
775  * For simplicity, we fix the size to 8192.
776  * If insufficient, a warning will be triggered to alert the user.
777  *
778  * TODO: Replace fixed-size array with dynamic allocation using
779  * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
780  */
781 #define EARLY_ALLOC_PFN_MAX		8192
782 
783 static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
784 static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
785 
786 static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
787 {
788 	int old_idx, new_idx;
789 
790 	do {
791 		old_idx = atomic_read(&early_pfn_count);
792 		if (old_idx >= EARLY_ALLOC_PFN_MAX) {
793 			pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
794 				      EARLY_ALLOC_PFN_MAX);
795 			return;
796 		}
797 		new_idx = old_idx + 1;
798 	} while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
799 
800 	early_pfns[old_idx] = pfn;
801 }
802 
803 typedef void alloc_tag_add_func(unsigned long pfn);
804 static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
805 	RCU_INITIALIZER(__alloc_tag_add_early_pfn);
806 
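/*
 * Record a page allocated before page_ext is initialized. The call goes
 * through an RCU-protected function pointer so clear_early_alloc_pfn_tag_refs()
 * can detach and quiesce it once page_ext is ready.
 */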
807 void alloc_tag_add_early_pfn(unsigned long pfn)
808 {
809 	alloc_tag_add_func *alloc_tag_add;
810 
811 	if (static_key_enabled(&mem_profiling_compressed))
812 		return;
813 
814 	rcu_read_lock();
815 	alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
816 	if (alloc_tag_add)
817 		alloc_tag_add(pfn);
818 	rcu_read_unlock();
819 }
820 
821 static void __init clear_early_alloc_pfn_tag_refs(void)
822 {
823 	unsigned int i;
824 
825 	if (static_key_enabled(&mem_profiling_compressed))
826 		return;
827 
828 	rcu_assign_pointer(alloc_tag_add_early_pfn_ptr, NULL);
829 	/* Make sure we are not racing with __alloc_tag_add_early_pfn() */
830 	synchronize_rcu();
831 
832 	for (i = 0; i < atomic_read(&early_pfn_count); i++) {
833 		unsigned long pfn = early_pfns[i];
834 
835 		if (pfn_valid(pfn)) {
836 			struct page *page = pfn_to_page(pfn);
837 			union pgtag_ref_handle handle;
838 			union codetag_ref ref;
839 
840 			if (get_page_tag_ref(page, &ref, &handle)) {
841 				/*
842 				 * An early-allocated page could be freed and reallocated
843 				 * after its page_ext is initialized but before we clear it.
844 				 * In that case, it already has a valid tag set.
845 				 * We should not overwrite that valid tag with CODETAG_EMPTY.
846 				 *
847 				 * Note: there is still a small race window between checking
848 				 * ref.ct and calling set_codetag_empty(). We accept this
849 				 * race as it's unlikely and the extra complexity of atomic
850 				 * cmpxchg is not worth it for this debug-only code path.
851 				 */
852 				if (ref.ct) {
853 					put_page_tag_ref(handle);
854 					continue;
855 				}
856 
857 				set_codetag_empty(&ref);
858 				update_page_tag_ref(handle, &ref);
859 				put_page_tag_ref(handle);
860 			}
861 		}
862 
863 	}
864 }
865 #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
866 static inline void __init clear_early_alloc_pfn_tag_refs(void) {}
867 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
868 
869 static __init void init_page_alloc_tagging(void)
870 {
871 	clear_early_alloc_pfn_tag_refs();
872 }
873 
874 struct page_ext_operations page_alloc_tagging_ops = {
875 	.size = sizeof(union codetag_ref),
876 	.need = need_page_alloc_tagging,
877 	.init = init_page_alloc_tagging,
878 };
879 EXPORT_SYMBOL(page_alloc_tagging_ops);
880 
881 #ifdef CONFIG_SYSCTL
882 /*
883  * Not using proc_do_static_key() directly to prevent enabling profiling
884  * after it was shut down.
885  */
886 static int proc_mem_profiling_handler(const struct ctl_table *table, int write,
887 				      void *buffer, size_t *lenp, loff_t *ppos)
888 {
889 	if (write) {
890 		/*
891 		 * A write with no userspace mm comes from do_sysctl_args() and
892 		 * is a no-op, since the same value was already applied by
893 		 * setup_early_mem_profiling(). Return success to avoid a
894 		 * warning from do_sysctl_args().
894 		 */
895 		if (!current->mm)
896 			return 0;
897 
898 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
899 		/* User can't toggle profiling while debugging */
900 		return -EACCES;
901 #endif
902 		if (!mem_profiling_support)
903 			return -EINVAL;
904 	}
905 
906 	return proc_do_static_key(table, write, buffer, lenp, ppos);
907 }
908 
909 
910 static const struct ctl_table memory_allocation_profiling_sysctls[] = {
911 	{
912 		.procname	= "mem_profiling",
913 		.data		= &mem_alloc_profiling_key,
914 		.mode		= 0644,
915 		.proc_handler	= proc_mem_profiling_handler,
916 	},
917 };
918 
919 static void __init sysctl_init(void)
920 {
921 	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
922 }
923 #else /* CONFIG_SYSCTL */
924 static inline void sysctl_init(void) {}
925 #endif /* CONFIG_SYSCTL */
926 
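/*
 * Initcall: register the sysctl, create /proc/allocinfo, reserve address
 * space for module tags and register the alloc_tag codetag type. Any failure
 * shuts memory allocation profiling down.
 */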
927 static int __init alloc_tag_init(void)
928 {
929 	const struct codetag_type_desc desc = {
930 		.section		= ALLOC_TAG_SECTION_NAME,
931 		.tag_size		= sizeof(struct alloc_tag),
932 #ifdef CONFIG_MODULES
933 		.needs_section_mem	= needs_section_mem,
934 		.alloc_section_mem	= reserve_module_tags,
935 		.free_section_mem	= release_module_tags,
936 		.module_load		= load_module,
937 		.module_replaced	= replace_module,
938 #endif
939 	};
940 	int res;
941 
942 	sysctl_init();
943 
944 	if (!mem_profiling_support) {
945 		pr_info("Memory allocation profiling is not supported!\n");
946 		return 0;
947 	}
948 
949 	if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
950 				     sizeof(struct allocinfo_private), NULL)) {
951 		pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
952 		shutdown_mem_profiling(false);
953 		return -ENOMEM;
954 	}
955 
956 	res = alloc_mod_tags_mem();
957 	if (res) {
958 		pr_err("Failed to reserve address space for module tags, errno = %d\n", res);
959 		shutdown_mem_profiling(true);
960 		return res;
961 	}
962 
963 	alloc_tag_cttype = codetag_register_type(&desc);
964 	if (IS_ERR(alloc_tag_cttype)) {
965 		pr_err("Allocation tags registration failed, errno = %pe\n", alloc_tag_cttype);
966 		free_mod_tags_mem();
967 		shutdown_mem_profiling(true);
968 		return PTR_ERR(alloc_tag_cttype);
969 	}
970 
971 	return 0;
972 }
973 module_init(alloc_tag_init);
974