xref: /linux/kernel/liveupdate/kexec_handover.c (revision d9e4142e7635f6f7173854667c0695ce5b836bbc)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * kexec_handover.c - kexec handover metadata processing
4  * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
5  * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
6  * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
7  * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
8  * Copyright (C) 2026 Google LLC, Jason Miu <jasonmiu@google.com>
9  */
10 
11 #define pr_fmt(fmt) "KHO: " fmt
12 
13 #include <linux/cleanup.h>
14 #include <linux/cma.h>
15 #include <linux/kmemleak.h>
16 #include <linux/count_zeros.h>
17 #include <linux/kasan.h>
18 #include <linux/kexec.h>
19 #include <linux/kexec_handover.h>
20 #include <linux/kho_radix_tree.h>
21 #include <linux/kho/abi/kexec_handover.h>
22 #include <linux/libfdt.h>
23 #include <linux/list.h>
24 #include <linux/memblock.h>
25 #include <linux/page-isolation.h>
26 #include <linux/unaligned.h>
27 #include <linux/vmalloc.h>
28 
29 #include <asm/early_ioremap.h>
30 
31 /*
32  * KHO is tightly coupled with mm init and needs access to some of mm
33  * internal APIs.
34  */
35 #include "../../mm/internal.h"
36 #include "../kexec_internal.h"
37 #include "kexec_handover_internal.h"
38 
39 /* The magic token for preserved pages */
40 #define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
41 
/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	/* Raw value stored into / loaded from page->private. */
	unsigned long page_private;
	struct {
		/* Order of the preserved page/folio. */
		unsigned int order;
		/* Set to KHO_PAGE_MAGIC for preserved head pages. */
		unsigned int magic;
	};
};

/* The union must fit exactly into page->private. */
static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
55 
56 static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
57 
58 bool kho_is_enabled(void)
59 {
60 	return kho_enable;
61 }
62 EXPORT_SYMBOL_GPL(kho_is_enabled);
63 
64 static int __init kho_parse_enable(char *p)
65 {
66 	return kstrtobool(p, &kho_enable);
67 }
68 early_param("kho", kho_parse_enable);
69 
/* State for the outgoing (serialization) side of KHO. */
struct kho_out {
	/* Root KHO FDT handed to the next kernel. */
	void *fdt;
	struct mutex lock; /* protects KHO FDT */

	/* Tracks every page preserved across kexec. */
	struct kho_radix_tree radix_tree;
	/* Debugfs entries under /sys/kernel/debug/kho/out. */
	struct kho_debugfs dbg;
};

static struct kho_out kho_out = {
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.radix_tree = {
		.lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock),
	},
};
84 
85 /**
86  * kho_radix_encode_key - Encodes a physical address and order into a radix key.
87  * @phys: The physical address of the page.
88  * @order: The order of the page.
89  *
90  * This function combines a page's physical address and its order into a
91  * single unsigned long, which is used as a key for all radix tree
92  * operations.
93  *
94  * Return: The encoded unsigned long radix key.
95  */
96 static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
97 {
98 	/* Order bits part */
99 	unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order);
100 	/* Shifted physical address part */
101 	unsigned long l = phys >> (PAGE_SHIFT + order);
102 
103 	return h | l;
104 }
105 
/**
 * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
 * @key: The unsigned long key to decode.
 * @order: An output parameter, a pointer to an unsigned int where the decoded
 *         page order will be stored.
 *
 * This function reverses the encoding performed by kho_radix_encode_key(),
 * extracting the original physical address and page order from a given key.
 *
 * Return: The decoded physical address.
 */
static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order)
{
	/* The highest set bit is the order marker placed by the encoder. */
	unsigned int order_bit = fls64(key);
	phys_addr_t phys;

	/* order_bit is numbered starting at 1 from fls64 */
	*order = KHO_ORDER_0_LOG2 - order_bit + 1;
	/* The order is discarded by the shift */
	phys = key << (PAGE_SHIFT + *order);

	return phys;
}
129 
130 static unsigned long kho_radix_get_bitmap_index(unsigned long key)
131 {
132 	return key % (1 << KHO_BITMAP_SIZE_LOG2);
133 }
134 
135 static unsigned long kho_radix_get_table_index(unsigned long key,
136 					       unsigned int level)
137 {
138 	int s;
139 
140 	s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2;
141 	return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2);
142 }
143 
/**
 * kho_radix_add_page - Marks a page as preserved in the radix tree.
 * @tree: The KHO radix tree.
 * @pfn: The page frame number of the page to preserve.
 * @order: The order of the page.
 *
 * This function traverses the radix tree based on the key derived from @pfn
 * and @order. It sets the corresponding bit in the leaf bitmap to mark the
 * page for preservation. If intermediate nodes do not exist along the path,
 * they are allocated and added to the tree.
 *
 * Context: May sleep; takes @tree->lock.
 *
 * Return: 0 on success, or a negative error code on failure.
 */
int kho_radix_add_page(struct kho_radix_tree *tree,
		       unsigned long pfn, unsigned int order)
{
	/* Newly allocated nodes for error cleanup */
	struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
	struct kho_radix_node *anchor_node = NULL;
	struct kho_radix_node *node = tree->root;
	struct kho_radix_node *new_node;
	unsigned int i, idx, anchor_idx;
	struct kho_radix_leaf *leaf;
	int err = 0;

	if (WARN_ON_ONCE(!tree->root))
		return -EINVAL;

	might_sleep();

	guard(mutex)(&tree->lock);

	/* Go from high levels to low levels */
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		idx = kho_radix_get_table_index(key, i);

		/* Follow an already-populated slot. */
		if (node->table[idx]) {
			node = phys_to_virt(node->table[idx]);
			continue;
		}

		/* Next node is empty, create a new node for it */
		new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
		if (!new_node) {
			err = -ENOMEM;
			goto err_free_nodes;
		}

		/* The tree stores physical addresses of its nodes. */
		node->table[idx] = virt_to_phys(new_node);

		/*
		 * Capture the node where the new branch starts for cleanup
		 * if allocation fails.
		 */
		if (!anchor_node) {
			anchor_node = node;
			anchor_idx = idx;
		}
		intermediate_nodes[i] = new_node;

		node = new_node;
	}

	/* Handle the leaf level bitmap (level 0) */
	idx = kho_radix_get_bitmap_index(key);
	leaf = (struct kho_radix_leaf *)node;
	__set_bit(idx, leaf->bitmap);

	return 0;

err_free_nodes:
	/* Free only the nodes allocated by this call... */
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		if (intermediate_nodes[i])
			free_page((unsigned long)intermediate_nodes[i]);
	}
	/* ...and detach the new branch from the pre-existing tree. */
	if (anchor_node)
		anchor_node->table[anchor_idx] = 0;

	return err;
}
EXPORT_SYMBOL_GPL(kho_radix_add_page);
226 
/**
 * kho_radix_del_page - Removes a page's preservation status from the radix tree.
 * @tree: The KHO radix tree.
 * @pfn: The page frame number of the page to unpreserve.
 * @order: The order of the page.
 *
 * This function traverses the radix tree and clears the bit corresponding to
 * the page, effectively removing its "preserved" status. It does not free
 * the tree's intermediate nodes, even if they become empty.
 *
 * Context: May sleep; takes @tree->lock.
 */
void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
			unsigned int order)
{
	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
	struct kho_radix_node *node = tree->root;
	struct kho_radix_leaf *leaf;
	unsigned int i, idx;

	if (WARN_ON_ONCE(!tree->root))
		return;

	might_sleep();

	guard(mutex)(&tree->lock);

	/* Go from high levels to low levels */
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		idx = kho_radix_get_table_index(key, i);

		/*
		 * Attempting to delete a page that has not been preserved,
		 * return with a warning.
		 */
		if (WARN_ON(!node->table[idx]))
			return;

		node = phys_to_virt(node->table[idx]);
	}

	/* Handle the leaf level bitmap (level 0) */
	leaf = (struct kho_radix_leaf *)node;
	idx = kho_radix_get_bitmap_index(key);
	__clear_bit(idx, leaf->bitmap);
}
EXPORT_SYMBOL_GPL(kho_radix_del_page);
272 
273 static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
274 			       unsigned long key,
275 			       kho_radix_tree_walk_callback_t cb)
276 {
277 	unsigned long *bitmap = (unsigned long *)leaf;
278 	unsigned int order;
279 	phys_addr_t phys;
280 	unsigned int i;
281 	int err;
282 
283 	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
284 		phys = kho_radix_decode_key(key | i, &order);
285 		err = cb(phys, order);
286 		if (err)
287 			return err;
288 	}
289 
290 	return 0;
291 }
292 
/*
 * Recursively walk one table node at @level, accumulating the key bits
 * contributed by each populated slot into @start, and descend until the
 * level-1 tables whose entries point at level-0 leaf bitmaps.
 * Returns 0, or the first non-zero value returned by @cb.
 */
static int __kho_radix_walk_tree(struct kho_radix_node *root,
				 unsigned int level, unsigned long start,
				 kho_radix_tree_walk_callback_t cb)
{
	struct kho_radix_node *node;
	struct kho_radix_leaf *leaf;
	unsigned long key, i;
	unsigned int shift;
	int err;

	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
		if (!root->table[i])
			continue;

		/* Slot i provides this level's KHO_TABLE_SIZE_LOG2 key bits. */
		shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) +
			KHO_BITMAP_SIZE_LOG2;
		key = start | (i << shift);

		node = phys_to_virt(root->table[i]);

		if (level == 1) {
			/*
			 * we are at level 1,
			 * node is pointing to the level 0 bitmap.
			 */
			leaf = (struct kho_radix_leaf *)node;
			err = kho_radix_walk_leaf(leaf, key, cb);
		} else {
			err  = __kho_radix_walk_tree(node, level - 1,
						     key, cb);
		}

		if (err)
			return err;
	}

	return 0;
}
331 
332 /**
333  * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
334  * @tree: A pointer to the KHO radix tree to walk.
335  * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
336  *      invoked for each preserved page found in the tree. The callback receives
337  *      the physical address and order of the preserved page.
338  *
339  * This function walks the radix tree, searching from the specified top level
340  * down to the lowest level (level 0). For each preserved page found, it invokes
341  * the provided callback, passing the page's physical address and order.
342  *
343  * Return: 0 if the walk completed the specified tree, or the non-zero return
344  *         value from the callback that stopped the walk.
345  */
346 int kho_radix_walk_tree(struct kho_radix_tree *tree,
347 			kho_radix_tree_walk_callback_t cb)
348 {
349 	if (WARN_ON_ONCE(!tree->root))
350 		return -EINVAL;
351 
352 	guard(mutex)(&tree->lock);
353 
354 	return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb);
355 }
356 EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
357 
/*
 * Unpreserve the PFN range [pfn, end_pfn) by splitting it into the
 * largest naturally-aligned power-of-two chunks that fit, mirroring the
 * chunking used when the range was preserved.
 */
static void __kho_unpreserve(struct kho_radix_tree *tree,
			     unsigned long pfn, unsigned long end_pfn)
{
	while (pfn < end_pfn) {
		unsigned int order = min(count_trailing_zeros(pfn),
					 ilog2(end_pfn - pfn));

		kho_radix_del_page(tree, pfn, order);
		pfn += 1UL << order;
	}
}
371 
372 /* For physically contiguous 0-order pages. */
373 static void kho_init_pages(struct page *page, unsigned long nr_pages)
374 {
375 	for (unsigned long i = 0; i < nr_pages; i++) {
376 		set_page_count(page + i, 1);
377 		/* Clear each page's codetag to avoid accounting mismatch. */
378 		clear_page_tag_ref(page + i);
379 	}
380 }
381 
382 static void kho_init_folio(struct page *page, unsigned int order)
383 {
384 	unsigned long nr_pages = (1 << order);
385 
386 	/* Head page gets refcount of 1. */
387 	set_page_count(page, 1);
388 	/* Clear head page's codetag to avoid accounting mismatch. */
389 	clear_page_tag_ref(page);
390 
391 	/* For higher order folios, tail pages get a page count of zero. */
392 	for (unsigned long i = 1; i < nr_pages; i++)
393 		set_page_count(page + i, 0);
394 
395 	if (order > 0)
396 		prep_compound_page(page, order);
397 }
398 
/*
 * Validate and re-initialize one preserved page (or compound range).
 * @phys must be the head of a preservation; @is_folio selects folio vs
 * plain-page initialization. Returns the head page, or NULL when the page
 * is offline or was never preserved (magic check fails).
 */
static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long nr_pages;
	union kho_page_info info;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * deserialize_bitmap() only sets the magic on the head page. This magic
	 * check also implicitly makes sure phys is order-aligned since for
	 * non-order-aligned phys addresses, magic will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;

	if (is_folio)
		kho_init_folio(page, info.order);
	else
		kho_init_pages(page, nr_pages);

	/* The pages re-enter the managed pool of their zone. */
	adjust_managed_page_count(page, nr_pages);
	return page;
}
429 
430 /**
431  * kho_restore_folio - recreates the folio from the preserved memory.
432  * @phys: physical address of the folio.
433  *
434  * Return: pointer to the struct folio on success, NULL on failure.
435  */
436 struct folio *kho_restore_folio(phys_addr_t phys)
437 {
438 	struct page *page = kho_restore_page(phys, true);
439 
440 	return page ? page_folio(page) : NULL;
441 }
442 EXPORT_SYMBOL_GPL(kho_restore_folio);
443 
444 /**
445  * kho_restore_pages - restore list of contiguous order 0 pages.
446  * @phys: physical address of the first page.
447  * @nr_pages: number of pages.
448  *
449  * Restore a contiguous list of order 0 pages that was preserved with
450  * kho_preserve_pages().
451  *
452  * Return: the first page on success, NULL on failure.
453  */
454 struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages)
455 {
456 	const unsigned long start_pfn = PHYS_PFN(phys);
457 	const unsigned long end_pfn = start_pfn + nr_pages;
458 	unsigned long pfn = start_pfn;
459 
460 	while (pfn < end_pfn) {
461 		const unsigned int order =
462 			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
463 		struct page *page = kho_restore_page(PFN_PHYS(pfn), false);
464 
465 		if (!page)
466 			return NULL;
467 		pfn += 1 << order;
468 	}
469 
470 	return pfn_to_page(start_pfn);
471 }
472 EXPORT_SYMBOL_GPL(kho_restore_pages);
473 
474 static int __init kho_preserved_memory_reserve(phys_addr_t phys,
475 					       unsigned int order)
476 {
477 	union kho_page_info info;
478 	struct page *page;
479 	u64 sz;
480 
481 	sz = 1 << (order + PAGE_SHIFT);
482 	page = phys_to_page(phys);
483 
484 	/* Reserve the memory preserved in KHO in memblock */
485 	memblock_reserve(phys, sz);
486 	memblock_reserved_mark_noinit(phys, sz);
487 	info.magic = KHO_PAGE_MAGIC;
488 	info.order = order;
489 	page->private = info.page_private;
490 
491 	return 0;
492 }
493 
494 /* Returns physical address of the preserved memory map from FDT */
495 static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
496 {
497 	const void *mem_ptr;
498 	int len;
499 
500 	mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
501 	if (!mem_ptr || len != sizeof(u64)) {
502 		pr_err("failed to get preserved memory map\n");
503 		return 0;
504 	}
505 
506 	return get_unaligned((const u64 *)mem_ptr);
507 }
508 
509 /*
510  * With KHO enabled, memory can become fragmented because KHO regions may
511  * be anywhere in physical address space. The scratch regions give us a
512  * safe zones that we will never see KHO allocations from. This is where we
513  * can later safely load our new kexec images into and then use the scratch
514  * area for early allocations that happen before page allocator is
515  * initialized.
516  */
517 struct kho_scratch *kho_scratch;
518 unsigned int kho_scratch_cnt;
519 
520 /*
521  * The scratch areas are scaled by default as percent of memory allocated from
522  * memblock. A user can override the scale with command line parameter:
523  *
524  * kho_scratch=N%
525  *
526  * It is also possible to explicitly define size for a lowmem, a global and
527  * per-node scratch areas:
528  *
529  * kho_scratch=l[KMG],n[KMG],m[KMG]
530  *
531  * The explicit size definition takes precedence over scale definition.
532  */
533 static unsigned int scratch_scale __initdata = 200;
534 static phys_addr_t scratch_size_global __initdata;
535 static phys_addr_t scratch_size_pernode __initdata;
536 static phys_addr_t scratch_size_lowmem __initdata;
537 
/*
 * Parse the "kho_scratch=" parameter: either "N%" (scale of reserved
 * memory) or explicit "lowmem,global,pernode" sizes in memparse() syntax.
 * Explicit sizes clear scratch_scale so they take precedence.
 */
static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		/* At most 10 digits plus the trailing '%'. */
		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		/* Copy the digits only; s_scale stays NUL-terminated. */
		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		/* memparse() consumed nothing: malformed number. */
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	/* Explicit sizes override percentage scaling. */
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);
605 
/*
 * When percentage scaling is in effect, derive the lowmem and global
 * scratch sizes from the amount of memory memblock has reserved for the
 * kernel. No-op when explicit sizes were given on the command line.
 */
static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	/* Lowmem scratch scales with reservations below the lowmem limit. */
	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	/* Global scratch covers the remainder, minus the lowmem share. */
	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}
623 
624 static phys_addr_t __init scratch_size_node(int nid)
625 {
626 	phys_addr_t size;
627 
628 	if (scratch_scale) {
629 		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
630 						   nid);
631 		size = size * scratch_scale / 100;
632 	} else {
633 		size = scratch_size_pernode;
634 	}
635 
636 	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
637 }
638 
/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 *
 * Reserves one lowmem area, one global area, and one area per N_MEMORY
 * node. On any failure all partial reservations are released and KHO is
 * disabled.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	/* One slot per memory node plus the lowmem and global areas. */
	kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch) {
		pr_err("Failed to reserve scratch array\n");
		goto err_disable_kho;
	}

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr) {
		pr_err("Failed to reserve lowmem scratch buffer\n");
		goto err_free_scratch_desc;
	}

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr) {
		pr_err("Failed to reserve global scratch buffer\n");
		goto err_free_scratch_areas;
	}

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/*
	 * Loop over nodes that have both memory and are online. Skip
	 * memoryless nodes, as we can not allocate scratch areas there.
	 */
	for_each_node_state(nid, N_MEMORY) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr) {
			pr_err("Failed to reserve nid %d scratch buffer\n", nid);
			goto err_free_scratch_areas;
		}

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	/* Release every area reserved so far, newest first. */
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}
725 
/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 * @size: size of the blob in bytes.
 *
 * Creates a new child node named @name in KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with
 * CONFIG_KEXEC_HANDOVER_DEBUGFS
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(const char *name, void *fdt, size_t size)
{
	phys_addr_t phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int err = -ENOMEM;
	int off, fdt_err;

	guard(mutex)(&kho_out.lock);

	/* Re-open the packed FDT with one page of headroom for the new node. */
	fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (fdt_err < 0)
		return err;

	off = fdt_add_subnode(root_fdt, 0, name);
	if (off < 0) {
		/* Map a duplicate name to -EEXIST; other failures stay -ENOMEM. */
		if (off == -FDT_ERR_EXISTS)
			err = -EEXIST;
		goto out_pack;
	}

	err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
			  &phys, sizeof(phys));
	if (err < 0)
		goto out_pack;

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, size, false));

out_pack:
	/* Always re-pack, on success and failure alike. */
	fdt_pack(root_fdt);

	return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
775 
776 void kho_remove_subtree(void *fdt)
777 {
778 	phys_addr_t target_phys = virt_to_phys(fdt);
779 	void *root_fdt = kho_out.fdt;
780 	int off;
781 	int err;
782 
783 	guard(mutex)(&kho_out.lock);
784 
785 	err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
786 	if (err < 0)
787 		return;
788 
789 	for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
790 	     off = fdt_next_subnode(root_fdt, off)) {
791 		const u64 *val;
792 		int len;
793 
794 		val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
795 		if (!val || len != sizeof(phys_addr_t))
796 			continue;
797 
798 		if ((phys_addr_t)*val == target_phys) {
799 			fdt_del_node(root_fdt, off);
800 			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
801 			break;
802 		}
803 	}
804 
805 	fdt_pack(root_fdt);
806 }
807 EXPORT_SYMBOL_GPL(kho_remove_subtree);
808 
809 /**
810  * kho_preserve_folio - preserve a folio across kexec.
811  * @folio: folio to preserve.
812  *
813  * Instructs KHO to preserve the whole folio across kexec. The order
814  * will be preserved as well.
815  *
816  * Return: 0 on success, error code on failure
817  */
818 int kho_preserve_folio(struct folio *folio)
819 {
820 	struct kho_radix_tree *tree = &kho_out.radix_tree;
821 	const unsigned long pfn = folio_pfn(folio);
822 	const unsigned int order = folio_order(folio);
823 
824 	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
825 		return -EINVAL;
826 
827 	return kho_radix_add_page(tree, pfn, order);
828 }
829 EXPORT_SYMBOL_GPL(kho_preserve_folio);
830 
831 /**
832  * kho_unpreserve_folio - unpreserve a folio.
833  * @folio: folio to unpreserve.
834  *
835  * Instructs KHO to unpreserve a folio that was preserved by
836  * kho_preserve_folio() before. The provided @folio (pfn and order)
837  * must exactly match a previously preserved folio.
838  */
839 void kho_unpreserve_folio(struct folio *folio)
840 {
841 	struct kho_radix_tree *tree = &kho_out.radix_tree;
842 	const unsigned long pfn = folio_pfn(folio);
843 	const unsigned int order = folio_order(folio);
844 
845 	kho_radix_del_page(tree, pfn, order);
846 }
847 EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
848 
/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the list.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous list of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * On partial failure, everything preserved by this call so far is rolled
 * back before returning.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned long nr_pages)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	/* Scratch memory must never be preserved across kexec. */
	if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
					nr_pages << PAGE_SHIFT))) {
		return -EINVAL;
	}

	while (pfn < end_pfn) {
		/* Largest naturally-aligned chunk that fits in the range. */
		unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		/*
		 * Make sure all the pages in a single preservation are in the
		 * same NUMA node. The restore machinery can not cope with a
		 * preservation spanning multiple NUMA nodes.
		 */
		while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1))
			order--;

		err = kho_radix_add_page(tree, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	/* Roll back every chunk preserved before the failure point. */
	if (err)
		__kho_unpreserve(tree, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
900 
901 /**
902  * kho_unpreserve_pages - unpreserve contiguous pages.
903  * @page: first page in the list.
904  * @nr_pages: number of pages.
905  *
906  * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
907  * This must be called with the same @page and @nr_pages as the corresponding
908  * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
909  * preserved blocks is not supported.
910  */
911 void kho_unpreserve_pages(struct page *page, unsigned long nr_pages)
912 {
913 	struct kho_radix_tree *tree = &kho_out.radix_tree;
914 	const unsigned long start_pfn = page_to_pfn(page);
915 	const unsigned long end_pfn = start_pfn + nr_pages;
916 
917 	__kho_unpreserve(tree, start_pfn, end_pfn);
918 }
919 EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
920 
921 /* vmalloc flags KHO supports */
922 #define KHO_VMALLOC_SUPPORTED_FLAGS	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)
923 
924 /* KHO internal flags for vmalloc preservations */
925 #define KHO_VMALLOC_ALLOC	0x0001
926 #define KHO_VMALLOC_HUGE_VMAP	0x0002
927 
928 static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
929 {
930 	unsigned short kho_flags = 0;
931 
932 	if (vm_flags & VM_ALLOC)
933 		kho_flags |= KHO_VMALLOC_ALLOC;
934 	if (vm_flags & VM_ALLOW_HUGE_VMAP)
935 		kho_flags |= KHO_VMALLOC_HUGE_VMAP;
936 
937 	return kho_flags;
938 }
939 
940 static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
941 {
942 	unsigned int vm_flags = 0;
943 
944 	if (kho_flags & KHO_VMALLOC_ALLOC)
945 		vm_flags |= VM_ALLOC;
946 	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
947 		vm_flags |= VM_ALLOW_HUGE_VMAP;
948 
949 	return vm_flags;
950 }
951 
952 static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
953 {
954 	struct kho_vmalloc_chunk *chunk;
955 	int err;
956 
957 	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
958 	if (!chunk)
959 		return NULL;
960 
961 	err = kho_preserve_pages(virt_to_page(chunk), 1);
962 	if (err)
963 		goto err_free;
964 	if (cur)
965 		KHOSER_STORE_PTR(cur->hdr.next, chunk);
966 	return chunk;
967 
968 err_free:
969 	free_page((unsigned long)chunk);
970 	return NULL;
971 }
972 
973 static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
974 					 unsigned short order)
975 {
976 	struct kho_radix_tree *tree = &kho_out.radix_tree;
977 	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
978 
979 	__kho_unpreserve(tree, pfn, pfn + 1);
980 
981 	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
982 		pfn = PHYS_PFN(chunk->phys[i]);
983 		__kho_unpreserve(tree, pfn, pfn + (1 << order));
984 	}
985 }
986 
/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
 * restored on the same node
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	/* Refuse areas with flags KHO cannot faithfully recreate. */
	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	/* Head of the chunk chain that records the backing pages. */
	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	/* Pages of a huge-mapped area come in groups of 2^order. */
	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		/* Current chunk is full, chain a fresh one. */
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk) {
				err = -ENOMEM;
				goto err_free;
			}
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	/* Unwinds the whole chain recorded in @preservation so far. */
	kho_unpreserve_vmalloc(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
1054 
1055 /**
1056  * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
1057  * @preservation: preservation metadata returned by kho_preserve_vmalloc()
1058  *
1059  * Instructs KHO to unpreserve the area in vmalloc address space that was
1060  * previously preserved with kho_preserve_vmalloc().
1061  */
1062 void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
1063 {
1064 	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
1065 
1066 	while (chunk) {
1067 		struct kho_vmalloc_chunk *tmp = chunk;
1068 
1069 		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
1070 
1071 		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
1072 		free_page((unsigned long)tmp);
1073 	}
1074 }
1075 EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
1076 
1077 /**
1078  * kho_restore_vmalloc - recreates and populates an area in vmalloc address
1079  * space from the preserved memory.
1080  * @preservation: preservation metadata.
1081  *
1082  * Recreates an area in vmalloc address space and populates it with memory that
1083  * was preserved using kho_preserve_vmalloc().
1084  *
1085  * Return: pointer to the area in the vmalloc address space, NULL on failure.
1086  */
1087 void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
1088 {
1089 	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
1090 	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_PROT_NORMAL;
1091 	unsigned int align, order, shift, vm_flags;
1092 	unsigned long total_pages, contig_pages;
1093 	unsigned long addr, size;
1094 	struct vm_struct *area;
1095 	struct page **pages;
1096 	unsigned int idx = 0;
1097 	int err;
1098 
1099 	vm_flags = kho_flags_to_vmalloc(preservation->flags);
1100 	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
1101 		return NULL;
1102 
1103 	total_pages = preservation->total_pages;
1104 	pages = kvmalloc_objs(*pages, total_pages);
1105 	if (!pages)
1106 		return NULL;
1107 	order = preservation->order;
1108 	contig_pages = (1 << order);
1109 	shift = PAGE_SHIFT + order;
1110 	align = 1 << shift;
1111 
1112 	while (chunk) {
1113 		struct page *page;
1114 
1115 		for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
1116 			phys_addr_t phys = chunk->phys[i];
1117 
1118 			if (idx + contig_pages > total_pages)
1119 				goto err_free_pages_array;
1120 
1121 			page = kho_restore_pages(phys, contig_pages);
1122 			if (!page)
1123 				goto err_free_pages_array;
1124 
1125 			for (int j = 0; j < contig_pages; j++)
1126 				pages[idx++] = page + j;
1127 
1128 			phys += contig_pages * PAGE_SIZE;
1129 		}
1130 
1131 		page = kho_restore_pages(virt_to_phys(chunk), 1);
1132 		if (!page)
1133 			goto err_free_pages_array;
1134 		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
1135 		__free_page(page);
1136 	}
1137 
1138 	if (idx != total_pages)
1139 		goto err_free_pages_array;
1140 
1141 	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
1142 				  vm_flags | VM_UNINITIALIZED,
1143 				  VMALLOC_START, VMALLOC_END,
1144 				  NUMA_NO_NODE, GFP_KERNEL,
1145 				  __builtin_return_address(0));
1146 	if (!area)
1147 		goto err_free_pages_array;
1148 
1149 	addr = (unsigned long)area->addr;
1150 	size = get_vm_area_size(area);
1151 	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
1152 	if (err)
1153 		goto err_free_vm_area;
1154 
1155 	area->nr_pages = total_pages;
1156 	area->pages = pages;
1157 
1158 	if (vm_flags & VM_ALLOC)
1159 		kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
1160 
1161 	area->addr = kasan_unpoison_vmalloc(area->addr, total_pages * PAGE_SIZE,
1162 					    kasan_flags);
1163 	clear_vm_uninitialized_flag(area);
1164 
1165 	return area->addr;
1166 
1167 err_free_vm_area:
1168 	free_vm_area(area);
1169 err_free_pages_array:
1170 	kvfree(pages);
1171 	return NULL;
1172 }
1173 EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
1174 
1175 /**
1176  * kho_alloc_preserve - Allocate, zero, and preserve memory.
1177  * @size: The number of bytes to allocate.
1178  *
1179  * Allocates a physically contiguous block of zeroed pages that is large
1180  * enough to hold @size bytes. The allocated memory is then registered with
1181  * KHO for preservation across a kexec.
1182  *
1183  * Note: The actual allocated size will be rounded up to the nearest
1184  * power-of-two page boundary.
1185  *
1186  * @return A virtual pointer to the allocated and preserved memory on success,
1187  * or an ERR_PTR() encoded error on failure.
1188  */
1189 void *kho_alloc_preserve(size_t size)
1190 {
1191 	struct folio *folio;
1192 	int order, ret;
1193 
1194 	if (!size)
1195 		return ERR_PTR(-EINVAL);
1196 
1197 	order = get_order(size);
1198 	if (order > MAX_PAGE_ORDER)
1199 		return ERR_PTR(-E2BIG);
1200 
1201 	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
1202 	if (!folio)
1203 		return ERR_PTR(-ENOMEM);
1204 
1205 	ret = kho_preserve_folio(folio);
1206 	if (ret) {
1207 		folio_put(folio);
1208 		return ERR_PTR(ret);
1209 	}
1210 
1211 	return folio_address(folio);
1212 }
1213 EXPORT_SYMBOL_GPL(kho_alloc_preserve);
1214 
/**
 * kho_unpreserve_free - Unpreserve and free memory.
 * @mem:  Pointer to the memory allocated by kho_alloc_preserve().
 *
 * Drops the KHO preservation of @mem and releases the underlying pages
 * back to the page allocator. Counterpart of kho_alloc_preserve().
 * A NULL @mem is a no-op.
 */
void kho_unpreserve_free(void *mem)
{
	if (mem) {
		struct folio *folio = virt_to_folio(mem);

		kho_unpreserve_folio(folio);
		folio_put(folio);
	}
}
EXPORT_SYMBOL_GPL(kho_unpreserve_free);
1235 
/**
 * kho_restore_free - Restore and free memory after kexec.
 * @mem:  Pointer to the memory (in the new kernel's address space)
 * that was allocated by the old kernel.
 *
 * Intended for the new (post-kexec) kernel: takes ownership of a region the
 * previous kernel preserved with kho_alloc_preserve(), restoring the folio
 * from its physical address and handing the pages back to the page
 * allocator. A NULL @mem is a no-op.
 */
void kho_restore_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = kho_restore_folio(__pa(mem));
	/* A missing folio means the preservation record was bad */
	if (WARN_ON(!folio))
		return;

	folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_restore_free);
1260 
/*
 * State received from the previous kernel, filled in by kho_populate()
 * during early boot.
 */
struct kho_in {
	phys_addr_t fdt_phys;		/* incoming KHO FDT; 0 when not booted via KHO */
	phys_addr_t scratch_phys;	/* array of scratch areas passed by the old kernel */
	struct kho_debugfs dbg;		/* debugfs state for the incoming tree */
};

static struct kho_in kho_in = {
};
1269 
1270 static const void *kho_get_fdt(void)
1271 {
1272 	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
1273 }
1274 
1275 /**
1276  * is_kho_boot - check if current kernel was booted via KHO-enabled
1277  * kexec
1278  *
1279  * This function checks if the current kernel was loaded through a kexec
1280  * operation with KHO enabled, by verifying that a valid KHO FDT
1281  * was passed.
1282  *
1283  * Note: This function returns reliable results only after
1284  * kho_populate() has been called during early boot. Before that,
1285  * it may return false even if KHO data is present.
1286  *
1287  * Return: true if booted via KHO-enabled kexec, false otherwise
1288  */
1289 bool is_kho_boot(void)
1290 {
1291 	return !!kho_get_fdt();
1292 }
1293 EXPORT_SYMBOL_GPL(is_kho_boot);
1294 
1295 /**
1296  * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
1297  * @name: the name of the sub FDT passed to kho_add_subtree().
1298  * @phys: if found, the physical address of the sub FDT is stored in @phys.
1299  *
1300  * Retrieve a preserved sub FDT named @name and store its physical
1301  * address in @phys.
1302  *
1303  * Return: 0 on success, error code on failure
1304  */
1305 int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
1306 {
1307 	const void *fdt = kho_get_fdt();
1308 	const u64 *val;
1309 	int offset, len;
1310 
1311 	if (!fdt)
1312 		return -ENOENT;
1313 
1314 	if (!phys)
1315 		return -EINVAL;
1316 
1317 	offset = fdt_subnode_offset(fdt, 0, name);
1318 	if (offset < 0)
1319 		return -ENOENT;
1320 
1321 	val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len);
1322 	if (!val || len != sizeof(*val))
1323 		return -EINVAL;
1324 
1325 	*phys = (phys_addr_t)*val;
1326 
1327 	return 0;
1328 }
1329 EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
1330 
1331 static int __init kho_mem_retrieve(const void *fdt)
1332 {
1333 	struct kho_radix_tree tree;
1334 	const phys_addr_t *mem;
1335 	int len;
1336 
1337 	/* Retrieve the KHO radix tree from passed-in FDT. */
1338 	mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
1339 
1340 	if (!mem || len != sizeof(*mem)) {
1341 		pr_err("failed to get preserved KHO memory tree\n");
1342 		return -ENOENT;
1343 	}
1344 
1345 	if (!*mem)
1346 		return -EINVAL;
1347 
1348 	tree.root = phys_to_virt(*mem);
1349 	mutex_init(&tree.lock);
1350 	return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve);
1351 }
1352 
1353 static __init int kho_out_fdt_setup(void)
1354 {
1355 	struct kho_radix_tree *tree = &kho_out.radix_tree;
1356 	void *root = kho_out.fdt;
1357 	u64 preserved_mem_tree_pa;
1358 	int err;
1359 
1360 	err = fdt_create(root, PAGE_SIZE);
1361 	err |= fdt_finish_reservemap(root);
1362 	err |= fdt_begin_node(root, "");
1363 	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
1364 
1365 	preserved_mem_tree_pa = virt_to_phys(tree->root);
1366 
1367 	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
1368 			    &preserved_mem_tree_pa,
1369 			    sizeof(preserved_mem_tree_pa));
1370 
1371 	err |= fdt_end_node(root);
1372 	err |= fdt_finish(root);
1373 
1374 	return err;
1375 }
1376 
/*
 * Late (fs_initcall) KHO setup: allocate the outgoing radix tree and FDT,
 * set up debugfs, and either hook up the incoming tree (when booted via
 * KHO) or release the reserved scratch areas to the page allocator.
 * On any failure KHO is disabled and the scratch reservation is returned
 * to the system.
 */
static __init int kho_init(void)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const void *fdt = kho_get_fdt();
	int err = 0;

	if (!kho_enable)
		return 0;

	/* Root node of the radix tree tracking this kernel's preservations */
	tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!tree->root) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	/* The outgoing FDT itself must survive the kexec, so preserve it */
	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
	if (IS_ERR(kho_out.fdt)) {
		err = PTR_ERR(kho_out.fdt);
		goto err_free_kho_radix_tree_root;
	}

	err = kho_debugfs_init();
	if (err)
		goto err_free_fdt;

	err = kho_out_debugfs_init(&kho_out.dbg);
	if (err)
		goto err_free_fdt;

	err = kho_out_fdt_setup();
	if (err)
		goto err_free_fdt;

	/*
	 * Booted via KHO: expose the incoming FDT in debugfs and return
	 * before touching the scratch areas below (the incoming scratch was
	 * already handled in kho_memory_init()).
	 */
	if (fdt) {
		kho_in_debugfs_init(&kho_in.dbg, fdt);
		return 0;
	}

	/* Cold boot: hand the reserved scratch areas to the page allocator */
	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		/*
		 * When debug_pagealloc is enabled, __free_pages() clears the
		 * corresponding PRESENT bit in the kernel page table.
		 * Subsequent kmemleak scans of these pages cause the
		 * non-PRESENT page faults.
		 * Mark scratch areas with kmemleak_ignore_phys() to exclude
		 * them from kmemleak scanning.
		 */
		kmemleak_ignore_phys(kho_scratch[i].addr);
		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
					 kho_out.fdt,
					 fdt_totalsize(kho_out.fdt), true));

	return 0;

err_free_fdt:
	kho_unpreserve_free(kho_out.fdt);
err_free_kho_radix_tree_root:
	kfree(tree->root);
	tree->root = NULL;
err_free_scratch:
	/* KHO is unusable: return the scratch memory to the system */
	kho_out.fdt = NULL;
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
fs_initcall(kho_init);
1457 
/*
 * Make the scratch areas received from the previous kernel usable by this
 * kernel while keeping them reclaimable as scratch for the next kexec.
 */
static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		/* Pageblock-align the range; migratetype is per pageblock */
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}
1481 
1482 void __init kho_memory_init(void)
1483 {
1484 	if (kho_in.scratch_phys) {
1485 		kho_scratch = phys_to_virt(kho_in.scratch_phys);
1486 		kho_release_scratch();
1487 
1488 		if (kho_mem_retrieve(kho_get_fdt()))
1489 			kho_in.fdt_phys = 0;
1490 	} else {
1491 		kho_reserve_scratch();
1492 	}
1493 }
1494 
/*
 * Validate and register the handover data passed by the previous kernel:
 * check the incoming FDT, add the scratch areas to memblock and mark them,
 * then restrict memblock to scratch-only allocations until the preserved
 * memory has been ingested. On any validation failure KHO revival is
 * disabled and boot continues normally.
 */
void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
	struct kho_scratch *scratch = NULL;
	phys_addr_t mem_map_phys;
	void *fdt = NULL;
	bool populated = false;
	int err;

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		goto report;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		goto unmap_fdt;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		goto unmap_fdt;
	}

	/* Reject handover data that carries no preserved-memory map */
	mem_map_phys = kho_get_mem_map_phys(fdt);
	if (!mem_map_phys)
		goto unmap_fdt;

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		goto unmap_fdt;
	}

	/*
	 * The previous kernel passed safe contiguous blocks of memory to use
	 * for early boot purposes, so that we can resize the memblock array
	 * as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (err) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
				&area->addr, &size, ERR_PTR(err));
			goto unmap_scratch;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	/* Keep the scratch descriptor array itself out of the allocator */
	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblocks allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in use data while
	 * we initialize the page tables which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;

	populated = true;
	pr_info("found kexec handover data.\n");

unmap_scratch:
	early_memunmap(scratch, scratch_len);
unmap_fdt:
	early_memunmap(fdt, fdt_len);
report:
	if (!populated)
		pr_warn("disabling KHO revival\n");
}
1580 
1581 /* Helper functions for kexec_file_load */
1582 
1583 int kho_fill_kimage(struct kimage *image)
1584 {
1585 	ssize_t scratch_size;
1586 	int err = 0;
1587 	struct kexec_buf scratch;
1588 
1589 	if (!kho_enable)
1590 		return 0;
1591 
1592 	image->kho.fdt = virt_to_phys(kho_out.fdt);
1593 
1594 	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
1595 	scratch = (struct kexec_buf){
1596 		.image = image,
1597 		.buffer = kho_scratch,
1598 		.bufsz = scratch_size,
1599 		.mem = KEXEC_BUF_MEM_UNKNOWN,
1600 		.memsz = scratch_size,
1601 		.buf_align = SZ_64K, /* Makes it easier to map */
1602 		.buf_max = ULONG_MAX,
1603 		.top_down = true,
1604 	};
1605 	err = kexec_add_buffer(&scratch);
1606 	if (err)
1607 		return err;
1608 	image->kho.scratch = &image->segment[image->nr_segments - 1];
1609 
1610 	return 0;
1611 }
1612 
1613 static int kho_walk_scratch(struct kexec_buf *kbuf,
1614 			    int (*func)(struct resource *, void *))
1615 {
1616 	int ret = 0;
1617 	int i;
1618 
1619 	for (i = 0; i < kho_scratch_cnt; i++) {
1620 		struct resource res = {
1621 			.start = kho_scratch[i].addr,
1622 			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
1623 		};
1624 
1625 		/* Try to fit the kimage into our KHO scratch region */
1626 		ret = func(&res, kbuf);
1627 		if (ret)
1628 			break;
1629 	}
1630 
1631 	return ret;
1632 }
1633 
1634 int kho_locate_mem_hole(struct kexec_buf *kbuf,
1635 			int (*func)(struct resource *, void *))
1636 {
1637 	int ret;
1638 
1639 	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
1640 		return 1;
1641 
1642 	ret = kho_walk_scratch(kbuf, func);
1643 
1644 	return ret == 1 ? 0 : -EADDRNOTAVAIL;
1645 }
1646