// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
 * Copyright (C) 2026 Google LLC, Jason Miu <jasonmiu@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/kmemleak.h>
#include <linux/count_zeros.h>
#include <linux/kasan.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/kho_radix_tree.h>
#include <linux/kho/abi/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/page-isolation.h>
#include <linux/unaligned.h>
#include <linux/vmalloc.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../../mm/internal.h"
#include "../kexec_internal.h"
#include "kexec_handover_internal.h"

/* The magic token for preserved pages */
#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */

/*
 * KHO uses page->private, which is an unsigned long, to store page metadata.
 * Use it to store both the magic and the order.
 */
union kho_page_info {
	unsigned long page_private;
	struct {
		unsigned int order;
		unsigned int magic;
	};
};

static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));

static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

struct kho_out {
	void *fdt;
	struct mutex lock; /* protects KHO FDT */

	struct kho_radix_tree radix_tree;
	struct kho_debugfs dbg;
};

static struct kho_out kho_out = {
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.radix_tree = {
		.lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock),
	},
};

/**
 * kho_radix_encode_key - Encodes a physical address and order into a radix key.
 * @phys: The physical address of the page.
 * @order: The order of the page.
 *
 * This function combines a page's physical address and its order into a
 * single unsigned long, which is used as a key for all radix tree
 * operations.
 *
 * Return: The encoded unsigned long radix key.
 */
static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order)
{
	/* Order bits part */
	unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order);
	/* Shifted physical address part */
	unsigned long l = phys >> (PAGE_SHIFT + order);

	return h | l;
}

/**
 * kho_radix_decode_key - Decodes a radix key back into a physical address and order.
 * @key: The unsigned long key to decode.
 * @order: An output parameter, a pointer to an unsigned int where the decoded
 *         page order will be stored.
 *
 * This function reverses the encoding performed by kho_radix_encode_key(),
 * extracting the original physical address and page order from a given key.
 *
 * Return: The decoded physical address.
 */
static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order)
{
	unsigned int order_bit = fls64(key);
	phys_addr_t phys;

	/* order_bit is numbered starting at 1 from fls64 */
	*order = KHO_ORDER_0_LOG2 - order_bit + 1;
	/* The order is discarded by the shift */
	phys = key << (PAGE_SHIFT + *order);

	return phys;
}
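
/*
 * For illustration, a sketch of the encode/decode round trip (the exact
 * bit positions depend on KHO_ORDER_0_LOG2 and PAGE_SHIFT):
 *
 *	unsigned long key = kho_radix_encode_key(phys, order);
 *	unsigned int order2;
 *	phys_addr_t phys2 = kho_radix_decode_key(key, &order2);
 *
 * For any order-aligned @phys whose shifted address bits land below the
 * order marker bit, phys2 == phys and order2 == order: the single set
 * bit placed at KHO_ORDER_0_LOG2 - order is the highest bit of the key,
 * so fls64() recovers the order and the left shift drops the marker
 * while restoring the address bits.
 */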

static unsigned long kho_radix_get_bitmap_index(unsigned long key)
{
	return key % (1 << KHO_BITMAP_SIZE_LOG2);
}

static unsigned long kho_radix_get_table_index(unsigned long key,
					       unsigned int level)
{
	int s;

	s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2;
	return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2);
}
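
/*
 * Taken together, the two helpers above define the key layout (an
 * illustrative view; field widths come from the KHO_*_LOG2 constants in
 * <linux/kho_radix_tree.h>):
 *
 *	| order bit | table idx (level N) | ... | table idx (level 1) | bitmap idx |
 *
 * The low KHO_BITMAP_SIZE_LOG2 bits select a bit in a level 0 bitmap
 * page; each KHO_TABLE_SIZE_LOG2-bit group above them selects an entry
 * in one level of intermediate tables.
 */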

/**
 * kho_radix_add_page - Marks a page as preserved in the radix tree.
 * @tree: The KHO radix tree.
 * @pfn: The page frame number of the page to preserve.
 * @order: The order of the page.
 *
 * This function traverses the radix tree based on the key derived from @pfn
 * and @order. It sets the corresponding bit in the leaf bitmap to mark the
 * page for preservation. If intermediate nodes do not exist along the path,
 * they are allocated and added to the tree.
 *
 * Return: 0 on success, or a negative error code on failure.
 */
int kho_radix_add_page(struct kho_radix_tree *tree,
		       unsigned long pfn, unsigned int order)
{
	/* Newly allocated nodes for error cleanup */
	struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 };
	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
	struct kho_radix_node *anchor_node = NULL;
	struct kho_radix_node *node = tree->root;
	struct kho_radix_node *new_node;
	unsigned int i, idx, anchor_idx;
	struct kho_radix_leaf *leaf;
	int err = 0;

	if (WARN_ON_ONCE(!tree->root))
		return -EINVAL;

	might_sleep();

	guard(mutex)(&tree->lock);

	/* Go from high levels to low levels */
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		idx = kho_radix_get_table_index(key, i);

		if (node->table[idx]) {
			node = phys_to_virt(node->table[idx]);
			continue;
		}

		/* Next node is empty, create a new node for it */
		new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL);
		if (!new_node) {
			err = -ENOMEM;
			goto err_free_nodes;
		}

		node->table[idx] = virt_to_phys(new_node);

		/*
		 * Capture the node where the new branch starts for cleanup
		 * if allocation fails.
		 */
		if (!anchor_node) {
			anchor_node = node;
			anchor_idx = idx;
		}
		intermediate_nodes[i] = new_node;

		node = new_node;
	}

	/* Handle the leaf level bitmap (level 0) */
	idx = kho_radix_get_bitmap_index(key);
	leaf = (struct kho_radix_leaf *)node;
	__set_bit(idx, leaf->bitmap);

	return 0;

err_free_nodes:
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		if (intermediate_nodes[i])
			free_page((unsigned long)intermediate_nodes[i]);
	}
	if (anchor_node)
		anchor_node->table[anchor_idx] = 0;

	return err;
}
EXPORT_SYMBOL_GPL(kho_radix_add_page);

/**
 * kho_radix_del_page - Removes a page's preservation status from the radix tree.
 * @tree: The KHO radix tree.
 * @pfn: The page frame number of the page to unpreserve.
 * @order: The order of the page.
 *
 * This function traverses the radix tree and clears the bit corresponding to
 * the page, effectively removing its "preserved" status. It does not free
 * the tree's intermediate nodes, even if they become empty.
 */
void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
			unsigned int order)
{
	unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order);
	struct kho_radix_node *node = tree->root;
	struct kho_radix_leaf *leaf;
	unsigned int i, idx;

	if (WARN_ON_ONCE(!tree->root))
		return;

	might_sleep();

	guard(mutex)(&tree->lock);

	/* Go from high levels to low levels */
	for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) {
		idx = kho_radix_get_table_index(key, i);

		/*
		 * We are attempting to delete a page that was never
		 * preserved; return with a warning.
		 */
		if (WARN_ON(!node->table[idx]))
			return;

		node = phys_to_virt(node->table[idx]);
	}

	/* Handle the leaf level bitmap (level 0) */
	leaf = (struct kho_radix_leaf *)node;
	idx = kho_radix_get_bitmap_index(key);
	__clear_bit(idx, leaf->bitmap);
}
EXPORT_SYMBOL_GPL(kho_radix_del_page);

static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf,
			       unsigned long key,
			       kho_radix_tree_walk_callback_t cb)
{
	unsigned long *bitmap = (unsigned long *)leaf;
	unsigned int order;
	phys_addr_t phys;
	unsigned int i;
	int err;

	for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) {
		phys = kho_radix_decode_key(key | i, &order);
		err = cb(phys, order);
		if (err)
			return err;
	}

	return 0;
}

static int __kho_radix_walk_tree(struct kho_radix_node *root,
				 unsigned int level, unsigned long start,
				 kho_radix_tree_walk_callback_t cb)
{
	struct kho_radix_node *node;
	struct kho_radix_leaf *leaf;
	unsigned long key, i;
	unsigned int shift;
	int err;

	for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) {
		if (!root->table[i])
			continue;

		shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) +
			KHO_BITMAP_SIZE_LOG2;
		key = start | (i << shift);

		node = phys_to_virt(root->table[i]);

		if (level == 1) {
			/*
			 * We are at level 1, so node points to the
			 * level 0 bitmap.
			 */
			leaf = (struct kho_radix_leaf *)node;
			err = kho_radix_walk_leaf(leaf, key, cb);
		} else {
			err = __kho_radix_walk_tree(node, level - 1,
						    key, cb);
		}

		if (err)
			return err;
	}

	return 0;
}

/**
 * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page.
 * @tree: A pointer to the KHO radix tree to walk.
 * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be
 *      invoked for each preserved page found in the tree. The callback receives
 *      the physical address and order of the preserved page.
 *
 * This function walks the radix tree from the top level down to the lowest
 * level (level 0). For each preserved page found, it invokes the provided
 * callback, passing the page's physical address and order.
 *
 * Return: 0 if the walk completed the whole tree, or the non-zero return
 *         value from the callback that stopped the walk.
 */
int kho_radix_walk_tree(struct kho_radix_tree *tree,
			kho_radix_tree_walk_callback_t cb)
{
	if (WARN_ON_ONCE(!tree->root))
		return -EINVAL;

	guard(mutex)(&tree->lock);

	return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb);
}
EXPORT_SYMBOL_GPL(kho_radix_walk_tree);
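
/*
 * Example: tallying preserved memory with a walk callback (an
 * illustrative sketch; count_preserved() and nr_preserved_pages are made
 * up for this example, while kho_radix_walk_tree() and the callback type
 * are the ones defined here):
 *
 *	static unsigned long nr_preserved_pages;
 *
 *	static int count_preserved(phys_addr_t phys, unsigned int order)
 *	{
 *		nr_preserved_pages += 1UL << order;
 *		return 0;
 *	}
 *
 *	err = kho_radix_walk_tree(&kho_out.radix_tree, count_preserved);
 *
 * Returning non-zero from the callback stops the walk and is propagated
 * back to the caller.
 */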

static void __kho_unpreserve(struct kho_radix_tree *tree,
			     unsigned long pfn, unsigned long end_pfn)
{
	unsigned int order;

	while (pfn < end_pfn) {
		order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		kho_radix_del_page(tree, pfn, order);

		pfn += 1 << order;
	}
}
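
/*
 * __kho_unpreserve() splits [pfn, end_pfn) greedily into the largest
 * naturally aligned power-of-two blocks, mirroring how
 * kho_preserve_pages() below records a range. For example (illustrative
 * pfn values), the range [3, 11) is deleted as:
 *
 *	1 page  at pfn 3  (order 0)
 *	4 pages at pfn 4  (order 2)
 *	2 pages at pfn 8  (order 1)
 *	1 page  at pfn 10 (order 0)
 */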

/* For physically contiguous 0-order pages. */
static void kho_init_pages(struct page *page, unsigned long nr_pages)
{
	for (unsigned long i = 0; i < nr_pages; i++) {
		set_page_count(page + i, 1);
		/* Clear each page's codetag to avoid accounting mismatch. */
		clear_page_tag_ref(page + i);
	}
}

static void kho_init_folio(struct page *page, unsigned int order)
{
	unsigned long nr_pages = (1 << order);

	/* Head page gets refcount of 1. */
	set_page_count(page, 1);
	/* Clear head page's codetag to avoid accounting mismatch. */
	clear_page_tag_ref(page);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned long i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (order > 0)
		prep_compound_page(page, order);
}

static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long nr_pages;
	union kho_page_info info;

	if (!page)
		return NULL;

	info.page_private = page->private;
	/*
	 * kho_preserved_memory_reserve() only sets the magic on the head
	 * page. This magic check also implicitly makes sure phys is
	 * order-aligned since for non-order-aligned phys addresses, magic
	 * will never be set.
	 */
	if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC))
		return NULL;
	nr_pages = (1 << info.order);

	/* Clear private to make sure later restores on this page error out. */
	page->private = 0;

	if (is_folio)
		kho_init_folio(page, info.order);
	else
		kho_init_pages(page, nr_pages);

	adjust_managed_page_count(page, nr_pages);
	return page;
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = kho_restore_page(phys, true);

	return page ? page_folio(page) : NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

445  * kho_restore_pages - restore list of contiguous order 0 pages.
446  * @phys: physical address of the first page.
447  * @nr_pages: number of pages.
448  *
449  * Restore a contiguous list of order 0 pages that was preserved with
450  * kho_preserve_pages().
451  *
452  * Return: the first page on success, NULL on failure.
453  */
struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages)
{
	const unsigned long start_pfn = PHYS_PFN(phys);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		struct page *page = kho_restore_page(PFN_PHYS(pfn), false);

		if (!page)
			return NULL;
		pfn += 1 << order;
	}

	return pfn_to_page(start_pfn);
}
EXPORT_SYMBOL_GPL(kho_restore_pages);
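
/*
 * Example: a preserve/restore pair for a plain page range (illustrative
 * sketch; the buffer and how its physical address reaches the new kernel,
 * e.g. via a sub-FDT property, are assumptions). In the old kernel:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 4);
 *
 *	err = kho_preserve_pages(page, 16);
 *
 * In the new kernel, with the handed-over physical address:
 *
 *	struct page *page = kho_restore_pages(phys, 16);
 */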

static int __init kho_preserved_memory_reserve(phys_addr_t phys,
					       unsigned int order)
{
	union kho_page_info info;
	struct page *page;
	u64 sz;

	sz = 1 << (order + PAGE_SHIFT);
	page = phys_to_page(phys);

	/* Reserve the memory preserved in KHO in memblock */
	memblock_reserve(phys, sz);
	memblock_reserved_mark_noinit(phys, sz);
	info.magic = KHO_PAGE_MAGIC;
	info.order = order;
	page->private = info.page_private;

	return 0;
}

/* Returns physical address of the preserved memory map from the FDT */
static phys_addr_t __init kho_get_mem_map_phys(const void *fdt)
{
	const void *mem_ptr;
	int len;

	mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);
	if (!mem_ptr || len != sizeof(u64)) {
		pr_err("failed to get preserved memory map\n");
		return 0;
	}

	return get_unaligned((const u64 *)mem_ptr);
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us
 * safe zones that we will never see KHO allocations in. This is where we
 * can later safely load our new kexec images, and the scratch areas then
 * serve the early allocations that happen before the page allocator is
 * initialized.
 */
struct kho_scratch *kho_scratch;
unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percentage of memory
 * allocated from memblock. A user can override the scale with the command
 * line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global
 * and per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	size_t total_size = 0;
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (endp == p)
			return -EINVAL;
		p = endp;
		total_size += sizes[i];
	}

	if (!total_size)
		return -EINVAL;

	/* The string should be fully consumed by now. */
	if (*p)
		return -EINVAL;

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);
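
/*
 * For example (illustrative values), either of the following is accepted:
 *
 *	kho_scratch=70%
 *	kho_scratch=512M,1G,256M
 *
 * The second form sets 512MiB of lowmem, 1GiB of global and 256MiB of
 * per-node scratch, and disables the scale.
 */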

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch) {
		pr_err("Failed to reserve scratch array\n");
		goto err_disable_kho;
	}

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr) {
		pr_err("Failed to reserve lowmem scratch buffer\n");
		goto err_free_scratch_desc;
	}

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr) {
		pr_err("Failed to reserve global scratch buffer\n");
		goto err_free_scratch_areas;
	}

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/*
	 * Loop over online nodes that have memory. Skip memoryless nodes,
	 * as we cannot allocate scratch areas there.
	 */
	for_each_node_state(nid, N_MEMORY) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr) {
			pr_err("Failed to reserve nid %d scratch buffer\n", nid);
			goto err_free_scratch_areas;
		}

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in the KHO root tree.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in the KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when the kernel is
 * configured with CONFIG_KEXEC_HANDOVER_DEBUGFS.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(const char *name, void *fdt)
{
	phys_addr_t phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int err = -ENOMEM;
	int off, fdt_err;

	guard(mutex)(&kho_out.lock);

	fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (fdt_err < 0)
		return err;

	off = fdt_add_subnode(root_fdt, 0, name);
	if (off < 0) {
		if (off == -FDT_ERR_EXISTS)
			err = -EEXIST;
		goto out_pack;
	}

	err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME,
			  &phys, sizeof(phys));
	if (err < 0)
		goto out_pack;

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));

out_pack:
	fdt_pack(root_fdt);

	return err;
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
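
/*
 * Example: publishing a driver state blob (an illustrative sketch; the
 * "examplefs" node, its "state-phys" property and the surrounding driver
 * are made up, while kho_alloc_preserve(), the libfdt calls and
 * kho_add_subtree() are the real interfaces):
 *
 *	void *fdt = kho_alloc_preserve(PAGE_SIZE);
 *	int err;
 *
 *	if (IS_ERR(fdt))
 *		return PTR_ERR(fdt);
 *
 *	err = fdt_create_empty_tree(fdt, PAGE_SIZE);
 *	err |= fdt_setprop_u64(fdt, 0, "state-phys", state_phys);
 *	if (!err)
 *		err = kho_add_subtree("examplefs", fdt);
 */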

void kho_remove_subtree(void *fdt)
{
	phys_addr_t target_phys = virt_to_phys(fdt);
	void *root_fdt = kho_out.fdt;
	int off;
	int err;

	guard(mutex)(&kho_out.lock);

	err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
	if (err < 0)
		return;

	for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
	     off = fdt_next_subnode(root_fdt, off)) {
		const u64 *val;
		int len;

		val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len);
		if (!val || len != sizeof(phys_addr_t))
			continue;

		if ((phys_addr_t)*val == target_phys) {
			fdt_del_node(root_fdt, off);
			kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
			break;
		}
	}

	fdt_pack(root_fdt);
}
EXPORT_SYMBOL_GPL(kho_remove_subtree);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);

	if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
		return -EINVAL;

	return kho_radix_add_page(tree, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
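
/*
 * Example: the folio flavour of the handover (illustrative sketch; how
 * the physical address reaches the new kernel, e.g. through a sub-FDT
 * property, is assumed). Old kernel:
 *
 *	struct folio *folio = folio_alloc(GFP_KERNEL, 2);
 *
 *	err = kho_preserve_folio(folio);
 *
 * New kernel:
 *
 *	struct folio *folio = kho_restore_folio(phys);
 *
 * The restored folio keeps its order (2 in this sketch).
 */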

/**
 * kho_unpreserve_folio - unpreserve a folio.
 * @folio: folio to unpreserve.
 *
 * Instructs KHO to unpreserve a folio that was preserved by
 * kho_preserve_folio() before. The provided @folio (pfn and order)
 * must exactly match a previously preserved folio.
 */
void kho_unpreserve_folio(struct folio *folio)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);

	kho_radix_del_page(tree, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_folio);

/**
 * kho_preserve_pages - preserve contiguous pages across kexec
 * @page: first page in the range.
 * @nr_pages: number of pages.
 *
 * Preserve a contiguous range of order 0 pages. Must be restored using
 * kho_restore_pages() to ensure the pages are restored properly as order 0.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_pages(struct page *page, unsigned long nr_pages)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn = start_pfn;
	unsigned long failed_pfn = 0;
	int err = 0;

	if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
					nr_pages << PAGE_SHIFT))) {
		return -EINVAL;
	}

	while (pfn < end_pfn) {
		unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		/*
		 * Make sure all the pages in a single preservation are in
		 * the same NUMA node. The restore machinery cannot cope
		 * with a preservation spanning multiple NUMA nodes.
		 */
		while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1))
			order--;

		err = kho_radix_add_page(tree, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(tree, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);

/**
 * kho_unpreserve_pages - unpreserve contiguous pages.
 * @page: first page in the range.
 * @nr_pages: number of pages.
 *
 * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
 * This must be called with the same @page and @nr_pages as the corresponding
 * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
 * preserved blocks is not supported.
 */
void kho_unpreserve_pages(struct page *page, unsigned long nr_pages)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;

	__kho_unpreserve(tree, start_pfn, end_pfn);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_pages);

/* vmalloc flags KHO supports */
#define KHO_VMALLOC_SUPPORTED_FLAGS	(VM_ALLOC | VM_ALLOW_HUGE_VMAP)

/* KHO internal flags for vmalloc preservations */
#define KHO_VMALLOC_ALLOC	0x0001
#define KHO_VMALLOC_HUGE_VMAP	0x0002

static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
{
	unsigned short kho_flags = 0;

	if (vm_flags & VM_ALLOC)
		kho_flags |= KHO_VMALLOC_ALLOC;
	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		kho_flags |= KHO_VMALLOC_HUGE_VMAP;

	return kho_flags;
}

static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
{
	unsigned int vm_flags = 0;

	if (kho_flags & KHO_VMALLOC_ALLOC)
		vm_flags |= VM_ALLOC;
	if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
		vm_flags |= VM_ALLOW_HUGE_VMAP;

	return vm_flags;
}

static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
{
	struct kho_vmalloc_chunk *chunk;
	int err;

	chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
	if (!chunk)
		return NULL;

	err = kho_preserve_pages(virt_to_page(chunk), 1);
	if (err)
		goto err_free;
	if (cur)
		KHOSER_STORE_PTR(cur->hdr.next, chunk);
	return chunk;

err_free:
	free_page((unsigned long)chunk);
	return NULL;
}

static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
					 unsigned short order)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));

	__kho_unpreserve(tree, pfn, pfn + 1);

	for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
		pfn = PHYS_PFN(chunk->phys[i]);
		__kho_unpreserve(tree, pfn, pfn + (1 << order));
	}
}

/**
 * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
 * @ptr: pointer to the area in vmalloc address space
 * @preservation: placeholder for preservation metadata
 *
 * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
 * physical pages mapped at @ptr will be preserved and on successful return
 * @preservation will hold the physical address of a structure that describes
 * the preservation.
 *
 * NOTE: Memory allocated with the vmalloc_node() variants cannot be reliably
 * restored on the same node.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk;
	struct vm_struct *vm = find_vm_area(ptr);
	unsigned int order, flags, nr_contig_pages;
	unsigned int idx = 0;
	int err;

	if (!vm)
		return -EINVAL;

	if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return -EOPNOTSUPP;

	flags = vmalloc_flags_to_kho(vm->flags);
	order = get_vm_area_page_order(vm);

	chunk = new_vmalloc_chunk(NULL);
	if (!chunk)
		return -ENOMEM;
	KHOSER_STORE_PTR(preservation->first, chunk);

	nr_contig_pages = (1 << order);
	for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
		phys_addr_t phys = page_to_phys(vm->pages[i]);

		err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
		if (err)
			goto err_free;

		chunk->phys[idx++] = phys;
		if (idx == ARRAY_SIZE(chunk->phys)) {
			chunk = new_vmalloc_chunk(chunk);
			if (!chunk) {
				err = -ENOMEM;
				goto err_free;
			}
			idx = 0;
		}
	}

	preservation->total_pages = vm->nr_pages;
	preservation->flags = flags;
	preservation->order = order;

	return 0;

err_free:
	kho_unpreserve_vmalloc(preservation);
	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);

/**
 * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
 * @preservation: preservation metadata returned by kho_preserve_vmalloc()
 *
 * Instructs KHO to unpreserve the area in vmalloc address space that was
 * previously preserved with kho_preserve_vmalloc().
 */
void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);

	while (chunk) {
		struct kho_vmalloc_chunk *tmp = chunk;

		kho_vmalloc_unpreserve_chunk(chunk, preservation->order);

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		free_page((unsigned long)tmp);
	}
}
EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);

/**
 * kho_restore_vmalloc - recreates and populates an area in vmalloc address
 * space from the preserved memory.
 * @preservation: preservation metadata.
 *
 * Recreates an area in vmalloc address space and populates it with memory that
 * was preserved using kho_preserve_vmalloc().
 *
 * Return: pointer to the area in the vmalloc address space, NULL on failure.
 */
void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
{
	struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_PROT_NORMAL;
	unsigned int align, order, shift, vm_flags;
	unsigned long total_pages, contig_pages;
	unsigned long addr, size;
	struct vm_struct *area;
	struct page **pages;
	unsigned int idx = 0;
	int err;

	vm_flags = kho_flags_to_vmalloc(preservation->flags);
	if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
		return NULL;

	total_pages = preservation->total_pages;
	pages = kvmalloc_objs(*pages, total_pages);
	if (!pages)
		return NULL;
	order = preservation->order;
	contig_pages = (1 << order);
	shift = PAGE_SHIFT + order;
	align = 1 << shift;

	while (chunk) {
		struct page *page;

		for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
			phys_addr_t phys = chunk->phys[i];

			if (idx + contig_pages > total_pages)
				goto err_free_pages_array;

			page = kho_restore_pages(phys, contig_pages);
			if (!page)
				goto err_free_pages_array;

			for (int j = 0; j < contig_pages; j++)
				pages[idx++] = page + j;

			phys += contig_pages * PAGE_SIZE;
		}

		page = kho_restore_pages(virt_to_phys(chunk), 1);
		if (!page)
			goto err_free_pages_array;
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		__free_page(page);
	}

	if (idx != total_pages)
		goto err_free_pages_array;

	area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
				  vm_flags | VM_UNINITIALIZED,
				  VMALLOC_START, VMALLOC_END,
				  NUMA_NO_NODE, GFP_KERNEL,
				  __builtin_return_address(0));
	if (!area)
		goto err_free_pages_array;

	addr = (unsigned long)area->addr;
	size = get_vm_area_size(area);
	err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
	if (err)
		goto err_free_vm_area;

	area->nr_pages = total_pages;
	area->pages = pages;

	if (vm_flags & VM_ALLOC)
		kasan_flags |= KASAN_VMALLOC_VM_ALLOC;

	area->addr = kasan_unpoison_vmalloc(area->addr, total_pages * PAGE_SIZE,
					    kasan_flags);
	clear_vm_uninitialized_flag(area);

	return area->addr;

err_free_vm_area:
	free_vm_area(area);
err_free_pages_array:
	kvfree(pages);
	return NULL;
}
EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
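
/*
 * Example: carrying a vmalloc'ed buffer across kexec (illustrative
 * sketch; storing and retrieving the struct kho_vmalloc itself, e.g.
 * inside a preserved page referenced from a sub-FDT, is assumed).
 * Old kernel, with @pres pointing into preserved memory:
 *
 *	void *buf = vmalloc(SZ_1M);
 *
 *	err = kho_preserve_vmalloc(buf, pres);
 *
 * New kernel:
 *
 *	void *buf = kho_restore_vmalloc(pres);
 *
 * The contents survive, but the vmalloc address itself may differ.
 */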

/**
 * kho_alloc_preserve - Allocate, zero, and preserve memory.
 * @size: The number of bytes to allocate.
 *
 * Allocates a physically contiguous block of zeroed pages that is large
 * enough to hold @size bytes. The allocated memory is then registered with
 * KHO for preservation across a kexec.
 *
 * Note: The actual allocated size will be rounded up to the next
 * power-of-two number of pages.
 *
 * Return: A virtual pointer to the allocated and preserved memory on success,
 * or an ERR_PTR() encoded error on failure.
 */
void *kho_alloc_preserve(size_t size)
{
	struct folio *folio;
	int order, ret;

	if (!size)
		return ERR_PTR(-EINVAL);

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-E2BIG);

	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
	if (!folio)
		return ERR_PTR(-ENOMEM);

	ret = kho_preserve_folio(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	return folio_address(folio);
}
EXPORT_SYMBOL_GPL(kho_alloc_preserve);

/**
 * kho_unpreserve_free - Unpreserve and free memory.
 * @mem:  Pointer to the memory allocated by kho_alloc_preserve().
 *
 * Unregisters the memory from KHO preservation and frees the underlying
 * pages back to the system. This function should be called to clean up
 * memory allocated with kho_alloc_preserve().
 */
void kho_unpreserve_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = virt_to_folio(mem);
	kho_unpreserve_folio(folio);
	folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_unpreserve_free);

/**
 * kho_restore_free - Restore and free memory after kexec.
 * @mem:  Pointer to the memory (in the new kernel's address space)
 * that was allocated by the old kernel.
 *
 * This function is intended to be called in the new kernel (post-kexec)
 * to take ownership of and free a memory region that was preserved by the
 * old kernel using kho_alloc_preserve().
 *
 * It first restores the pages from KHO (using their physical address)
 * and then frees the pages back to the new kernel's page allocator.
 */
void kho_restore_free(void *mem)
{
	struct folio *folio;

	if (!mem)
		return;

	folio = kho_restore_folio(__pa(mem));
	if (!WARN_ON(!folio))
		folio_put(folio);
}
EXPORT_SYMBOL_GPL(kho_restore_free);
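
/*
 * Example: the full lifecycle of the three helpers above (illustrative
 * sketch; how @mem's physical address is communicated to the new kernel
 * is assumed). Old kernel:
 *
 *	void *mem = kho_alloc_preserve(SZ_4K);
 *
 *	if (IS_ERR(mem))
 *		return PTR_ERR(mem);
 *
 * On the old kernel's error/abort path:
 *
 *	kho_unpreserve_free(mem);
 *
 * New kernel, once the handed-over data has been consumed:
 *
 *	kho_restore_free(phys_to_virt(mem_phys));
 */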

struct kho_in {
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct kho_debugfs dbg;
};

static struct kho_in kho_in = {
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * is_kho_boot - check if current kernel was booted via KHO-enabled
 * kexec
 *
 * This function checks if the current kernel was loaded through a kexec
 * operation with KHO enabled, by verifying that a valid KHO FDT
 * was passed.
 *
 * Note: This function returns reliable results only after
 * kho_populate() has been called during early boot. Before that,
 * it may return false even if KHO data is present.
 *
 * Return: true if booted via KHO-enabled kexec, false otherwise
 */
bool is_kho_boot(void)
{
	return !!kho_get_fdt();
}
EXPORT_SYMBOL_GPL(is_kho_boot);

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
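
/*
 * Example: the restore half of the kho_add_subtree() sketch above
 * (illustrative; the "examplefs" name and "state-phys" property are made
 * up for this example):
 *
 *	phys_addr_t fdt_phys;
 *	const void *fdt;
 *	const u64 *prop;
 *	int len;
 *
 *	if (kho_retrieve_subtree("examplefs", &fdt_phys))
 *		return -ENOENT;
 *	fdt = phys_to_virt(fdt_phys);
 *	prop = fdt_getprop(fdt, 0, "state-phys", &len);
 */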

static int __init kho_mem_retrieve(const void *fdt)
{
	struct kho_radix_tree tree;
	const phys_addr_t *mem;
	int len;

	/* Retrieve the KHO radix tree from the passed-in FDT. */
	mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved KHO memory tree\n");
		return -ENOENT;
	}

	if (!*mem)
		return -EINVAL;

	tree.root = phys_to_virt(*mem);
	mutex_init(&tree.lock);
	return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve);
}

static __init int kho_out_fdt_setup(void)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	void *root = kho_out.fdt;
	u64 preserved_mem_tree_pa;
	int err;

	err = fdt_create(root, PAGE_SIZE);
	err |= fdt_finish_reservemap(root);
	err |= fdt_begin_node(root, "");
	err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);

	preserved_mem_tree_pa = virt_to_phys(tree->root);

	err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME,
			    &preserved_mem_tree_pa,
			    sizeof(preserved_mem_tree_pa));

	err |= fdt_end_node(root);
	err |= fdt_finish(root);

	return err;
}

static __init int kho_init(void)
{
	struct kho_radix_tree *tree = &kho_out.radix_tree;
	const void *fdt = kho_get_fdt();
	int err = 0;

	if (!kho_enable)
		return 0;

	tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!tree->root) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
	if (IS_ERR(kho_out.fdt)) {
		err = PTR_ERR(kho_out.fdt);
		goto err_free_kho_radix_tree_root;
	}

	err = kho_debugfs_init();
	if (err)
		goto err_free_fdt;

	err = kho_out_debugfs_init(&kho_out.dbg);
	if (err)
		goto err_free_fdt;

	err = kho_out_fdt_setup();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		kho_in_debugfs_init(&kho_in.dbg, fdt);
		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		/*
		 * When debug_pagealloc is enabled, __free_pages() clears the
		 * corresponding PRESENT bit in the kernel page table.
		 * Subsequent kmemleak scans of these pages then cause
		 * non-PRESENT page faults. Mark scratch areas with
		 * kmemleak_ignore_phys() to exclude them from kmemleak
		 * scanning.
		 */
		kmemleak_ignore_phys(kho_scratch[i].addr);
		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
					 kho_out.fdt, true));

	return 0;

err_free_fdt:
	kho_unpreserve_free(kho_out.fdt);
err_free_kho_radix_tree_root:
	kfree(tree->root);
	tree->root = NULL;
err_free_scratch:
	kho_out.fdt = NULL;
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
fs_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it, which means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		if (kho_mem_retrieve(kho_get_fdt()))
			kho_in.fdt_phys = 0;
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
	struct kho_scratch *scratch = NULL;
	phys_addr_t mem_map_phys;
	void *fdt = NULL;
	bool populated = false;
	int err;

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		goto report;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		goto unmap_fdt;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		goto unmap_fdt;
	}

	mem_map_phys = kho_get_mem_map_phys(fdt);
	if (!mem_map_phys)
		goto unmap_fdt;

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%llu)\n",
			scratch_phys, scratch_len);
		goto unmap_fdt;
	}

	/*
	 * The previous kernel passed us safe contiguous blocks of memory to
	 * use for early boot purposes, so that we can resize the memblock
	 * array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (err) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe\n",
				&area->addr, &size, ERR_PTR(err));
			goto unmap_scratch;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch\n", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, tell the
	 * memblock allocator to only use that for any allocations. That way
	 * we ensure that nothing scribbles over in-use data while we
	 * initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;

	populated = true;
	pr_info("found kexec handover data.\n");

unmap_scratch:
	early_memunmap(scratch, scratch_len);
unmap_fdt:
	early_memunmap(fdt, fdt_len);
report:
	if (!populated)
		pr_warn("disabling KHO revival\n");
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = virt_to_phys(kho_out.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}