// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of per-order
 * 512 byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire 1G order of a
 * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
 * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
 * 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as the serialization progresses, folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

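/*
 * Worked example (illustrative only, assuming PAGE_SIZE == 4096): to
 * preserve an order-2 folio at physical address 0x40000000, pfn is
 * 0x40000 and pfn_high = pfn >> 2 = 0x10000. The order-2 entry in
 * kho_mem_track::orders is looked up, then the bitmap chunk at index
 * pfn_high / PRESERVE_BITS = 16 inside kho_mem_phys::phys_bits, and
 * finally bit pfn_high % PRESERVE_BITS = 0 is set. This mirrors what
 * __kho_preserve_order() below does.
 */
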
#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

struct kho_serialization {
	struct page *fdt;
	struct list_head fdt_list;
	struct dentry *sub_fdt_dir;
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *elm, *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(sz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		res = ERR_PTR(xa_err(res));

	if (res) {
		kfree(elm);
		return res;
	}

	return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (physxa) {
			bits = xa_load(&physxa->phys_bits,
				       pfn_high / PRESERVE_BITS);
			if (bits)
				clear_bit(pfn_high % PRESERVE_BITS,
					  bits->preserve);
		}

		/*
		 * Always advance pfn, even when no bitmap exists for this
		 * order; skipping the increment would never make progress.
		 */
		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa, *new_physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();

	physxa = xa_load(&track->orders, order);
	if (!physxa) {
		int err;

		new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
		if (!new_physxa)
			return -ENOMEM;

		xa_init(&new_physxa->phys_bits);
		physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
				    GFP_KERNEL);

		err = xa_err(physxa);
		if (err || physxa) {
			xa_destroy(&new_physxa->phys_bits);
			kfree(new_physxa);

			if (err)
				return err;
		} else {
			physxa = new_physxa;
		}
	}

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

/* almost like free_reserved_page(), just don't free the page */
static void kho_restore_page(struct page *page, unsigned int order)
{
	unsigned int nr_pages = (1 << order);

	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	if (order > 0)
		prep_compound_page(page, order);

	adjust_managed_page_count(page, nr_pages);
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long order;

	if (!page)
		return NULL;

	order = page->private;
	if (order > MAX_PAGE_ORDER)
		return NULL;

	kho_restore_page(page, order);
	return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of bitmaps
 * starts at a given physical address. This allows the bitmaps to be sparse. The
 * xarray is used to store them in a tree while building up the data structure,
 * but the KHO successor kernel only needs to process them once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch space
 * until it completes processing this list. Once processed, all the memory
 * storing these ranges will be marked as free.
 */

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
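
/*
 * Illustrative layout of the serialized stream (a sketch derived from the
 * structures above, not additional ABI): each chunk is exactly one page,
 * a header followed by as many bitmap descriptors as fit. A descriptor
 * records the physical address covered by the first bit of its bitmap;
 * with the xarray index "phys" used in kho_mem_serialize() below, that is
 *
 *	phys_start = (phys * PRESERVE_BITS) << (order + PAGE_SHIFT);
 *
 * so bit N of that bitmap describes the range starting at
 * phys_start + (N << (order + PAGE_SHIFT)), which is exactly how
 * deserialize_bitmap() reconstructs it.
 */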

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!chunk)
		return NULL;
	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		kfree(tmp);
	}
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}

static void __init deserialize_bitmap(unsigned int order,
				      struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		page->private = order;
	}
}

static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that will never see KHO allocations. This is where we can later
 * safely load our new kexec images and then use the scratch area for early
 * allocations that happen before the page allocator is initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as percent of memory allocated from
 * memblock. A user can override the scale with command line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;
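
/*
 * Example command lines (illustrative): "kho_scratch=30%" sets the scale
 * factor to 30, sizing the areas as a fraction of the kernel memory
 * reserved in memblock, while "kho_scratch=256M,2G,128M" disables scaling
 * and requests a 256 MiB lowmem area, a 2 GiB global area and 128 MiB per
 * NUMA node, matching the parsing done in kho_parse_scratch_size() below.
 */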

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (!sizes[i] || endp == p)
			return -EINVAL;
		p = endp;
	}

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical address
 * space for target memory, let's make sure we always have a large CMA region
 * active. This CMA region will only be used for movable pages, which are not a
 * problem for us during KHO because we can just move them somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
	kho_enable = false;
}

struct fdt_debugfs {
	struct list_head list;
	struct debugfs_blob_wrapper wrapper;
	struct dentry *file;
};

static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
			       const char *name, const void *fdt)
{
	struct fdt_debugfs *f;
	struct dentry *file;

	f = kmalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return -ENOMEM;

	f->wrapper.data = (void *)fdt;
	f->wrapper.size = fdt_totalsize(fdt);

	file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
	if (IS_ERR(file)) {
		kfree(f);
		return PTR_ERR(file);
	}

	f->file = file;
	list_add(&f->list, list);

	return 0;
}

/**
 * kho_add_subtree - record the physical address of a sub FDT in the KHO root tree.
 * @ser: serialization control object passed by KHO notifiers.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in the KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
	int err = 0;
	u64 phys = (u64)virt_to_phys(fdt);
	void *root = page_to_virt(ser->fdt);

	err |= fdt_begin_node(root, name);
	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
	err |= fdt_end_node(root);

	if (err)
		return err;

	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);

struct kho_out {
	struct blocking_notifier_head chain_head;

	struct dentry *dir;

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;
	bool finalized;
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};

int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
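
/*
 * Producer-side usage sketch (illustrative; "foo" and foo_fdt_page are
 * hypothetical, not part of this file): a subsystem registers a notifier
 * with register_kho_notifier() and, on KEXEC_KHO_FINALIZE, preserves its
 * state page and publishes it as a sub FDT.
 *
 *	static int foo_kho_notifier(struct notifier_block *nb,
 *				    unsigned long cmd, void *v)
 *	{
 *		struct kho_serialization *ser = v;
 *		int err;
 *
 *		if (cmd != KEXEC_KHO_FINALIZE)
 *			return NOTIFY_DONE;
 *
 *		err = kho_preserve_folio(page_folio(foo_fdt_page));
 *		if (!err)
 *			err = kho_add_subtree(ser, "foo",
 *					      page_to_virt(foo_fdt_page));
 *		return err ? notifier_from_errno(err) : NOTIFY_OK;
 *	}
 */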

/**
 * kho_preserve_phys - preserve a physically contiguous range across kexec.
 * @phys: physical address of the range.
 * @size: size of the range.
 *
 * Instructs KHO to preserve the memory range from @phys to @phys + @size
 * across kexec.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_phys(phys_addr_t phys, size_t size)
{
	unsigned long pfn = PHYS_PFN(phys);
	unsigned long failed_pfn = 0;
	const unsigned long start_pfn = pfn;
	const unsigned long end_pfn = PHYS_PFN(phys + size);
	int err = 0;
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
		return -EINVAL;

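	/*
	 * Worked example (illustrative): for pfn == 3 and end_pfn == 9 the
	 * greedy split below preserves an order-0 page at pfn 3, an order-2
	 * block at pfn 4 and an order-0 page at pfn 8.
	 */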
	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_phys);

/* Handling for debug/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
	int err = 0;
	struct fdt_debugfs *ff, *tmp;

	if (kho_out.finalized) {
		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
					  "fdt", page_to_virt(kho_out.ser.fdt));
	} else {
		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
			debugfs_remove(ff->file);
			list_del(&ff->list);
			kfree(ff);
		}
	}

	return err;
}

static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}

static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
	mutex_lock(&kho_out.lock);
	*val = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
	int ret = 0;
	bool val = !!_val;

	mutex_lock(&kho_out.lock);

	if (val == kho_out.finalized) {
		if (kho_out.finalized)
			ret = -EEXIST;
		else
			ret = -ENOENT;
		goto unlock;
	}

	if (val)
		ret = kho_finalize();
	else
		ret = kho_abort();

	if (ret)
		goto unlock;

	kho_out.finalized = val;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].size);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
	struct dentry *dir, *f, *sub_fdt_dir;

	dir = debugfs_create_dir("out", debugfs_root);
	if (IS_ERR(dir))
		return -ENOMEM;

	sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
	if (IS_ERR(sub_fdt_dir))
		goto err_rmdir;

	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
				&scratch_phys_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
				&scratch_len_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("finalize", 0600, dir, NULL,
				&fops_kho_out_finalize);
	if (IS_ERR(f))
		goto err_rmdir;

	kho_out.dir = dir;
	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
	return 0;

err_rmdir:
	debugfs_remove_recursive(dir);
	return -ENOENT;
}

struct kho_in {
	struct dentry *dir;
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct list_head fdt_list;
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
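
/*
 * Consumer-side usage sketch (illustrative; "foo" and foo_parse_fdt() are
 * hypothetical, mirroring the producer example above): after kexec, the
 * successor looks up its sub FDT and revives the folio backing it.
 *
 *	phys_addr_t fdt_phys;
 *	struct folio *folio;
 *
 *	if (!kho_retrieve_subtree("foo", &fdt_phys)) {
 *		folio = kho_restore_folio(fdt_phys);
 *		if (folio)
 *			foo_parse_fdt(folio_address(folio));
 *	}
 */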

/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}

static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			init_pageblock_migratetype(pfn_to_page(pfn),
						   MIGRATE_CMA, false);
	}
}

void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * The previous kernel passed us safe contiguous blocks of memory to
	 * use for early boot purposes, so that we can resize the memblock
	 * array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
				&area->addr, &size, err);
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need to ingest all
	 * memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data. Will skip init for some devices\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = page_to_phys(kho_out.ser.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}