13dc92c31SAlexander Graf // SPDX-License-Identifier: GPL-2.0-only
23dc92c31SAlexander Graf /*
33dc92c31SAlexander Graf * kexec_handover.c - kexec handover metadata processing
43dc92c31SAlexander Graf * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
53dc92c31SAlexander Graf * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
63dc92c31SAlexander Graf * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
73dc92c31SAlexander Graf */
83dc92c31SAlexander Graf
93dc92c31SAlexander Graf #define pr_fmt(fmt) "KHO: " fmt
103dc92c31SAlexander Graf
113dc92c31SAlexander Graf #include <linux/cma.h>
12fc33e4b4SMike Rapoport (Microsoft) #include <linux/count_zeros.h>
133dc92c31SAlexander Graf #include <linux/debugfs.h>
143dc92c31SAlexander Graf #include <linux/kexec.h>
153dc92c31SAlexander Graf #include <linux/kexec_handover.h>
163dc92c31SAlexander Graf #include <linux/libfdt.h>
173dc92c31SAlexander Graf #include <linux/list.h>
183dc92c31SAlexander Graf #include <linux/memblock.h>
193dc92c31SAlexander Graf #include <linux/notifier.h>
203dc92c31SAlexander Graf #include <linux/page-isolation.h>
21c609c144SAlexander Graf
22c609c144SAlexander Graf #include <asm/early_ioremap.h>
23c609c144SAlexander Graf
243dc92c31SAlexander Graf /*
253dc92c31SAlexander Graf * KHO is tightly coupled with mm init and needs access to some of mm
263dc92c31SAlexander Graf * internal APIs.
273dc92c31SAlexander Graf */
283dc92c31SAlexander Graf #include "../mm/internal.h"
293bdecc3cSAlexander Graf #include "kexec_internal.h"
303dc92c31SAlexander Graf
313dc92c31SAlexander Graf #define KHO_FDT_COMPATIBLE "kho-v1"
323dc92c31SAlexander Graf #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
333dc92c31SAlexander Graf #define PROP_SUB_FDT "fdt"
343dc92c31SAlexander Graf
353dc92c31SAlexander Graf static bool kho_enable __ro_after_init;
363dc92c31SAlexander Graf
kho_is_enabled(void)373dc92c31SAlexander Graf bool kho_is_enabled(void)
383dc92c31SAlexander Graf {
393dc92c31SAlexander Graf return kho_enable;
403dc92c31SAlexander Graf }
413dc92c31SAlexander Graf EXPORT_SYMBOL_GPL(kho_is_enabled);
423dc92c31SAlexander Graf
/* Parse the "kho=" early parameter; accepts the usual boolean spellings */
static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);
483dc92c31SAlexander Graf
49fc33e4b4SMike Rapoport (Microsoft) /*
50fc33e4b4SMike Rapoport (Microsoft) * Keep track of memory that is to be preserved across KHO.
51fc33e4b4SMike Rapoport (Microsoft) *
52fc33e4b4SMike Rapoport (Microsoft) * The serializing side uses two levels of xarrays to manage chunks of per-order
53fc33e4b4SMike Rapoport (Microsoft) * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
54fc33e4b4SMike Rapoport (Microsoft) * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
55fc33e4b4SMike Rapoport (Microsoft) * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
56fc33e4b4SMike Rapoport (Microsoft) * 512K of bitmap memory will be needed for order 0.
57fc33e4b4SMike Rapoport (Microsoft) *
 * This approach is fully incremental, as the serialization progresses folios
 * can continue to be aggregated to the tracker. The final step, immediately
 * prior to kexec, would serialize the xarray information into a linked list
 * for the successor kernel to parse.
62fc33e4b4SMike Rapoport (Microsoft) */
63fc33e4b4SMike Rapoport (Microsoft)
64fc33e4b4SMike Rapoport (Microsoft) #define PRESERVE_BITS (512 * 8)
65fc33e4b4SMike Rapoport (Microsoft)
struct kho_mem_phys_bits {
	/* One bit per naturally aligned 2^order block inside this window */
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
	 * to order.
	 */
	struct xarray phys_bits;
};

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};
82fc33e4b4SMike Rapoport (Microsoft)
83fc33e4b4SMike Rapoport (Microsoft) struct khoser_mem_chunk;
84fc33e4b4SMike Rapoport (Microsoft)
struct kho_serialization {
	/* Backing page of the root KHO FDT handed to the next kernel */
	struct page *fdt;
	/* List of fdt_debugfs entries; presumably fed to kho_debugfs_fdt_add() */
	struct list_head fdt_list;
	/* debugfs directory under which sub-FDT blobs are exposed */
	struct dentry *sub_fdt_dir;
	/* Per-order bitmap trees of memory to preserve across kexec */
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};
933dc92c31SAlexander Graf
/*
 * Look up @index in @xa, allocating and installing a zeroed element of @sz
 * bytes if the slot is still empty. Returns the element (existing or new)
 * or an ERR_PTR() on allocation/store failure.
 */
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *entry = xa_load(xa, index);
	void *new, *old;

	/* Fast path: the slot is already populated. */
	if (entry)
		return entry;

	new = kzalloc(sz, GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	/* Install our allocation unless we lost a race with another writer. */
	old = xa_cmpxchg(xa, index, NULL, new, GFP_KERNEL);
	if (old) {
		kfree(new);
		return xa_is_err(old) ? ERR_PTR(xa_err(old)) : old;
	}

	return new;
}
117fc33e4b4SMike Rapoport (Microsoft)
__kho_unpreserve(struct kho_mem_track * track,unsigned long pfn,unsigned long end_pfn)118fc33e4b4SMike Rapoport (Microsoft) static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
119fc33e4b4SMike Rapoport (Microsoft) unsigned long end_pfn)
120fc33e4b4SMike Rapoport (Microsoft) {
121fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_phys_bits *bits;
122fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_phys *physxa;
123fc33e4b4SMike Rapoport (Microsoft)
124fc33e4b4SMike Rapoport (Microsoft) while (pfn < end_pfn) {
125fc33e4b4SMike Rapoport (Microsoft) const unsigned int order =
126fc33e4b4SMike Rapoport (Microsoft) min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
127fc33e4b4SMike Rapoport (Microsoft) const unsigned long pfn_high = pfn >> order;
128fc33e4b4SMike Rapoport (Microsoft)
129fc33e4b4SMike Rapoport (Microsoft) physxa = xa_load(&track->orders, order);
130fc33e4b4SMike Rapoport (Microsoft) if (!physxa)
131fc33e4b4SMike Rapoport (Microsoft) continue;
132fc33e4b4SMike Rapoport (Microsoft)
133fc33e4b4SMike Rapoport (Microsoft) bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
134fc33e4b4SMike Rapoport (Microsoft) if (!bits)
135fc33e4b4SMike Rapoport (Microsoft) continue;
136fc33e4b4SMike Rapoport (Microsoft)
137fc33e4b4SMike Rapoport (Microsoft) clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
138fc33e4b4SMike Rapoport (Microsoft)
139fc33e4b4SMike Rapoport (Microsoft) pfn += 1 << order;
140fc33e4b4SMike Rapoport (Microsoft) }
141fc33e4b4SMike Rapoport (Microsoft) }
142fc33e4b4SMike Rapoport (Microsoft)
__kho_preserve_order(struct kho_mem_track * track,unsigned long pfn,unsigned int order)143fc33e4b4SMike Rapoport (Microsoft) static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
144fc33e4b4SMike Rapoport (Microsoft) unsigned int order)
145fc33e4b4SMike Rapoport (Microsoft) {
146fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_phys_bits *bits;
147fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_phys *physxa;
148fc33e4b4SMike Rapoport (Microsoft) const unsigned long pfn_high = pfn >> order;
149fc33e4b4SMike Rapoport (Microsoft)
150fc33e4b4SMike Rapoport (Microsoft) might_sleep();
151fc33e4b4SMike Rapoport (Microsoft)
152fc33e4b4SMike Rapoport (Microsoft) physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
153fc33e4b4SMike Rapoport (Microsoft) if (IS_ERR(physxa))
154fc33e4b4SMike Rapoport (Microsoft) return PTR_ERR(physxa);
155fc33e4b4SMike Rapoport (Microsoft)
156fc33e4b4SMike Rapoport (Microsoft) bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
157fc33e4b4SMike Rapoport (Microsoft) sizeof(*bits));
158fc33e4b4SMike Rapoport (Microsoft) if (IS_ERR(bits))
159fc33e4b4SMike Rapoport (Microsoft) return PTR_ERR(bits);
160fc33e4b4SMike Rapoport (Microsoft)
161fc33e4b4SMike Rapoport (Microsoft) set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
162fc33e4b4SMike Rapoport (Microsoft)
163fc33e4b4SMike Rapoport (Microsoft) return 0;
164fc33e4b4SMike Rapoport (Microsoft) }
165fc33e4b4SMike Rapoport (Microsoft)
/* almost as free_reserved_page(), just don't free the page */
static void kho_restore_page(struct page *page, unsigned int order)
{
	unsigned int nr_pages = (1 << order);

	/* Head page gets refcount of 1. */
	set_page_count(page, 1);

	/* For higher order folios, tail pages get a page count of zero. */
	for (unsigned int i = 1; i < nr_pages; i++)
		set_page_count(page + i, 0);

	/* Rebuild the compound-page metadata for the head/tail pages */
	if (order > 0)
		prep_compound_page(page, order);

	/* Account the pages as managed again after being reserved across kexec */
	adjust_managed_page_count(page, nr_pages);
}
183fc33e4b4SMike Rapoport (Microsoft)
184fc33e4b4SMike Rapoport (Microsoft) /**
185fc33e4b4SMike Rapoport (Microsoft) * kho_restore_folio - recreates the folio from the preserved memory.
186fc33e4b4SMike Rapoport (Microsoft) * @phys: physical address of the folio.
187fc33e4b4SMike Rapoport (Microsoft) *
188fc33e4b4SMike Rapoport (Microsoft) * Return: pointer to the struct folio on success, NULL on failure.
189fc33e4b4SMike Rapoport (Microsoft) */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long order;

	if (!page)
		return NULL;

	/* deserialize_bitmap() stashed the folio order in page->private */
	order = page->private;
	if (order > MAX_PAGE_ORDER)
		return NULL;

	kho_restore_page(page, order);
	return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);
206fc33e4b4SMike Rapoport (Microsoft)
207fc33e4b4SMike Rapoport (Microsoft) /* Serialize and deserialize struct kho_mem_phys across kexec
208fc33e4b4SMike Rapoport (Microsoft) *
209fc33e4b4SMike Rapoport (Microsoft) * Record all the bitmaps in a linked list of pages for the next kernel to
210fc33e4b4SMike Rapoport (Microsoft) * process. Each chunk holds bitmaps of the same order and each block of bitmaps
211fc33e4b4SMike Rapoport (Microsoft) * starts at a given physical address. This allows the bitmaps to be sparse. The
212fc33e4b4SMike Rapoport (Microsoft) * xarray is used to store them in a tree while building up the data structure,
213fc33e4b4SMike Rapoport (Microsoft) * but the KHO successor kernel only needs to process them once in order.
214fc33e4b4SMike Rapoport (Microsoft) *
215fc33e4b4SMike Rapoport (Microsoft) * All of this memory is normal kmalloc() memory and is not marked for
216fc33e4b4SMike Rapoport (Microsoft) * preservation. The successor kernel will remain isolated to the scratch space
217fc33e4b4SMike Rapoport (Microsoft) * until it completes processing this list. Once processed all the memory
218fc33e4b4SMike Rapoport (Microsoft) * storing these ranges will be marked as free.
219fc33e4b4SMike Rapoport (Microsoft) */
220fc33e4b4SMike Rapoport (Microsoft)
struct khoser_mem_bitmap_ptr {
	/* Physical address corresponding to the first bit of @bitmap */
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	/* Link to the next chunk; NULL terminates the list */
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	/* All bitmaps in this chunk describe blocks of this order */
	unsigned int order;
	/* Number of used slots in khoser_mem_chunk::bitmaps */
	unsigned int num_elms;
};

/* Number of bitmap pointers that fit into one page-sized chunk */
#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

/* Chunks are allocated as whole pages; the layout must fill one exactly */
static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
242fc33e4b4SMike Rapoport (Microsoft)
new_chunk(struct khoser_mem_chunk * cur_chunk,unsigned long order)243fc33e4b4SMike Rapoport (Microsoft) static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
244fc33e4b4SMike Rapoport (Microsoft) unsigned long order)
245fc33e4b4SMike Rapoport (Microsoft) {
246fc33e4b4SMike Rapoport (Microsoft) struct khoser_mem_chunk *chunk;
247fc33e4b4SMike Rapoport (Microsoft)
248fc33e4b4SMike Rapoport (Microsoft) chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
249fc33e4b4SMike Rapoport (Microsoft) if (!chunk)
250fc33e4b4SMike Rapoport (Microsoft) return NULL;
251fc33e4b4SMike Rapoport (Microsoft) chunk->hdr.order = order;
252fc33e4b4SMike Rapoport (Microsoft) if (cur_chunk)
253fc33e4b4SMike Rapoport (Microsoft) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
254fc33e4b4SMike Rapoport (Microsoft) return chunk;
255fc33e4b4SMike Rapoport (Microsoft) }
256fc33e4b4SMike Rapoport (Microsoft)
kho_mem_ser_free(struct khoser_mem_chunk * first_chunk)257fc33e4b4SMike Rapoport (Microsoft) static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
258fc33e4b4SMike Rapoport (Microsoft) {
259fc33e4b4SMike Rapoport (Microsoft) struct khoser_mem_chunk *chunk = first_chunk;
260fc33e4b4SMike Rapoport (Microsoft)
261fc33e4b4SMike Rapoport (Microsoft) while (chunk) {
262fc33e4b4SMike Rapoport (Microsoft) struct khoser_mem_chunk *tmp = chunk;
263fc33e4b4SMike Rapoport (Microsoft)
264fc33e4b4SMike Rapoport (Microsoft) chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
265fc33e4b4SMike Rapoport (Microsoft) kfree(tmp);
266fc33e4b4SMike Rapoport (Microsoft) }
267fc33e4b4SMike Rapoport (Microsoft) }
268fc33e4b4SMike Rapoport (Microsoft)
/*
 * Flatten the per-order xarray trees in @ser->track into the linked list of
 * page-sized chunks that the successor kernel will walk.
 *
 * Returns 0 on success, -ENOMEM if a chunk allocation fails (all chunks
 * allocated so far are freed again in that case).
 */
static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		/* Each chunk holds bitmaps of a single order; start a new one */
		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			/* Chunk full: chain another one for the same order */
			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			/* Invert the index math of __kho_preserve_order() */
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}
312fc33e4b4SMike Rapoport (Microsoft)
/* Re-reserve the memory ranges recorded in one serialized bitmap element */
static void deserialize_bitmap(unsigned int order,
			       struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		/* kho_restore_folio() reads the order back from page->private */
		page->private = order;
	}
}
330fc33e4b4SMike Rapoport (Microsoft)
/*
 * Walk the serialized chunk list left by the previous kernel and re-reserve
 * all preserved memory before it can be handed to the page allocator.
 */
static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	/* The property holds the physical address of the first chunk */
	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}
354fc33e4b4SMike Rapoport (Microsoft)
/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us
 * safe zones that we will never see KHO allocations from. This is where we
 * can later safely load our new kexec images into and then use the scratch
 * area for early allocations that happen before the page allocator is
 * initialized.
 */
3633dc92c31SAlexander Graf static struct kho_scratch *kho_scratch;
3643dc92c31SAlexander Graf static unsigned int kho_scratch_cnt;
3653dc92c31SAlexander Graf
3663dc92c31SAlexander Graf /*
3673dc92c31SAlexander Graf * The scratch areas are scaled by default as percent of memory allocated from
3683dc92c31SAlexander Graf * memblock. A user can override the scale with command line parameter:
3693dc92c31SAlexander Graf *
3703dc92c31SAlexander Graf * kho_scratch=N%
3713dc92c31SAlexander Graf *
3723dc92c31SAlexander Graf * It is also possible to explicitly define size for a lowmem, a global and
3733dc92c31SAlexander Graf * per-node scratch areas:
3743dc92c31SAlexander Graf *
3753dc92c31SAlexander Graf * kho_scratch=l[KMG],n[KMG],m[KMG]
3763dc92c31SAlexander Graf *
3773dc92c31SAlexander Graf * The explicit size definition takes precedence over scale definition.
3783dc92c31SAlexander Graf */
3793dc92c31SAlexander Graf static unsigned int scratch_scale __initdata = 200;
3803dc92c31SAlexander Graf static phys_addr_t scratch_size_global __initdata;
3813dc92c31SAlexander Graf static phys_addr_t scratch_size_pernode __initdata;
3823dc92c31SAlexander Graf static phys_addr_t scratch_size_lowmem __initdata;
3833dc92c31SAlexander Graf
/*
 * Parse the "kho_scratch=" early parameter: either "N%" (scale relative to
 * reserved kernel memory) or explicit "lowmem[KMG],global[KMG],pernode[KMG]"
 * sizes, as described in the comment above.
 */
static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		/* copy the digits only, dropping the trailing '%' */
		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		/* each size after the first must be preceded by a comma */
		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (!sizes[i] || endp == p)
			return -EINVAL;
		p = endp;
	}

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	/* explicit sizes take precedence: disable percentage scaling */
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);
4423dc92c31SAlexander Graf
/*
 * Derive the lowmem and global scratch sizes from the scale percentage and
 * the amount of memory memblock has reserved for the kernel so far. No-op
 * when explicit sizes were given on the command line (scratch_scale == 0).
 */
static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	/* the global area gets the scaled remainder above the lowmem share */
	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}
4603dc92c31SAlexander Graf
scratch_size_node(int nid)4613dc92c31SAlexander Graf static phys_addr_t __init scratch_size_node(int nid)
4623dc92c31SAlexander Graf {
4633dc92c31SAlexander Graf phys_addr_t size;
4643dc92c31SAlexander Graf
4653dc92c31SAlexander Graf if (scratch_scale) {
4663dc92c31SAlexander Graf size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
4673dc92c31SAlexander Graf nid);
4683dc92c31SAlexander Graf size = size * scratch_scale / 100;
4693dc92c31SAlexander Graf } else {
4703dc92c31SAlexander Graf size = scratch_size_pernode;
4713dc92c31SAlexander Graf }
4723dc92c31SAlexander Graf
4733dc92c31SAlexander Graf return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
4743dc92c31SAlexander Graf }
4753dc92c31SAlexander Graf
4763dc92c31SAlexander Graf /**
4773dc92c31SAlexander Graf * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
4783dc92c31SAlexander Graf *
4793dc92c31SAlexander Graf * With KHO we can preserve arbitrary pages in the system. To ensure we still
4803dc92c31SAlexander Graf * have a large contiguous region of memory when we search the physical address
4813dc92c31SAlexander Graf * space for target memory, let's make sure we always have a large CMA region
4823dc92c31SAlexander Graf * active. This CMA region will only be used for movable pages which are not a
4833dc92c31SAlexander Graf * problem for us during KHO because we can just move them somewhere else.
4843dc92c31SAlexander Graf */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	/* one lowmem area + one global area + one area per online node */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	/* i points one past the last successful reservation */
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	/* booting without KHO is safer than running with no scratch areas */
	kho_enable = false;
}
5493dc92c31SAlexander Graf
/* One debugfs blob file exposing an FDT, linked into a caller-owned list */
struct fdt_debugfs {
	/* membership in the list passed to kho_debugfs_fdt_add() */
	struct list_head list;
	/* backing storage the debugfs blob file reads from */
	struct debugfs_blob_wrapper wrapper;
	/* the created debugfs file itself */
	struct dentry *file;
};
5553dc92c31SAlexander Graf
kho_debugfs_fdt_add(struct list_head * list,struct dentry * dir,const char * name,const void * fdt)5563dc92c31SAlexander Graf static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
5573dc92c31SAlexander Graf const char *name, const void *fdt)
5583dc92c31SAlexander Graf {
5593dc92c31SAlexander Graf struct fdt_debugfs *f;
5603dc92c31SAlexander Graf struct dentry *file;
5613dc92c31SAlexander Graf
5623dc92c31SAlexander Graf f = kmalloc(sizeof(*f), GFP_KERNEL);
5633dc92c31SAlexander Graf if (!f)
5643dc92c31SAlexander Graf return -ENOMEM;
5653dc92c31SAlexander Graf
5663dc92c31SAlexander Graf f->wrapper.data = (void *)fdt;
5673dc92c31SAlexander Graf f->wrapper.size = fdt_totalsize(fdt);
5683dc92c31SAlexander Graf
5693dc92c31SAlexander Graf file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
5703dc92c31SAlexander Graf if (IS_ERR(file)) {
5713dc92c31SAlexander Graf kfree(f);
5723dc92c31SAlexander Graf return PTR_ERR(file);
5733dc92c31SAlexander Graf }
5743dc92c31SAlexander Graf
5753dc92c31SAlexander Graf f->file = file;
5763dc92c31SAlexander Graf list_add(&f->list, list);
5773dc92c31SAlexander Graf
5783dc92c31SAlexander Graf return 0;
5793dc92c31SAlexander Graf }
5803dc92c31SAlexander Graf
5813dc92c31SAlexander Graf /**
5823dc92c31SAlexander Graf * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
5833dc92c31SAlexander Graf * @ser: serialization control object passed by KHO notifiers.
5843dc92c31SAlexander Graf * @name: name of the sub tree.
5853dc92c31SAlexander Graf * @fdt: the sub tree blob.
5863dc92c31SAlexander Graf *
5873dc92c31SAlexander Graf * Creates a new child node named @name in KHO root FDT and records
5883dc92c31SAlexander Graf * the physical address of @fdt. The pages of @fdt must also be preserved
5893dc92c31SAlexander Graf * by KHO for the new kernel to retrieve it after kexec.
5903dc92c31SAlexander Graf *
5913dc92c31SAlexander Graf * A debugfs blob entry is also created at
5923dc92c31SAlexander Graf * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
5933dc92c31SAlexander Graf *
5943dc92c31SAlexander Graf * Return: 0 on success, error code on failure
5953dc92c31SAlexander Graf */
kho_add_subtree(struct kho_serialization * ser,const char * name,void * fdt)5963dc92c31SAlexander Graf int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
5973dc92c31SAlexander Graf {
5983dc92c31SAlexander Graf int err = 0;
5993dc92c31SAlexander Graf u64 phys = (u64)virt_to_phys(fdt);
6003dc92c31SAlexander Graf void *root = page_to_virt(ser->fdt);
6013dc92c31SAlexander Graf
6023dc92c31SAlexander Graf err |= fdt_begin_node(root, name);
6033dc92c31SAlexander Graf err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
6043dc92c31SAlexander Graf err |= fdt_end_node(root);
6053dc92c31SAlexander Graf
6063dc92c31SAlexander Graf if (err)
6073dc92c31SAlexander Graf return err;
6083dc92c31SAlexander Graf
6093dc92c31SAlexander Graf return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
6103dc92c31SAlexander Graf }
6113dc92c31SAlexander Graf EXPORT_SYMBOL_GPL(kho_add_subtree);
6123dc92c31SAlexander Graf
/* State for the outgoing (next kernel) side of KHO. */
struct kho_out {
	struct blocking_notifier_head chain_head;	/* KEXEC_KHO_FINALIZE/ABORT subscribers */

	struct dentry *dir;				/* debugfs kho/out directory */

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;			/* in-flight serialization state */
	bool finalized;					/* root FDT committed; preservation frozen */
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};
6353dc92c31SAlexander Graf
/* Subscribe @nb to KHO finalize/abort events (KEXEC_KHO_*). */
int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);
6413dc92c31SAlexander Graf
/* Remove @nb from the KHO finalize/abort notifier chain. */
int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);
6473dc92c31SAlexander Graf
648fc33e4b4SMike Rapoport (Microsoft) /**
649fc33e4b4SMike Rapoport (Microsoft) * kho_preserve_folio - preserve a folio across kexec.
650fc33e4b4SMike Rapoport (Microsoft) * @folio: folio to preserve.
651fc33e4b4SMike Rapoport (Microsoft) *
652fc33e4b4SMike Rapoport (Microsoft) * Instructs KHO to preserve the whole folio across kexec. The order
653fc33e4b4SMike Rapoport (Microsoft) * will be preserved as well.
654fc33e4b4SMike Rapoport (Microsoft) *
655fc33e4b4SMike Rapoport (Microsoft) * Return: 0 on success, error code on failure
656fc33e4b4SMike Rapoport (Microsoft) */
kho_preserve_folio(struct folio * folio)657fc33e4b4SMike Rapoport (Microsoft) int kho_preserve_folio(struct folio *folio)
658fc33e4b4SMike Rapoport (Microsoft) {
659fc33e4b4SMike Rapoport (Microsoft) const unsigned long pfn = folio_pfn(folio);
660fc33e4b4SMike Rapoport (Microsoft) const unsigned int order = folio_order(folio);
661fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_track *track = &kho_out.ser.track;
662fc33e4b4SMike Rapoport (Microsoft)
663fc33e4b4SMike Rapoport (Microsoft) if (kho_out.finalized)
664fc33e4b4SMike Rapoport (Microsoft) return -EBUSY;
665fc33e4b4SMike Rapoport (Microsoft)
666fc33e4b4SMike Rapoport (Microsoft) return __kho_preserve_order(track, pfn, order);
667fc33e4b4SMike Rapoport (Microsoft) }
668fc33e4b4SMike Rapoport (Microsoft) EXPORT_SYMBOL_GPL(kho_preserve_folio);
669fc33e4b4SMike Rapoport (Microsoft)
670fc33e4b4SMike Rapoport (Microsoft) /**
671fc33e4b4SMike Rapoport (Microsoft) * kho_preserve_phys - preserve a physically contiguous range across kexec.
672fc33e4b4SMike Rapoport (Microsoft) * @phys: physical address of the range.
673fc33e4b4SMike Rapoport (Microsoft) * @size: size of the range.
674fc33e4b4SMike Rapoport (Microsoft) *
675fc33e4b4SMike Rapoport (Microsoft) * Instructs KHO to preserve the memory range from @phys to @phys + @size
676fc33e4b4SMike Rapoport (Microsoft) * across kexec.
677fc33e4b4SMike Rapoport (Microsoft) *
678fc33e4b4SMike Rapoport (Microsoft) * Return: 0 on success, error code on failure
679fc33e4b4SMike Rapoport (Microsoft) */
kho_preserve_phys(phys_addr_t phys,size_t size)680fc33e4b4SMike Rapoport (Microsoft) int kho_preserve_phys(phys_addr_t phys, size_t size)
681fc33e4b4SMike Rapoport (Microsoft) {
682fc33e4b4SMike Rapoport (Microsoft) unsigned long pfn = PHYS_PFN(phys);
683fc33e4b4SMike Rapoport (Microsoft) unsigned long failed_pfn = 0;
684fc33e4b4SMike Rapoport (Microsoft) const unsigned long start_pfn = pfn;
685fc33e4b4SMike Rapoport (Microsoft) const unsigned long end_pfn = PHYS_PFN(phys + size);
686fc33e4b4SMike Rapoport (Microsoft) int err = 0;
687fc33e4b4SMike Rapoport (Microsoft) struct kho_mem_track *track = &kho_out.ser.track;
688fc33e4b4SMike Rapoport (Microsoft)
689fc33e4b4SMike Rapoport (Microsoft) if (kho_out.finalized)
690fc33e4b4SMike Rapoport (Microsoft) return -EBUSY;
691fc33e4b4SMike Rapoport (Microsoft)
692fc33e4b4SMike Rapoport (Microsoft) if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
693fc33e4b4SMike Rapoport (Microsoft) return -EINVAL;
694fc33e4b4SMike Rapoport (Microsoft)
695fc33e4b4SMike Rapoport (Microsoft) while (pfn < end_pfn) {
696fc33e4b4SMike Rapoport (Microsoft) const unsigned int order =
697fc33e4b4SMike Rapoport (Microsoft) min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
698fc33e4b4SMike Rapoport (Microsoft)
699fc33e4b4SMike Rapoport (Microsoft) err = __kho_preserve_order(track, pfn, order);
700fc33e4b4SMike Rapoport (Microsoft) if (err) {
701fc33e4b4SMike Rapoport (Microsoft) failed_pfn = pfn;
702fc33e4b4SMike Rapoport (Microsoft) break;
703fc33e4b4SMike Rapoport (Microsoft) }
704fc33e4b4SMike Rapoport (Microsoft)
705fc33e4b4SMike Rapoport (Microsoft) pfn += 1 << order;
706fc33e4b4SMike Rapoport (Microsoft) }
707fc33e4b4SMike Rapoport (Microsoft)
708fc33e4b4SMike Rapoport (Microsoft) if (err)
709fc33e4b4SMike Rapoport (Microsoft) __kho_unpreserve(track, start_pfn, failed_pfn);
710fc33e4b4SMike Rapoport (Microsoft)
711fc33e4b4SMike Rapoport (Microsoft) return err;
712fc33e4b4SMike Rapoport (Microsoft) }
713fc33e4b4SMike Rapoport (Microsoft) EXPORT_SYMBOL_GPL(kho_preserve_phys);
714fc33e4b4SMike Rapoport (Microsoft)
/* Handling for debug/kho/out */

/* Root of the debugfs "kho" hierarchy; parent of both "out" and "in". */
static struct dentry *debugfs_root;
7183dc92c31SAlexander Graf
kho_out_update_debugfs_fdt(void)7193dc92c31SAlexander Graf static int kho_out_update_debugfs_fdt(void)
7203dc92c31SAlexander Graf {
7213dc92c31SAlexander Graf int err = 0;
7223dc92c31SAlexander Graf struct fdt_debugfs *ff, *tmp;
7233dc92c31SAlexander Graf
7243dc92c31SAlexander Graf if (kho_out.finalized) {
7253dc92c31SAlexander Graf err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
7263dc92c31SAlexander Graf "fdt", page_to_virt(kho_out.ser.fdt));
7273dc92c31SAlexander Graf } else {
7283dc92c31SAlexander Graf list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
7293dc92c31SAlexander Graf debugfs_remove(ff->file);
7303dc92c31SAlexander Graf list_del(&ff->list);
7313dc92c31SAlexander Graf kfree(ff);
7323dc92c31SAlexander Graf }
7333dc92c31SAlexander Graf }
7343dc92c31SAlexander Graf
7353dc92c31SAlexander Graf return err;
7363dc92c31SAlexander Graf }
7373dc92c31SAlexander Graf
/*
 * Undo an in-progress or failed finalization: discard all tracked
 * preservations, free the serialized memory map, and notify subscribers
 * so they can roll back their own state.
 */
static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	/* Tear down the per-order bitmap tracking of preserved pages. */
	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	/* Free the serialized form, if kho_mem_serialize() already built it. */
	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}
7703dc92c31SAlexander Graf
/*
 * Build the KHO root FDT: start the tree, let notifier subscribers add
 * their sub FDTs, serialize the preserved memory map, and finish the
 * tree. Any failure rolls the whole finalization back via kho_abort().
 *
 * libfdt sequential-write calls accumulate into @err with |= ; a single
 * check then catches any failure in the batch.
 */
static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	/* The FDT page itself must survive kexec for the next kernel. */
	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	/* Runs after the notifiers, which may preserve more memory. */
	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	/* Patch the placeholder now that the serialized map exists. */
	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}
8193dc92c31SAlexander Graf
/* debugfs "finalize" read handler: report the current finalized state. */
static int kho_out_finalize_get(void *data, u64 *val)
{
	u64 state;

	mutex_lock(&kho_out.lock);
	state = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	*val = state;

	return 0;
}
8283dc92c31SAlexander Graf
/*
 * debugfs "finalize" write handler: writing 1 finalizes the KHO FDT,
 * writing 0 aborts a previous finalization.
 */
static int kho_out_finalize_set(void *data, u64 _val)
{
	bool want = !!_val;
	int ret;

	mutex_lock(&kho_out.lock);

	if (want == kho_out.finalized) {
		/* Already in the requested state. */
		ret = kho_out.finalized ? -EEXIST : -ENOENT;
		goto unlock;
	}

	ret = want ? kho_finalize() : kho_abort();
	if (ret)
		goto unlock;

	kho_out.finalized = want;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}
8593dc92c31SAlexander Graf
/* debugfs kho/out/finalize: read state; write 1/0 to finalize/abort. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");
8623dc92c31SAlexander Graf
scratch_phys_show(struct seq_file * m,void * v)8633dc92c31SAlexander Graf static int scratch_phys_show(struct seq_file *m, void *v)
8643dc92c31SAlexander Graf {
8653dc92c31SAlexander Graf for (int i = 0; i < kho_scratch_cnt; i++)
8663dc92c31SAlexander Graf seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
8673dc92c31SAlexander Graf
8683dc92c31SAlexander Graf return 0;
8693dc92c31SAlexander Graf }
8703dc92c31SAlexander Graf DEFINE_SHOW_ATTRIBUTE(scratch_phys);
8713dc92c31SAlexander Graf
scratch_len_show(struct seq_file * m,void * v)8723dc92c31SAlexander Graf static int scratch_len_show(struct seq_file *m, void *v)
8733dc92c31SAlexander Graf {
8743dc92c31SAlexander Graf for (int i = 0; i < kho_scratch_cnt; i++)
8753dc92c31SAlexander Graf seq_printf(m, "0x%llx\n", kho_scratch[i].size);
8763dc92c31SAlexander Graf
8773dc92c31SAlexander Graf return 0;
8783dc92c31SAlexander Graf }
8793dc92c31SAlexander Graf DEFINE_SHOW_ATTRIBUTE(scratch_len);
8803dc92c31SAlexander Graf
kho_out_debugfs_init(void)8813dc92c31SAlexander Graf static __init int kho_out_debugfs_init(void)
8823dc92c31SAlexander Graf {
8833dc92c31SAlexander Graf struct dentry *dir, *f, *sub_fdt_dir;
8843dc92c31SAlexander Graf
8853dc92c31SAlexander Graf dir = debugfs_create_dir("out", debugfs_root);
8863dc92c31SAlexander Graf if (IS_ERR(dir))
8873dc92c31SAlexander Graf return -ENOMEM;
8883dc92c31SAlexander Graf
8893dc92c31SAlexander Graf sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
8903dc92c31SAlexander Graf if (IS_ERR(sub_fdt_dir))
8913dc92c31SAlexander Graf goto err_rmdir;
8923dc92c31SAlexander Graf
8933dc92c31SAlexander Graf f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
8943dc92c31SAlexander Graf &scratch_phys_fops);
8953dc92c31SAlexander Graf if (IS_ERR(f))
8963dc92c31SAlexander Graf goto err_rmdir;
8973dc92c31SAlexander Graf
8983dc92c31SAlexander Graf f = debugfs_create_file("scratch_len", 0400, dir, NULL,
8993dc92c31SAlexander Graf &scratch_len_fops);
9003dc92c31SAlexander Graf if (IS_ERR(f))
9013dc92c31SAlexander Graf goto err_rmdir;
9023dc92c31SAlexander Graf
9033dc92c31SAlexander Graf f = debugfs_create_file("finalize", 0600, dir, NULL,
9043dc92c31SAlexander Graf &fops_kho_out_finalize);
9053dc92c31SAlexander Graf if (IS_ERR(f))
9063dc92c31SAlexander Graf goto err_rmdir;
9073dc92c31SAlexander Graf
9083dc92c31SAlexander Graf kho_out.dir = dir;
9093dc92c31SAlexander Graf kho_out.ser.sub_fdt_dir = sub_fdt_dir;
9103dc92c31SAlexander Graf return 0;
9113dc92c31SAlexander Graf
9123dc92c31SAlexander Graf err_rmdir:
9133dc92c31SAlexander Graf debugfs_remove_recursive(dir);
9143dc92c31SAlexander Graf return -ENOENT;
9153dc92c31SAlexander Graf }
9163dc92c31SAlexander Graf
/* State for the incoming (previous kernel) side of KHO. */
struct kho_in {
	struct dentry *dir;		/* debugfs kho/in directory */
	phys_addr_t fdt_phys;		/* handover FDT from the previous kernel */
	phys_addr_t scratch_phys;	/* scratch area array from the previous kernel */
	struct list_head fdt_list;	/* debugfs blob entries for exposed FDTs */
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};
927c609c144SAlexander Graf
kho_get_fdt(void)928c609c144SAlexander Graf static const void *kho_get_fdt(void)
929c609c144SAlexander Graf {
930c609c144SAlexander Graf return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
931c609c144SAlexander Graf }
932c609c144SAlexander Graf
933c609c144SAlexander Graf /**
934c609c144SAlexander Graf * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
935c609c144SAlexander Graf * @name: the name of the sub FDT passed to kho_add_subtree().
936c609c144SAlexander Graf * @phys: if found, the physical address of the sub FDT is stored in @phys.
937c609c144SAlexander Graf *
938c609c144SAlexander Graf * Retrieve a preserved sub FDT named @name and store its physical
939c609c144SAlexander Graf * address in @phys.
940c609c144SAlexander Graf *
941c609c144SAlexander Graf * Return: 0 on success, error code on failure
942c609c144SAlexander Graf */
kho_retrieve_subtree(const char * name,phys_addr_t * phys)943c609c144SAlexander Graf int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
944c609c144SAlexander Graf {
945c609c144SAlexander Graf const void *fdt = kho_get_fdt();
946c609c144SAlexander Graf const u64 *val;
947c609c144SAlexander Graf int offset, len;
948c609c144SAlexander Graf
949c609c144SAlexander Graf if (!fdt)
950c609c144SAlexander Graf return -ENOENT;
951c609c144SAlexander Graf
952c609c144SAlexander Graf if (!phys)
953c609c144SAlexander Graf return -EINVAL;
954c609c144SAlexander Graf
955c609c144SAlexander Graf offset = fdt_subnode_offset(fdt, 0, name);
956c609c144SAlexander Graf if (offset < 0)
957c609c144SAlexander Graf return -ENOENT;
958c609c144SAlexander Graf
959c609c144SAlexander Graf val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
960c609c144SAlexander Graf if (!val || len != sizeof(*val))
961c609c144SAlexander Graf return -EINVAL;
962c609c144SAlexander Graf
963c609c144SAlexander Graf *phys = (phys_addr_t)*val;
964c609c144SAlexander Graf
965c609c144SAlexander Graf return 0;
966c609c144SAlexander Graf }
967c609c144SAlexander Graf EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
968c609c144SAlexander Graf
/* Handling for debugfs/kho/in */

/*
 * Expose the handover FDT inherited from the previous kernel, plus every
 * sub FDT it references, as read-only blobs under debugfs kho/in.
 */
static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	/*
	 * A malformed or unexposable sub FDT only loses its debugfs entry;
	 * keep going so the remaining sub FDTs still show up.
	 */
	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}
1018c609c144SAlexander Graf
/*
 * Late init: allocate the page backing the outgoing FDT, set up debugfs,
 * and either expose the incoming handover state (when booted via KHO) or
 * return this boot's scratch areas to the buddy allocator as CMA.
 */
static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	/*
	 * Fresh (non-KHO) boot: hand the memblock-reserved scratch areas to
	 * the buddy allocator as CMA pageblocks so the memory is usable
	 * until the next kexec needs it as scratch.
	 */
	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	/*
	 * NOTE(review): when booted via KHO (fdt != NULL at entry) the
	 * scratch array comes from the previous kernel; this error path
	 * frees those areas too — confirm that is intended on that path.
	 */
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);
10833dc92c31SAlexander Graf
/*
 * Second boot (via KHO): hand the scratch areas inherited from the
 * previous kernel back to the page allocator, marked CMA so only movable
 * allocations land there and the areas can serve as scratch again later.
 */
static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			set_pageblock_migratetype(pfn_to_page(pfn),
						  MIGRATE_CMA);
	}
}
1107c609c144SAlexander Graf
/*
 * Early memory hook: when booting via KHO, adopt the previous kernel's
 * scratch areas and deserialize its preserved memory map; otherwise
 * reserve fresh scratch memory for a future kexec handover.
 */
void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		/* scratch_phys was validated and recorded by kho_populate() */
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}
1124c609c144SAlexander Graf
kho_populate(phys_addr_t fdt_phys,u64 fdt_len,phys_addr_t scratch_phys,u64 scratch_len)1125c609c144SAlexander Graf void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
1126c609c144SAlexander Graf phys_addr_t scratch_phys, u64 scratch_len)
1127c609c144SAlexander Graf {
1128c609c144SAlexander Graf void *fdt = NULL;
1129c609c144SAlexander Graf struct kho_scratch *scratch = NULL;
1130c609c144SAlexander Graf int err = 0;
1131c609c144SAlexander Graf unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
1132c609c144SAlexander Graf
1133c609c144SAlexander Graf /* Validate the input FDT */
1134c609c144SAlexander Graf fdt = early_memremap(fdt_phys, fdt_len);
1135c609c144SAlexander Graf if (!fdt) {
1136c609c144SAlexander Graf pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
1137c609c144SAlexander Graf err = -EFAULT;
1138c609c144SAlexander Graf goto out;
1139c609c144SAlexander Graf }
1140c609c144SAlexander Graf err = fdt_check_header(fdt);
1141c609c144SAlexander Graf if (err) {
1142c609c144SAlexander Graf pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
1143c609c144SAlexander Graf fdt_phys, err);
1144c609c144SAlexander Graf err = -EINVAL;
1145c609c144SAlexander Graf goto out;
1146c609c144SAlexander Graf }
1147c609c144SAlexander Graf err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
1148c609c144SAlexander Graf if (err) {
1149c609c144SAlexander Graf pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
1150c609c144SAlexander Graf fdt_phys, KHO_FDT_COMPATIBLE, err);
1151c609c144SAlexander Graf err = -EINVAL;
1152c609c144SAlexander Graf goto out;
1153c609c144SAlexander Graf }
1154c609c144SAlexander Graf
1155c609c144SAlexander Graf scratch = early_memremap(scratch_phys, scratch_len);
1156c609c144SAlexander Graf if (!scratch) {
1157c609c144SAlexander Graf pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
1158c609c144SAlexander Graf scratch_phys, scratch_len);
1159c609c144SAlexander Graf err = -EFAULT;
1160c609c144SAlexander Graf goto out;
1161c609c144SAlexander Graf }
1162c609c144SAlexander Graf
	/*
	 * We pass safe contiguous blocks of memory to use for early boot
	 * purposes from the previous kernel so that we can resize the
	 * memblock array as needed.
	 */
1168c609c144SAlexander Graf for (int i = 0; i < scratch_cnt; i++) {
1169c609c144SAlexander Graf struct kho_scratch *area = &scratch[i];
1170c609c144SAlexander Graf u64 size = area->size;
1171c609c144SAlexander Graf
1172c609c144SAlexander Graf memblock_add(area->addr, size);
1173c609c144SAlexander Graf err = memblock_mark_kho_scratch(area->addr, size);
1174c609c144SAlexander Graf if (WARN_ON(err)) {
1175c609c144SAlexander Graf pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
1176c609c144SAlexander Graf &area->addr, &size, err);
1177c609c144SAlexander Graf goto out;
1178c609c144SAlexander Graf }
1179c609c144SAlexander Graf pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
1180c609c144SAlexander Graf }
1181c609c144SAlexander Graf
1182c609c144SAlexander Graf memblock_reserve(scratch_phys, scratch_len);
1183c609c144SAlexander Graf
1184c609c144SAlexander Graf /*
1185c609c144SAlexander Graf * Now that we have a viable region of scratch memory, let's tell
1186c609c144SAlexander Graf * the memblocks allocator to only use that for any allocations.
1187c609c144SAlexander Graf * That way we ensure that nothing scribbles over in use data while
1188c609c144SAlexander Graf * we initialize the page tables which we will need to ingest all
1189c609c144SAlexander Graf * memory reservations from the previous kernel.
1190c609c144SAlexander Graf */
1191c609c144SAlexander Graf memblock_set_kho_scratch_only();
1192c609c144SAlexander Graf
1193c609c144SAlexander Graf kho_in.fdt_phys = fdt_phys;
1194c609c144SAlexander Graf kho_in.scratch_phys = scratch_phys;
1195c609c144SAlexander Graf kho_scratch_cnt = scratch_cnt;
1196c609c144SAlexander Graf pr_info("found kexec handover data. Will skip init for some devices\n");
1197c609c144SAlexander Graf
1198c609c144SAlexander Graf out:
1199c609c144SAlexander Graf if (fdt)
1200c609c144SAlexander Graf early_memunmap(fdt, fdt_len);
1201c609c144SAlexander Graf if (scratch)
1202c609c144SAlexander Graf early_memunmap(scratch, scratch_len);
1203c609c144SAlexander Graf if (err)
1204c609c144SAlexander Graf pr_warn("disabling KHO revival: %d\n", err);
1205c609c144SAlexander Graf }
12063bdecc3cSAlexander Graf
12073bdecc3cSAlexander Graf /* Helper functions for kexec_file_load */
12083bdecc3cSAlexander Graf
kho_fill_kimage(struct kimage * image)12093bdecc3cSAlexander Graf int kho_fill_kimage(struct kimage *image)
12103bdecc3cSAlexander Graf {
12113bdecc3cSAlexander Graf ssize_t scratch_size;
12123bdecc3cSAlexander Graf int err = 0;
12133bdecc3cSAlexander Graf struct kexec_buf scratch;
12143bdecc3cSAlexander Graf
12153bdecc3cSAlexander Graf if (!kho_enable)
12163bdecc3cSAlexander Graf return 0;
12173bdecc3cSAlexander Graf
12183bdecc3cSAlexander Graf image->kho.fdt = page_to_phys(kho_out.ser.fdt);
12193bdecc3cSAlexander Graf
12203bdecc3cSAlexander Graf scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
12213bdecc3cSAlexander Graf scratch = (struct kexec_buf){
12223bdecc3cSAlexander Graf .image = image,
12233bdecc3cSAlexander Graf .buffer = kho_scratch,
12243bdecc3cSAlexander Graf .bufsz = scratch_size,
12253bdecc3cSAlexander Graf .mem = KEXEC_BUF_MEM_UNKNOWN,
12263bdecc3cSAlexander Graf .memsz = scratch_size,
12273bdecc3cSAlexander Graf .buf_align = SZ_64K, /* Makes it easier to map */
12283bdecc3cSAlexander Graf .buf_max = ULONG_MAX,
12293bdecc3cSAlexander Graf .top_down = true,
12303bdecc3cSAlexander Graf };
12313bdecc3cSAlexander Graf err = kexec_add_buffer(&scratch);
12323bdecc3cSAlexander Graf if (err)
12333bdecc3cSAlexander Graf return err;
12343bdecc3cSAlexander Graf image->kho.scratch = &image->segment[image->nr_segments - 1];
12353bdecc3cSAlexander Graf
12363bdecc3cSAlexander Graf return 0;
12373bdecc3cSAlexander Graf }
12383bdecc3cSAlexander Graf
kho_walk_scratch(struct kexec_buf * kbuf,int (* func)(struct resource *,void *))12393bdecc3cSAlexander Graf static int kho_walk_scratch(struct kexec_buf *kbuf,
12403bdecc3cSAlexander Graf int (*func)(struct resource *, void *))
12413bdecc3cSAlexander Graf {
12423bdecc3cSAlexander Graf int ret = 0;
12433bdecc3cSAlexander Graf int i;
12443bdecc3cSAlexander Graf
12453bdecc3cSAlexander Graf for (i = 0; i < kho_scratch_cnt; i++) {
12463bdecc3cSAlexander Graf struct resource res = {
12473bdecc3cSAlexander Graf .start = kho_scratch[i].addr,
12483bdecc3cSAlexander Graf .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
12493bdecc3cSAlexander Graf };
12503bdecc3cSAlexander Graf
12513bdecc3cSAlexander Graf /* Try to fit the kimage into our KHO scratch region */
12523bdecc3cSAlexander Graf ret = func(&res, kbuf);
12533bdecc3cSAlexander Graf if (ret)
12543bdecc3cSAlexander Graf break;
12553bdecc3cSAlexander Graf }
12563bdecc3cSAlexander Graf
12573bdecc3cSAlexander Graf return ret;
12583bdecc3cSAlexander Graf }
12593bdecc3cSAlexander Graf
kho_locate_mem_hole(struct kexec_buf * kbuf,int (* func)(struct resource *,void *))12603bdecc3cSAlexander Graf int kho_locate_mem_hole(struct kexec_buf *kbuf,
12613bdecc3cSAlexander Graf int (*func)(struct resource *, void *))
12623bdecc3cSAlexander Graf {
12633bdecc3cSAlexander Graf int ret;
12643bdecc3cSAlexander Graf
12653bdecc3cSAlexander Graf if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
12663bdecc3cSAlexander Graf return 1;
12673bdecc3cSAlexander Graf
12683bdecc3cSAlexander Graf ret = kho_walk_scratch(kbuf, func);
12693bdecc3cSAlexander Graf
12703bdecc3cSAlexander Graf return ret == 1 ? 0 : -EADDRNOTAVAIL;
12713bdecc3cSAlexander Graf }
1272