/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kexec.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_phys.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#include <machine/intr_machdep.h>
#include <machine/kexec.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <x86/apicvar.h>

/*
 * Idea behind this:
 *
 * kexec_load_md():
 * - Update the boot page tables (identity map) to include all pages needed
 *   before the MMU is disabled.
 *
 * kexec_reboot_md():
 * - Copy pages into target(s)
 * - Do "other stuff"
 * - Does not return
 */

/*
 * do_pte: Create PTE entries (4k pages). If false, create 2MB superpages.
 * identity: Build an identity map, treating `start` as a physical address.
 * Only valid here when do_pte is false.
 */
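/*
 * For example, kexec_load_md() below uses both modes: 2MB identity mappings
 * for each target segment, then 4K mappings of the staged source pages at
 * their kernel addresses:
 *
 *	kexec_generate_page_tables(PT4, s->target, s->size, false, true,
 *	    &pct_iter);
 *	kexec_generate_page_tables(PT4, image->map_addr, image->map_size,
 *	    true, false, &pct_iter);
 */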
static void
kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
    vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
{
	vm_paddr_t mpa;
	vm_offset_t pg;
	vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
	vm_page_t m;
	vm_pindex_t i, j, k, l;

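	/*
	 * Seed the PML4/PDP/PD/PT indices from the aligned start address;
	 * each inner loop below restarts its index at zero whenever an
	 * outer level advances.
	 */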
	pg = start & ~(stride - 1);
	i = pmap_pml4e_index(pg);
	j = pmap_pdpe_index(pg);
	k = pmap_pde_index(pg);
	l = pmap_pte_index(pg);
	for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
		/*
		 * Walk the table levels linearly in one pass, allocating new
		 * table pages from the iterator as needed, one page at a
		 * time.
		 */
		if (root[i] == 0) {
			m = vm_radix_iter_next(pages);
			mpa = VM_PAGE_TO_PHYS(m);
			root[i] = mpa | PG_RW | PG_V;
		}
		pdp_entry_t *pdp =
			(pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
		for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
			if (pdp[j] == 0) {
				m = vm_radix_iter_next(pages);
				mpa = VM_PAGE_TO_PHYS(m);
				pdp[j] = mpa | PG_RW | PG_V;
			}
			pd_entry_t *pde =
			    (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
			for (; k < NPDEPG && pg < start + size; k++, l = 0) {
				if (pde[k] == 0) {
					if (!do_pte) {
						pde[k] =
						    (identity ? pg : pmap_kextract(pg)) |
						    PG_RW | PG_PS | PG_V;
						pg += NBPDR;
						continue;
					}
					m = vm_radix_iter_next(pages);
					mpa = VM_PAGE_TO_PHYS(m);
					pde[k] = mpa | PG_V | PG_RW;
				} else if ((pde[k] & PG_PS) != 0) {
					pg += NBPDR;
					continue;
				}
				/* Populate the PTEs. */
				for (; l < NPTEPG && pg < start + size;
				    l++, pg += PAGE_SIZE) {
					pt_entry_t *pte =
					    (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
					pte[pmap_pte_index(pg)] =
					    pmap_kextract(pg) | PG_RW | PG_V;
				}
			}
		}
	}
}

void
kexec_reboot_md(struct kexec_image *image)
{
	void (*kexec_do_tramp)(void) = image->md_image;

	intr_disable_all();
	lapic_disable();
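	/*
	 * The trampoline receives the physical address of the new PML4 root
	 * (the image's first MD page) and the staged kexec_do_reboot copy;
	 * it is expected to install those page tables and jump to that
	 * copy, never returning.  The loop below is only a safety net.
	 */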
	kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
	    kexec_do_tramp);

	for (;;)
		;
}

int
kexec_load_md(struct kexec_image *image)
{
	struct pctrie_iter pct_iter;
	pml4_entry_t *PT4;
	pdp_entry_t *PDP_l;
	pd_entry_t *PD_l0;
	vm_offset_t va;
	int i;

	/*
	 * Start building the page table.
	 * The first part of the page table is the same for every image.
	 */
	vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
	vm_page_t m;

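	/* Five-level paging (LA57) is not supported here yet. */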
	if (la57)
		return (EINVAL);

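	/*
	 * MD pages are consumed from the image's map object in iterator
	 * order: the PML4 root (the first MD page), one PDP and four PDs
	 * for the bottom-4GB identity map, any further table pages that
	 * kexec_generate_page_tables() pulls from the iterator, and
	 * finally the staged kexec_do_reboot trampoline.
	 */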
	vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
	/* Work linearly through the mapped space; `va` is our tracker. */
	m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
	va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
	/* We'll find a place for these later */
	PT4 = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pdp_l = VM_PAGE_TO_PHYS(m);
	PDP_l = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l0 = VM_PAGE_TO_PHYS(m);
	PD_l0 = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l1 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l2 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l3 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);

	/* 1:1 mapping of lower 4G */
	PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
	PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
	PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
	PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
	PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
	for (i = 0; i < 4 * NPDEPG; i++) {	/* we overflow PD_l0 into _l1, etc */
		PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
		    PG_RW | PG_PS;
	}

	/* Map the target(s) in 2MB chunks. */
	for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
		struct kexec_segment_stage *s = &image->segments[i];

		if (s->size == 0)
			break;
		kexec_generate_page_tables(PT4, s->target, s->size, false,
		    true, &pct_iter);
	}
	/* Now create the source page tables */
	kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
	    false, &pct_iter);
	kexec_generate_page_tables(PT4,
	    trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
	    PAGE_SIZE, true, false, &pct_iter);
	KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!"));

	/* MD control pages start at this next page. */
	image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
	bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);

	/* Save the image into the MD page(s) right after the trampoline */
	bcopy(image, (void *)((vm_offset_t)image->md_image +
	    (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
	    sizeof(*image));

	return (0);
}

/*
 * Required pages:
 * - L4 (1) (root)
 * - L3 (PDPE) - 2 (bottom 512GB, bottom 4 used, top range for kernel map)
 * - L2 (PDP) - 5 (2MB superpage mappings, 1GB each, for bottom 4GB, top 1)
 * - L1 (PDR) - 1 (kexec trampoline page, first MD page)
 * - kexec_do_reboot trampoline - 1
 * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
 *
 * Minimum 9 pages for the direct map.
 */
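/*
 * Worked example (hypothetical numbers): a single 16MB segment targeted at
 * exactly 4GB starts from the 13-page minimum, adds 1 + 0 + 8 pages for its
 * new PDPE page and 2MB target mappings, 8 + 1 + 1 pages for mapping the
 * staged source with 4K pages, and 3 slop pages for the page-table pages
 * themselves, ending at 35 + howmany(kexec_do_reboot_size, PAGE_SIZE).
 */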
int
kexec_md_pages(struct kexec_segment *seg_in)
{
	struct kexec_segment *segs = seg_in;
	vm_size_t pages = 13;	/* Minimum number of starting pages */
	vm_paddr_t cur_addr = (1UL << 32) - 1;	/* Bottom 4G will be identity mapped in full */
	vm_size_t source_total = 0;

	for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
		vm_offset_t start, end;

		if (segs[i].memsz == 0)
			break;

		end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
		start = trunc_2mpage((vm_offset_t)segs[i].mem);
		start = max(start, cur_addr + 1);
		/*
		 * Round to cover the full range of page table pages for each
		 * segment.
		 */
		source_total += round_2mpage(end - start);

		/*
		 * The bottom 4GB is already identity mapped in the count, so
		 * skip any segment that falls entirely below it.
		 */
		if (end <= cur_addr + 1)
			continue;

		if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
			/* Need a new 512GB mapping page */
			pages++;
			pages += howmany(end - (start & ~PML4MASK), NBPML4);
			pages += howmany(end - (start & ~PDPMASK), NBPDP);
			pages += howmany(end - (start & ~PDRMASK), NBPDR);
		} else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
			pages++;
			pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
			pages += howmany(end - (start & ~PDRMASK), NBPDR);
		}
	}
	/*
	 * Be pessimistic when totaling up source pages.  We likely can't use
	 * superpages, so we need to map each page individually.
	 */
	pages += howmany(source_total, NBPDR);
	pages += howmany(source_total, NBPDP);
	pages += howmany(source_total, NBPML4);

	/*
	 * Be intentionally sloppy adding in the extra page table pages. It's
	 * better to go over than under.
	 */
	pages += howmany(pages * PAGE_SIZE, NBPDR);
	pages += howmany(pages * PAGE_SIZE, NBPDP);
	pages += howmany(pages * PAGE_SIZE, NBPML4);

	/* Add in the trampoline pages */
	pages += howmany(kexec_do_reboot_size, PAGE_SIZE);

	return (pages);
}