/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/kexec.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_phys.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#include <machine/intr_machdep.h>
#include <machine/kexec.h>
#include <machine/md_var.h>
#include <machine/pmap.h>
#include <x86/apicvar.h>

/*
 * Idea behind this:
 *
 * kexec_load_md():
 * - Update boot page tables (identity map) to include all pages needed before
 *   disabling the MMU.
 *
 * kexec_reboot_md():
 * - Copy pages into target(s)
 * - Do "other stuff"
 * - Does not return
 */
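
/*
 * Both entry points lean on kexec_generate_page_tables() below:
 * kexec_load_md() pre-builds a standalone 4-level page table (an identity
 * map of the segment targets plus a mapping of the staged source pages and
 * the trampoline), so that kexec_reboot_md() can hand its root to the
 * trampoline and copy everything without the regular kernel pmap.
 */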

/*
 * do_pte: Create PTE entries (4k pages).  If false, create 2MB superpages.
 * identity: This is for an identity map; treat `start` as a physical address.
 *   Only valid here if do_pte is false.
 */
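
/*
 * For example, kexec_load_md() below uses both flavors on one iterator:
 *
 *	kexec_generate_page_tables(PT4, s->target, s->size, false, true,
 *	    &pct_iter);
 *	kexec_generate_page_tables(PT4, image->map_addr, image->map_size,
 *	    true, false, &pct_iter);
 *
 * The first identity-maps a segment's target range with 2MB superpages; the
 * second maps the staged source pages with 4k PTEs.
 */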
static void
kexec_generate_page_tables(pml4_entry_t *root, vm_offset_t start,
    vm_size_t size, bool do_pte, bool identity, struct pctrie_iter *pages)
{
	vm_paddr_t mpa;
	vm_offset_t pg;
	vm_size_t stride = do_pte ? PAGE_SIZE : NBPDR;
	vm_page_t m;
	vm_pindex_t i, j, k, l;

	pg = start & ~(stride - 1);
	i = pmap_pml4e_index(pg);
	j = pmap_pdpe_index(pg);
	k = pmap_pde_index(pg);
	l = pmap_pte_index(pg);
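
	/*
	 * i/j/k/l track the PML4, PDP, PD, and PT slots for the current
	 * address; only the levels below a rollover are reset to 0 by the
	 * loop headers below.
	 */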
	for (; pg < start + size; i++, j = 0, k = 0, l = 0) {
		/*
		 * Walk linearly, as above, but in one fell swoop, one page at
		 * a time.
		 */
		if (root[i] == 0) {
			m = vm_radix_iter_next(pages);
			mpa = VM_PAGE_TO_PHYS(m);
			root[i] = mpa | PG_RW | PG_V;
		}
		pdp_entry_t *pdp =
		    (pdp_entry_t *)(PHYS_TO_DMAP(root[i] & PG_FRAME));
		for (; j < NPDPEPG && pg < start + size; j++, k = 0, l = 0) {
			if (pdp[j] == 0) {
				m = vm_radix_iter_next(pages);
				mpa = VM_PAGE_TO_PHYS(m);
				pdp[j] = mpa | PG_RW | PG_V;
			}
			pd_entry_t *pde =
			    (pd_entry_t *)(PHYS_TO_DMAP(pdp[j] & PG_FRAME));
			for (; k < NPDEPG && pg < start + size; k++, l = 0) {
				if (pde[k] == 0) {
					if (!do_pte) {
						pde[k] =
						    (identity ? pg : pmap_kextract(pg)) |
						    PG_RW | PG_PS | PG_V;
						pg += NBPDR;
						continue;
					}
					m = vm_radix_iter_next(pages);
					mpa = VM_PAGE_TO_PHYS(m);
					pde[k] = mpa | PG_V | PG_RW;
				} else if ((pde[k] & PG_PS) != 0) {
					pg += NBPDR;
					continue;
				}
				/* Populate the PTEs. */
				for (; l < NPTEPG && pg < start + size;
				    l++, pg += PAGE_SIZE) {
					pt_entry_t *pte =
					    (pt_entry_t *)PHYS_TO_DMAP(pde[pmap_pde_index(pg)] & PG_FRAME);
					pte[pmap_pte_index(pg)] =
					    pmap_kextract(pg) | PG_RW | PG_V;
				}
			}
		}
	}
}

void
kexec_reboot_md(struct kexec_image *image)
{
	void (*kexec_do_tramp)(void) = image->md_image;

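	/*
	 * Quiesce interrupt delivery, then jump through the trampoline with
	 * the physical address of the PML4 root built by kexec_load_md()
	 * (the image's first MD page).
	 */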
	intr_disable_all();
	lapic_disable();
	kexec_do_reboot_trampoline(VM_PAGE_TO_PHYS(image->first_md_page),
	    kexec_do_tramp);

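	/* kexec_do_reboot_trampoline() does not return; spin just in case. */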
	for (;;)
		;
}

int
kexec_load_md(struct kexec_image *image)
{
	struct pctrie_iter pct_iter;
	pml4_entry_t *PT4;
	pdp_entry_t *PDP_l;
	pd_entry_t *PD_l0;
	vm_offset_t va;
	int i;

	/*
	 * Start building the page table.
	 * The first part of the page table is standard for all images.
	 */
	vm_offset_t pa_pdp_l, pa_pd_l0, pa_pd_l1, pa_pd_l2, pa_pd_l3;
	vm_page_t m;

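	/* Only 4-level paging is handled here; bail if LA57 is active. */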
	if (la57)
		return (EINVAL);

	vm_radix_iter_init(&pct_iter, &image->map_obj->rtree);
	/* Working linearly through the mapped space, `va` is our tracker. */
	m = vm_radix_iter_lookup(&pct_iter, image->first_md_page->pindex);
	va = (vm_offset_t)image->map_addr + ptoa(m->pindex);
	/* We'll find a place for these later. */
	PT4 = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pdp_l = VM_PAGE_TO_PHYS(m);
	PDP_l = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l0 = VM_PAGE_TO_PHYS(m);
	PD_l0 = (void *)va;
	va += PAGE_SIZE;
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l1 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l2 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);
	pa_pd_l3 = VM_PAGE_TO_PHYS(m);
	m = vm_radix_iter_next(&pct_iter);

	/* 1:1 mapping of lower 4G. */
	PT4[0] = (pml4_entry_t)pa_pdp_l | PG_V | PG_RW;
	PDP_l[0] = (pdp_entry_t)pa_pd_l0 | PG_V | PG_RW;
	PDP_l[1] = (pdp_entry_t)pa_pd_l1 | PG_V | PG_RW;
	PDP_l[2] = (pdp_entry_t)pa_pd_l2 | PG_V | PG_RW;
	PDP_l[3] = (pdp_entry_t)pa_pd_l3 | PG_V | PG_RW;
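	/*
	 * 4 * NPDEPG (2048) PDEs of NBPDR (2MB) apiece cover exactly the
	 * bottom 4GB; writes past the end of PD_l0 land in the _l1/_l2/_l3
	 * pages, which are virtually contiguous in the staging map.
	 */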
	for (i = 0; i < 4 * NPDEPG; i++) {	/* We overflow PD_l0 into _l1, etc. */
		PD_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
		    PG_RW | PG_PS;
	}

	/* Map the target(s) in 2MB chunks. */
	for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
		struct kexec_segment_stage *s = &image->segments[i];

		if (s->size == 0)
			break;
		kexec_generate_page_tables(PT4, s->target, s->size, false,
		    true, &pct_iter);
	}
	/* Now create the source page tables. */
	kexec_generate_page_tables(PT4, image->map_addr, image->map_size, true,
	    false, &pct_iter);
	kexec_generate_page_tables(PT4,
	    trunc_page((vm_offset_t)kexec_do_reboot_trampoline),
	    PAGE_SIZE, true, false, &pct_iter);
	KASSERT(m != NULL, ("kexec_load_md: Missing trampoline page!\n"));

	/* The MD control pages start at this next page. */
	image->md_image = (void *)(image->map_addr + ptoa(m->pindex));
	bcopy(kexec_do_reboot, image->md_image, kexec_do_reboot_size);

	/*
	 * Save the image into the MD page(s), at kexec_saved_image's offset
	 * within the copied kexec_do_reboot blob.
	 */
	bcopy(image, (void *)((vm_offset_t)image->md_image +
	    (vm_offset_t)&kexec_saved_image - (vm_offset_t)&kexec_do_reboot),
	    sizeof(*image));

	return (0);
}

/*
 * Required pages:
 * - L4 (root) - 1
 * - L3 (PDP) - 2 (bottom 512GB, bottom 4 entries used; top range for the
 *   kernel map)
 * - L2 (PD) - 5 (2MB superpage mappings, 1GB each: 4 for the bottom 4GB,
 *   1 for the top)
 * - L1 (PT) - 1 (kexec trampoline page, first MD page)
 * - kexec_do_reboot trampoline - 1
 * - Slop pages for staging (in case it's not aligned nicely) - 3 (worst case)
 *
 * Minimum 9 pages for the direct map.
 */
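/* Worked total: 9 table pages + 1 trampoline page + 3 slop pages = 13. */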
int
kexec_md_pages(struct kexec_segment *seg_in)
{
	struct kexec_segment *segs = seg_in;
	vm_size_t pages = 13;	/* Minimum number of starting pages. */
	vm_paddr_t cur_addr = (1UL << 32) - 1;	/* Bottom 4G will be identity mapped in full. */
	vm_size_t source_total = 0;

	for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
		vm_offset_t start, end;

		if (segs[i].memsz == 0)
			break;

		end = round_2mpage((vm_offset_t)segs[i].mem + segs[i].memsz);
		start = trunc_2mpage((vm_offset_t)segs[i].mem);
		start = max(start, cur_addr + 1);
		/*
		 * Round to cover the full range of page table pages for each
		 * segment.
		 */
		source_total += round_2mpage(end - start);

		/*
		 * The bottom 4GB is identity mapped already in the count, so
		 * skip any segments that end up there; this will
		 * short-circuit that.
		 */
		if (end <= cur_addr + 1)
			continue;

		if (pmap_pml4e_index(end) != pmap_pml4e_index(cur_addr)) {
			/* Need a new 512GB mapping page. */
			pages++;
			pages += howmany(end - (start & ~PML4MASK), NBPML4);
			pages += howmany(end - (start & ~PDPMASK), NBPDP);
			pages += howmany(end - (start & ~PDRMASK), NBPDR);
		} else if (pmap_pdpe_index(end) != pmap_pdpe_index(cur_addr)) {
			pages++;
			pages += howmany(end - (start & ~PDPMASK), NBPDP) - 1;
			pages += howmany(end - (start & ~PDRMASK), NBPDR);
		}
	}
	/*
	 * Be pessimistic when totaling up source pages.  We likely can't use
	 * superpages, so we need to map each page individually.
	 */
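	/* One PT per 2MB, one PD per 1GB, one PDP per 512GB of source. */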
	pages += howmany(source_total, NBPDR);
	pages += howmany(source_total, NBPDP);
	pages += howmany(source_total, NBPML4);

	/*
	 * Be intentionally sloppy adding in the extra page table pages.  It's
	 * better to go over than under.
	 */
	pages += howmany(pages * PAGE_SIZE, NBPDR);
	pages += howmany(pages * PAGE_SIZE, NBPDP);
	pages += howmany(pages * PAGE_SIZE, NBPML4);

	/* Add in the trampoline pages. */
	pages += howmany(kexec_do_reboot_size, PAGE_SIZE);

	return (pages);
}