/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#ifdef INTRNG
#include <sys/intr.h>
#endif
#include <sys/kexec.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>

#include <machine/kexec.h>

#ifndef KEXEC_MD_PAGES
/*
 * Number of machine-dependent (MD) pages needed for extra bookkeeping.
 * This is a macro because it can expand to a constant (some architectures
 * define it as 0).  It takes one argument: an array of
 * kexec_segment[KEXEC_SEGMENT_MAX].
 */
#define KEXEC_MD_PAGES(x) 0
#endif
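
/*
 * Illustrative only: an architecture that needs, say, one page for a reboot
 * trampoline and one for bookkeeping could override this in <machine/kexec.h>
 * with something like
 *
 *	#define KEXEC_MD_PAGES(x)	2
 *
 * (a hypothetical value, not taken from any particular port).
 */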

/*
 * Basic design:
 *
 * Given an array of "segment descriptors", stage an image to be loaded and
 * jumped to at reboot, instead of rebooting via firmware.
 *
 * Constraints:
 * - Each segment descriptor's target range ("mem" through "mem" + "memsz")
 *   must fit within a single vm_phys_seg segment (the list of vm_phys_seg
 *   segments can be obtained via the `vm.phys_segs` sysctl).  A single kexec
 *   segment cannot span multiple vm_phys_seg segments, even if those
 *   segments are adjacent.
 *
 * Technical details:
 *
 * Take advantage of the VM subsystem and create a vm_object to hold the staged
 * image.  When grabbing pages for the object, sort the pages so that if a page
 * in the object is located in the physical range of any of the kexec segment
 * targets then it gets placed at the pindex corresponding to that physical
 * address.  This avoids the chance of corruption by writing over the page in
 * the final copy, or the need for a copy buffer page.
 */
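
/*
 * Illustrative sketch of the userspace side (all names are placeholders, not
 * part of this file): a caller staging a single image to be copied to the
 * physical address target_pa, with entry point entry_pa, might pass
 *
 *	struct kexec_segment seg = {
 *		.buf = image,
 *		.bufsz = image_size,
 *		.mem = (void *)target_pa,
 *		.memsz = roundup(image_size, PAGE_SIZE),
 *	};
 *	kexec_load(entry_pa, 1, &seg, 0);
 *
 * Any tail between bufsz and memsz is zero-filled by the kernel below.
 */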

static struct kexec_image staged_image;
static vm_offset_t stage_addr;
static vm_object_t kexec_obj;

static eventhandler_tag kexec_reboot_handler;
static struct mtx kexec_mutex;

static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");

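/*
 * shutdown_final handler: if RB_KEXEC was requested and an image has been
 * staged, stop the other CPUs, disable interrupts, and hand control to the
 * machine-dependent code to jump into the staged image instead of returning
 * to the firmware.
 */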
static void
kexec_reboot(void *junk __unused, int howto)
{
        if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
                return;

#ifdef SMP
        cpu_mp_stop();
#endif /* SMP */
        intr_disable();
        printf("Starting kexec reboot\n");

        scheduler_stopped = true;
        kexec_reboot_md(&staged_image);
}

MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);

/* Sort the segment list by target physical address once copied in. */
static int
seg_cmp(const void *seg1, const void *seg2)
{
        const struct kexec_segment *s1, *s2;

        s1 = seg1;
        s2 = seg2;

        /* Compare explicitly to avoid truncating a pointer difference. */
        if ((uintptr_t)s1->mem < (uintptr_t)s2->mem)
                return (-1);
        if ((uintptr_t)s1->mem > (uintptr_t)s2->mem)
                return (1);
        return (0);
}

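/*
 * Check that a segment's target range [mem, mem + memsz) lies entirely within
 * a single vm_phys_seg segment; spanning even adjacent segments is rejected.
 */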
static bool
segment_fits(struct kexec_segment *seg)
{
        vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;

        for (int i = 0; i < vm_phys_nsegs; i++) {
                if (v >= vm_phys_segs[i].start &&
                    (v + seg->memsz - 1) <= vm_phys_segs[i].end)
                        return (true);
        }

        return (false);
}

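/*
 * Translate an object pindex back to the target physical address it will be
 * copied to, using the sorted staged-segment list.  Panics if no segment
 * covers the given pindex.
 */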
static vm_paddr_t
pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
{
        for (int i = count; i > 0; --i) {
                if (pind >= segs[i - 1].pindex)
                        return (ptoa(pind - segs[i - 1].pindex) +
                            segs[i - 1].target);
        }

        panic("No segment for pindex %ju\n", (uintmax_t)pind);
}

/*
 * For now this is still tied to the system call, so it assumes all segment
 * buffers are in userspace.
 */
int
kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
    struct kexec_segment *seg, u_long flags)
{
        static int kexec_loading;
        struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
        struct kexec_image *new_image_stage = NULL;
        vm_object_t new_segments = NULL;
        uint8_t *buf;
        int err = 0;
        int i;
        const size_t segsize = nseg * sizeof(struct kexec_segment);
        vm_page_t *page_list = NULL;
        vm_size_t image_count, md_pages, page_count, tmpsize;
        vm_offset_t segment_va = 0;

        /*
         * - Do any sanity checking
         * - Load the new segments into a temporary object
         * - Remove the old segments
         * - Install the new segments
         */

        if (nseg > KEXEC_SEGMENT_MAX)
                return (EINVAL);

        if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
                return (EBUSY);

        /* Only do error checking if we're installing new segments. */
        if (nseg > 0) {
                /* Create the new kexec object before destroying the old one. */
                bzero(&segtmp, sizeof(segtmp));
                err = copyin(seg, segtmp, segsize);
                if (err != 0)
                        goto out;
                qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
                new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP,
                    M_WAITOK | M_ZERO);
                /*
                 * Sanity checking:
                 * - No segment may overlap the kernel, so each must be fully
                 *   enclosed in a vm_phys_seg (a kexec segment must fit in a
                 *   single vm_phys_seg segment and cannot cross even adjacent
                 *   segments).
                 */
                image_count = 0;
                for (i = 0; i < nseg; i++) {
                        if (!segment_fits(&segtmp[i]) ||
                            segtmp[i].bufsz > segtmp[i].memsz) {
                                err = EINVAL;
                                goto out;
                        }
                        new_image_stage->segments[i].pindex = image_count;
                        new_image_stage->segments[i].target =
                            (vm_offset_t)segtmp[i].mem;
                        new_image_stage->segments[i].size = segtmp[i].memsz;
                        image_count += atop(segtmp[i].memsz);
                }
                md_pages = KEXEC_MD_PAGES(segtmp);
                page_count = image_count + md_pages;
                new_segments = vm_object_allocate(OBJT_PHYS, page_count);
                page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP,
                    M_WAITOK);

                /*
                 * - Grab all pages for all segments (use pindex to slice it)
                 * - Walk the list (once)
                 * - At each pindex, check if the target PA that corresponds
                 *   to that index is in the object.  If so, swap the pages.
                 * - At the end of this the list will be "best" sorted.
                 */
                vm_page_grab_pages_unlocked(new_segments, 0,
                    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
                    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
                    page_list, page_count);

                /* Sort the pages to best match the PA */
                VM_OBJECT_WLOCK(new_segments);
                for (i = 0; i < image_count; i++) {
                        vm_page_t curpg, otherpg, tmp;
                        vm_pindex_t otheridx;

                        curpg = page_list[i];
                        otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(
                            new_image_stage->segments, nseg, curpg->pindex));
                        otheridx = otherpg->pindex;

                        if (otherpg->object == new_segments) {
                                /*
                                 * Swap 'curpg' and 'otherpg', since 'otherpg'
                                 * is at the PA 'curpg' covers.
                                 */
                                vm_radix_remove(&new_segments->rtree, otheridx);
                                vm_radix_remove(&new_segments->rtree, i);
                                otherpg->pindex = i;
                                curpg->pindex = otheridx;
                                vm_radix_insert(&new_segments->rtree, curpg);
                                vm_radix_insert(&new_segments->rtree, otherpg);
                                tmp = curpg;
                                page_list[i] = otherpg;
                                page_list[otheridx] = tmp;
                        }
                }
                for (i = 0; i < nseg; i++) {
                        new_image_stage->segments[i].first_page =
                            vm_radix_lookup(&new_segments->rtree,
                            new_image_stage->segments[i].pindex);
                }
                if (md_pages > 0)
                        new_image_stage->first_md_page =
                            vm_radix_lookup(&new_segments->rtree,
                            page_count - md_pages);
                else
                        new_image_stage->first_md_page = NULL;
                VM_OBJECT_WUNLOCK(new_segments);

                /* Map the object to do the copies */
                err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
                    ptoa(page_count), 0, VMFS_ANY_SPACE,
                    VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
                if (err != 0)
                        goto out;
                buf = (void *)segment_va;
                new_image_stage->map_addr = segment_va;
                new_image_stage->map_size = ptoa(new_segments->size);
                new_image_stage->entry = entry;
                new_image_stage->map_obj = new_segments;
                for (i = 0; i < nseg; i++) {
                        err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
                        if (err != 0)
                                goto out;
                        new_image_stage->segments[i].map_buf = buf;
                        buf += segtmp[i].bufsz;
                        tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
                        if (tmpsize > 0)
                                memset(buf, 0, tmpsize);
                        buf += tmpsize;
                }
                /* What's left are the MD pages, so zero them all out. */
                if (md_pages > 0)
                        bzero(buf, ptoa(md_pages));

                cpu_flush_dcache((void *)segment_va, ptoa(page_count));
                if ((err = kexec_load_md(new_image_stage)) != 0)
                        goto out;
        }
        if (kexec_obj != NULL) {
                /* Object sizes are in pages; unwire and unmap in bytes. */
                vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0);
                KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
                vm_map_remove(kernel_map, stage_addr,
                    stage_addr + ptoa(kexec_obj->size));
        }
        kexec_obj = new_segments;
        /* Remember where the new staging object is mapped (0 if unloading). */
        stage_addr = segment_va;
        bzero(&staged_image, sizeof(staged_image));
        if (nseg > 0)
                memcpy(&staged_image, new_image_stage,
                    sizeof(*new_image_stage));

        printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);
        if (nseg > 0) {
                if (kexec_reboot_handler == NULL)
                        kexec_reboot_handler =
                            EVENTHANDLER_REGISTER(shutdown_final,
                            kexec_reboot, NULL, SHUTDOWN_PRI_DEFAULT - 150);
        } else {
                if (kexec_reboot_handler != NULL)
                        EVENTHANDLER_DEREGISTER(shutdown_final,
                            kexec_reboot_handler);
        }
out:
        /* Clean up the mess if we've gotten this far. */
        if (err != 0 && new_segments != NULL) {
                vm_object_unwire(new_segments, 0, ptoa(new_segments->size), 0);
                if (segment_va != 0)
                        vm_map_remove(kernel_map, segment_va,
                            segment_va + ptoa(new_segments->size));
                else
                        vm_object_deallocate(new_segments);
        }
        atomic_store_rel_int(&kexec_loading, false);
        if (new_image_stage != NULL)
                free(new_image_stage, M_TEMP);
        if (page_list != NULL)
                free(page_list, M_TEMP);

        return (err);
}

int
sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
{
        int error;

        /* FIXME: Do we need a better privilege check than PRIV_REBOOT here? */
        error = priv_check(td, PRIV_REBOOT);
        if (error != 0)
                return (error);
        return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments,
            uap->flags));
}