/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#ifdef INTRNG
#include <sys/intr.h>
#endif
#include <sys/kexec.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>

#include <machine/kexec.h>

#ifndef KEXEC_MD_PAGES
/*
 * Number of MD pages for extra bookkeeping.
 * This is a macro because it can be a constant (some architectures define it
 * as 0).  It takes one argument: an array of
 * kexec_segment[KEXEC_SEGMENT_MAX].
 */
#define	KEXEC_MD_PAGES(x)	0
#endif

/*
 * Basic design:
 *
 * Given an array of "segment descriptors", stage an image to be loaded and
 * jumped to at reboot, instead of rebooting via firmware.
 *
 * Constraints:
 * - The physical range described by each segment descriptor ("mem" through
 *   "mem" + "memsz") must lie entirely within a single vm_phys_seg segment;
 *   the available segments can be listed via the `vm.phys_segs` sysctl.
 *   A single kexec segment cannot span multiple vm_phys_seg segments, even
 *   if those vm_phys_seg segments are adjacent.
 *
 * Technical details:
 *
 * Take advantage of the VM subsystem and create a vm_object to hold the staged
 * image. When grabbing pages for the object, sort the pages so that if a page
 * in the object is located in the physical range of any of the kexec segment
 * targets then it gets placed at the pindex corresponding to that physical
 * address. This avoids the chance of corruption by writing over the page in
 * the final copy, or the need for a copy buffer page.
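 *
 * For example, if the page grabbed for pindex N already happens to sit at
 * the physical address that pindex N will ultimately be copied to, it stays
 * put; if some other page of the object occupies that address, the two pages
 * simply trade pindices.  Either way the final copy never overwrites a page
 * that still holds staged data destined for another location.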
 */

static struct kexec_image staged_image;
static vm_offset_t stage_addr;
static vm_object_t kexec_obj;

static eventhandler_tag kexec_reboot_handler;
static struct mtx kexec_mutex;

static MALLOC_DEFINE(M_KEXEC, "kexec", "Kexec segments");

static void
kexec_reboot(void *junk __unused, int howto)
{
	if ((howto & RB_KEXEC) == 0 || kexec_obj == NULL)
		return;

#ifdef SMP
	cpu_mp_stop();
#endif /* SMP */
	intr_disable();
	printf("Starting kexec reboot\n");

	scheduler_stopped = true;
	kexec_reboot_md(&staged_image);
}

MTX_SYSINIT(kexec_mutex, &kexec_mutex, "kexec", MTX_DEF);

/* Sort the segment list by target address once it has been copied in. */
static int
seg_cmp(const void *seg1, const void *seg2)
{
	const struct kexec_segment *s1, *s2;

	s1 = seg1;
	s2 = seg2;

	/*
	 * Compare explicitly rather than subtracting: the difference of two
	 * uintptr_t values can overflow an int and yield the wrong sign.
	 */
	if ((uintptr_t)s1->mem < (uintptr_t)s2->mem)
		return (-1);
	if ((uintptr_t)s1->mem > (uintptr_t)s2->mem)
		return (1);
	return (0);
}

/* Does the segment's target range fit entirely within one vm_phys_seg? */
static bool
segment_fits(struct kexec_segment *seg)
{
	vm_paddr_t v = (vm_paddr_t)(uintptr_t)seg->mem;

	for (int i = 0; i < vm_phys_nsegs; i++) {
		if (v >= vm_phys_segs[i].start &&
		    (v + seg->memsz - 1) <= vm_phys_segs[i].end)
			return (true);
	}

	return (false);
}

/* Map an object pindex back to the physical address it will be copied to. */
static vm_paddr_t
pa_for_pindex(struct kexec_segment_stage *segs, int count, vm_pindex_t pind)
{
	for (int i = count; i > 0; --i) {
		if (pind >= segs[i - 1].pindex)
			return (ptoa(pind - segs[i - 1].pindex) +
			    segs[i - 1].target);
	}

	panic("No segment for pindex %ju\n", (uintmax_t)pind);
}

/*
 * For now this is still tied to the system call, so it assumes all source
 * buffers are userspace memory.
 */
int
kern_kexec_load(struct thread *td, u_long entry, u_long nseg,
    struct kexec_segment *seg, u_long flags)
{
	static int kexec_loading;
	struct kexec_segment segtmp[KEXEC_SEGMENT_MAX];
	struct kexec_image *new_image_stage = NULL;
	vm_object_t new_segments = NULL;
	uint8_t *buf;
	int err = 0;
	int i;
	const size_t segsize = nseg * sizeof(struct kexec_segment);
	vm_page_t *page_list = NULL;
	vm_size_t image_count, md_pages, page_count, tmpsize;
	vm_offset_t segment_va = 0;

	/*
	 * - Do any sanity checking
	 * - Load the new segments into a temporary object
	 * - Remove the old segments
	 * - Install the new segments
	 */

	if (nseg > KEXEC_SEGMENT_MAX)
		return (EINVAL);

	if (atomic_cmpset_acq_int(&kexec_loading, false, true) == 0)
		return (EBUSY);

	/* Only do error checking if we're installing new segments. */
	if (nseg > 0) {
		/* Create the new kexec object before destroying the old one. */
		bzero(&segtmp, sizeof(segtmp));
		err = copyin(seg, segtmp, segsize);
		if (err != 0)
			goto out;
		qsort(segtmp, nseg, sizeof(*segtmp), seg_cmp);
		new_image_stage = malloc(sizeof(*new_image_stage), M_TEMP,
		    M_WAITOK | M_ZERO);
		/*
		 * Sanity checking:
		 * - No segment may overlap the kernel, so each must be fully
		 *   enclosed in a single vm_phys_seg segment (a kexec segment
		 *   cannot cross even adjacent vm_phys_seg segments).
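		 * - A segment's bufsz must not exceed its memsz; the tail
		 *   beyond bufsz is zero-filled when the image is staged.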
		 */
		image_count = 0;
		for (i = 0; i < nseg; i++) {
			if (!segment_fits(&segtmp[i]) ||
			    segtmp[i].bufsz > segtmp[i].memsz) {
				err = EINVAL;
				goto out;
			}
			new_image_stage->segments[i].pindex = image_count;
			new_image_stage->segments[i].target =
			    (vm_offset_t)segtmp[i].mem;
			new_image_stage->segments[i].size = segtmp[i].memsz;
			image_count += atop(segtmp[i].memsz);
		}
		md_pages = KEXEC_MD_PAGES(segtmp);
		page_count = image_count + md_pages;
		new_segments = vm_object_allocate(OBJT_PHYS, page_count);
		page_list = malloc(page_count * sizeof(vm_page_t), M_TEMP,
		    M_WAITOK);

		/*
		 * - Grab all pages for all segments (use pindex to slice it)
		 * - Walk the list (once)
		 *   - At each pindex, check if the target PA that corresponds
		 *     to that index is in the object. If so, swap the pages.
		 *   - At the end of this the list will be "best" sorted.
		 */
		vm_page_grab_pages_unlocked(new_segments, 0,
		    VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
		    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO,
		    page_list, page_count);

		/* Sort the pages to best match the PA */
		VM_OBJECT_WLOCK(new_segments);
		for (i = 0; i < image_count; i++) {
			vm_page_t curpg, otherpg, tmp;
			vm_pindex_t otheridx;

			curpg = page_list[i];
			otherpg = PHYS_TO_VM_PAGE(pa_for_pindex(
			    new_image_stage->segments, nseg, curpg->pindex));
			otheridx = otherpg->pindex;

			if (otherpg->object == new_segments) {
				/*
				 * Swap 'curpg' and 'otherpg', since 'otherpg'
				 * is at the PA 'curpg' covers.
				 */
				vm_radix_remove(&new_segments->rtree, otheridx);
				vm_radix_remove(&new_segments->rtree, i);
				otherpg->pindex = i;
				curpg->pindex = otheridx;
				vm_radix_insert(&new_segments->rtree, curpg);
				vm_radix_insert(&new_segments->rtree, otherpg);
				tmp = curpg;
				page_list[i] = otherpg;
				page_list[otheridx] = tmp;
			}
		}
		for (i = 0; i < nseg; i++) {
			new_image_stage->segments[i].first_page =
			    vm_radix_lookup(&new_segments->rtree,
			    new_image_stage->segments[i].pindex);
		}
		if (md_pages > 0)
			new_image_stage->first_md_page =
			    vm_radix_lookup(&new_segments->rtree,
			    page_count - md_pages);
		else
			new_image_stage->first_md_page = NULL;
		VM_OBJECT_WUNLOCK(new_segments);

		/* Map the object to do the copies */
		err = vm_map_find(kernel_map, new_segments, 0, &segment_va,
		    ptoa(page_count), 0, VMFS_ANY_SPACE,
		    VM_PROT_RW, VM_PROT_RW, MAP_PREFAULT);
		if (err != 0)
			goto out;
		buf = (void *)segment_va;
		new_image_stage->map_addr = segment_va;
		new_image_stage->map_size = ptoa(new_segments->size);
		new_image_stage->entry = entry;
		new_image_stage->map_obj = new_segments;
		for (i = 0; i < nseg; i++) {
			err = copyin(segtmp[i].buf, buf, segtmp[i].bufsz);
			if (err != 0)
				goto out;
			new_image_stage->segments[i].map_buf = buf;
			buf += segtmp[i].bufsz;
			tmpsize = segtmp[i].memsz - segtmp[i].bufsz;
			if (tmpsize > 0)
				memset(buf, 0, tmpsize);
			buf += tmpsize;
		}
		/*
		 * What's left are the MD pages, so zero them all out.
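		 * These are the KEXEC_MD_PAGES() pages reserved at the tail of
		 * the object for machine-dependent bookkeeping; kexec_load_md()
		 * fills them in below as needed.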
		 */
		if (md_pages > 0)
			bzero(buf, ptoa(md_pages));

		cpu_flush_dcache((void *)segment_va, ptoa(page_count));
		if ((err = kexec_load_md(new_image_stage)) != 0)
			goto out;
	}
	if (kexec_obj != NULL) {
		/*
		 * The object's size is in pages; convert to bytes for the
		 * unwire and unmap.
		 */
		vm_object_unwire(kexec_obj, 0, ptoa(kexec_obj->size), 0);
		KASSERT(stage_addr != 0, ("Mapped kexec_obj without address"));
		vm_map_remove(kernel_map, stage_addr,
		    stage_addr + ptoa(kexec_obj->size));
	}
	kexec_obj = new_segments;
	/* Remember where the new object is mapped (0 when nothing is staged). */
	stage_addr = segment_va;
	bzero(&staged_image, sizeof(staged_image));
	if (nseg > 0)
		memcpy(&staged_image, new_image_stage,
		    sizeof(*new_image_stage));

	printf("trampoline at %#jx\n", (uintmax_t)staged_image.entry);
	if (nseg > 0) {
		if (kexec_reboot_handler == NULL)
			kexec_reboot_handler =
			    EVENTHANDLER_REGISTER(shutdown_final, kexec_reboot,
			    NULL, SHUTDOWN_PRI_DEFAULT - 150);
	} else {
		if (kexec_reboot_handler != NULL)
			EVENTHANDLER_DEREGISTER(shutdown_final,
			    kexec_reboot_handler);
	}
out:
	/* Clean up the mess if we got far enough to make one. */
	if (err != 0 && new_segments != NULL) {
		vm_object_unwire(new_segments, 0, ptoa(new_segments->size), 0);
		if (segment_va != 0)
			vm_map_remove(kernel_map, segment_va,
			    segment_va + ptoa(new_segments->size));
		else
			vm_object_deallocate(new_segments);
	}
	atomic_store_rel_int(&kexec_loading, false);
	if (new_image_stage != NULL)
		free(new_image_stage, M_TEMP);
	if (page_list != NULL)
		free(page_list, M_TEMP);

	return (err);
}

int
sys_kexec_load(struct thread *td, struct kexec_load_args *uap)
{
	int error;

	/* FIXME: Do we need a better privilege check than PRIV_REBOOT here? */
	error = priv_check(td, PRIV_REBOOT);
	if (error != 0)
		return (error);
	return (kern_kexec_load(td, uap->entry, uap->nseg, uap->segments,
	    uap->flags));
}
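
/*
 * Usage sketch (illustrative only): a userland loader describes each region
 * to stage with a kexec_segment, loads the set through this system call, and
 * later reboots with RB_KEXEC to trigger the shutdown_final handler above.
 * The wrapper name kexec_load() and the target address below are assumptions
 * made for the example, not something defined in this file:
 *
 *	struct kexec_segment seg = {
 *		.buf = image_buf,
 *		.bufsz = image_len,
 *		.mem = (void *)0x48000000,
 *		.memsz = roundup2(image_len, PAGE_SIZE),
 *	};
 *
 *	if (kexec_load(entry_pa, 1, &seg, 0) != 0)
 *		err(1, "kexec_load");
 *	reboot(RB_KEXEC);
 */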