xref: /freebsd/sys/kern/kern_shutdown.c (revision 480093f4440d54b30b3025afeac24b48f2ba7a2e)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1986, 1988, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)kern_shutdown.c	8.3 (Berkeley) 1/21/94
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "opt_ddb.h"
43 #include "opt_ekcd.h"
44 #include "opt_kdb.h"
45 #include "opt_panic.h"
46 #include "opt_printf.h"
47 #include "opt_sched.h"
48 #include "opt_watchdog.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/bio.h>
53 #include <sys/buf.h>
54 #include <sys/conf.h>
55 #include <sys/compressor.h>
56 #include <sys/cons.h>
57 #include <sys/disk.h>
58 #include <sys/eventhandler.h>
59 #include <sys/filedesc.h>
60 #include <sys/jail.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/kerneldump.h>
64 #include <sys/kthread.h>
65 #include <sys/ktr.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/mount.h>
69 #include <sys/priv.h>
70 #include <sys/proc.h>
71 #include <sys/reboot.h>
72 #include <sys/resourcevar.h>
73 #include <sys/rwlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/sysctl.h>
78 #include <sys/sysproto.h>
79 #include <sys/taskqueue.h>
80 #include <sys/vnode.h>
81 #include <sys/watchdog.h>
82 
83 #include <crypto/chacha20/chacha.h>
84 #include <crypto/rijndael/rijndael-api-fst.h>
85 #include <crypto/sha2/sha256.h>
86 
87 #include <ddb/ddb.h>
88 
89 #include <machine/cpu.h>
90 #include <machine/dump.h>
91 #include <machine/pcb.h>
92 #include <machine/smp.h>
93 
94 #include <security/mac/mac_framework.h>
95 
96 #include <vm/vm.h>
97 #include <vm/vm_object.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_pager.h>
100 #include <vm/swap_pager.h>
101 
102 #include <sys/signalvar.h>
103 
/* Malloc type used for the dump compression state and buffers in this file. */
static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer");

/*
 * Delay, in seconds, that shutdown_panic() applies before rebooting after a
 * panic.  The compile-time default can be overridden by defining
 * PANIC_REBOOT_WAIT_TIME, and the value is exported as the
 * kern.panic_reboot_wait_time sysctl/tunable.  shutdown_panic() treats 0 as
 * "reboot immediately" and -1 as "skip the countdown and wait for a key".
 */
#ifndef PANIC_REBOOT_WAIT_TIME
#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
#endif
static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME;
SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN,
    &panic_reboot_wait_time, 0,
    "Seconds to wait before rebooting after a panic");
113 
/*
 * Note that stdarg.h and the ANSI style va_start macro are used for both
 * ANSI and traditional C compilers.
 */
118 #include <machine/stdarg.h>
119 
#ifdef KDB
/*
 * debug.debugger_on_panic: when non-zero, vpanic() enters the debugger through
 * kdb_enter().  The default is off when the kernel is built with
 * KDB_UNATTENDED and on otherwise.
 */
#ifdef KDB_UNATTENDED
static int debugger_on_panic = 0;
#else
static int debugger_on_panic = 1;
#endif
SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic,
    CTLFLAG_RWTUN | CTLFLAG_SECURE,
    &debugger_on_panic, 0, "Run debugger on kernel panic");

/*
 * debug.debugger_on_trap: non-static, so it is visible to other kernel
 * files; this part of the file only exports it through sysctl.
 */
int debugger_on_trap = 0;
SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap,
    CTLFLAG_RWTUN | CTLFLAG_SECURE,
    &debugger_on_trap, 0, "Run debugger on kernel trap before panic");

/*
 * debug.trace_on_panic enables the backtrace printed by vpanic();
 * debug.trace_all_panics additionally allows that backtrace for panics that
 * occur after the first one.  Both default to enabled under KDB_TRACE.
 */
#ifdef KDB_TRACE
static int trace_on_panic = 1;
static bool trace_all_panics = true;
#else
static int trace_on_panic = 0;
static bool trace_all_panics = false;
#endif
SYSCTL_INT(_debug, OID_AUTO, trace_on_panic,
    CTLFLAG_RWTUN | CTLFLAG_SECURE,
    &trace_on_panic, 0, "Print stack trace on kernel panic");
SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN,
    &trace_all_panics, 0, "Print stack traces on secondary kernel panics");
#endif /* KDB */
148 
/* kern.sync_on_panic: unless set, vpanic() adds RB_NOSYNC to the reboot flags. */
static int sync_on_panic = 0;
SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN,
	&sync_on_panic, 0, "Do a sync before rebooting from a panic");

/* kern.poweroff_on_panic: when set, vpanic() adds RB_POWEROFF. */
static bool poweroff_on_panic = 0;
SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN,
	&poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic");

/* kern.powercycle_on_panic: when set, vpanic() adds RB_POWERCYCLE. */
static bool powercycle_on_panic = 0;
SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN,
	&powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic");

/* Parent node for the kern.shutdown.* sysctls declared in this file. */
static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0,
    "Shutdown environment");

/*
 * kern.shutdown.show_busybufs: passed to bufshutdown() by kern_reboot().
 * Defaults to 1 on DIAGNOSTIC kernels and 0 otherwise.
 */
#ifndef DIAGNOSTIC
static int show_busybufs;
#else
static int show_busybufs = 1;
#endif
SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
	&show_busybufs, 0, "");

/* kern.suspend_blocked: non-static flag; only declared and exported here. */
int suspend_blocked = 0;
SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW,
	&suspend_blocked, 0, "Block suspend due to a pending shutdown");
175 
#ifdef EKCD
FEATURE(ekcd, "Encrypted kernel crash dumps support");

MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data");

/*
 * Per-dumper encryption state, allocated by kerneldumpcrypto_create() with
 * room for the trailing kdc_dumpkey data.
 */
struct kerneldumpcrypto {
	/* Selected algorithm, one of the KERNELDUMP_ENC_* values. */
	uint8_t			kdc_encryption;
	/* Current IV; re-derived by kerneldumpcrypto_init() for each dump. */
	uint8_t			kdc_iv[KERNELDUMP_IV_MAX_SIZE];
	/* Cipher state; which member is valid depends on kdc_encryption. */
	union {
		struct {
			keyInstance	aes_ki;
			cipherInstance	aes_ci;
		} u_aes;
		struct chacha_ctx	u_chacha;
	} u;
#define	kdc_ki	u.u_aes.aes_ki
#define	kdc_ci	u.u_aes.aes_ci
#define	kdc_chacha	u.u_chacha
	/* Size of kdc_dumpkey in bytes, rounded up to the dump block size. */
	uint32_t		kdc_dumpkeysize;
	/* Key record (algorithm, IV, encrypted key) stored with the dump. */
	struct kerneldumpkey	kdc_dumpkey[];
};
#endif
198 
/* Per-dumper compression state, allocated by kerneldumpcomp_create(). */
struct kerneldumpcomp {
	/* Requested format, one of the KERNELDUMP_COMP_* values. */
	uint8_t			kdc_format;
	/* Compressor handle returned by compressor_init(). */
	struct compressor	*kdc_stream;
	/* Buffer of di->maxiosize bytes. */
	uint8_t			*kdc_buf;
	/* NOTE(review): presumably the bytes pending in kdc_buf -- confirm. */
	size_t			kdc_resid;
};

static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di,
		    uint8_t compression);
static void	kerneldumpcomp_destroy(struct dumperinfo *di);
static int	kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg);

/*
 * kern.kerneldump_gzlevel: level handed to compressor_init() by
 * kerneldumpcomp_create(), whichever compression format is selected.
 */
static int kerneldump_gzlevel = 6;
SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN,
    &kerneldump_gzlevel, 0,
    "Kernel crash dump compression level");
215 
/*
 * Variable panicstr contains argument to first call to panic; used as flag
 * to indicate that the kernel has already called panic.
 */
const char *panicstr;
/* Set to true by vpanic() at the same time panicstr is first set. */
bool __read_frequently panicked;

int __read_mostly dumping;		/* system is dumping */
int rebooting;				/* system is rebooting */

/*
 * Used to serialize between sysctl kern.shutdown.dumpdevname and list
 * modifications via ioctl.
 */
static struct mtx dumpconf_list_lk;
MTX_SYSINIT(dumper_configs, &dumpconf_list_lk, "dumper config list", MTX_DEF);

/* Our selected dumper(s). */
static TAILQ_HEAD(dumpconflist, dumperinfo) dumper_configs =
    TAILQ_HEAD_INITIALIZER(dumper_configs);

/* Context information for dump-debuggers; both are filled in by doadump(). */
static struct pcb dumppcb;		/* Registers. */
lwpid_t dumptid;			/* Thread ID. */

/* Device switch with no methods; used only to create /dev/reroot/reroot. */
static struct cdevsw reroot_cdevsw = {
     .d_version = D_VERSION,
     .d_name    = "reroot",
};

/* shutdown_final event handlers registered by shutdown_conf(). */
static void poweroff_wait(void *, int);
static void shutdown_halt(void *junk, int howto);
static void shutdown_panic(void *junk, int howto);
static void shutdown_reset(void *junk, int howto);
/* Backend of reboot(2) when RB_REROOT is requested. */
static int kern_reroot(void);
250 
/*
 * Register the local shutdown_final handlers.  Run once at boot through the
 * SYSINIT below.
 *
 * poweroff_wait is registered at SHUTDOWN_PRI_FIRST, shutdown_reset at the
 * highest priority value (SHUTDOWN_PRI_LAST + 200), and shutdown_halt and
 * shutdown_panic share SHUTDOWN_PRI_LAST + 100.
 * NOTE(review): handlers presumably run in ascending priority order, with
 * ties following registration order -- confirm against EVENTHANDLER(9)
 * before reordering these calls.
 */
static void
shutdown_conf(void *unused)
{

	EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
	    SHUTDOWN_PRI_FIRST);
	EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
	    SHUTDOWN_PRI_LAST + 100);
	EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
	    SHUTDOWN_PRI_LAST + 100);
	EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
	    SHUTDOWN_PRI_LAST + 200);
}

SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
267 
268 /*
269  * The only reason this exists is to create the /dev/reroot/ directory,
270  * used by reroot code in init(8) as a mountpoint for tmpfs.
271  */
272 static void
273 reroot_conf(void *unused)
274 {
275 	int error;
276 	struct cdev *cdev;
277 
278 	error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev,
279 	    &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot");
280 	if (error != 0) {
281 		printf("%s: failed to create device node, error %d",
282 		    __func__, error);
283 	}
284 }
285 
286 SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL);
287 
288 /*
289  * The system call that results in a reboot.
290  */
291 /* ARGSUSED */
292 int
293 sys_reboot(struct thread *td, struct reboot_args *uap)
294 {
295 	int error;
296 
297 	error = 0;
298 #ifdef MAC
299 	error = mac_system_check_reboot(td->td_ucred, uap->opt);
300 #endif
301 	if (error == 0)
302 		error = priv_check(td, PRIV_REBOOT);
303 	if (error == 0) {
304 		if (uap->opt & RB_REROOT)
305 			error = kern_reroot();
306 		else
307 			kern_reboot(uap->opt);
308 	}
309 	return (error);
310 }
311 
312 static void
313 shutdown_nice_task_fn(void *arg, int pending __unused)
314 {
315 	int howto;
316 
317 	howto = (uintptr_t)arg;
318 	/* Send a signal to init(8) and have it shutdown the world. */
319 	PROC_LOCK(initproc);
320 	if (howto & RB_POWEROFF)
321 		kern_psignal(initproc, SIGUSR2);
322 	else if (howto & RB_POWERCYCLE)
323 		kern_psignal(initproc, SIGWINCH);
324 	else if (howto & RB_HALT)
325 		kern_psignal(initproc, SIGUSR1);
326 	else
327 		kern_psignal(initproc, SIGINT);
328 	PROC_UNLOCK(initproc);
329 }
330 
331 static struct task shutdown_nice_task = TASK_INITIALIZER(0,
332     &shutdown_nice_task_fn, NULL);
333 
334 /*
335  * Called by events that want to shut down.. e.g  <CTL><ALT><DEL> on a PC
336  */
337 void
338 shutdown_nice(int howto)
339 {
340 
341 	if (initproc != NULL && !SCHEDULER_STOPPED()) {
342 		shutdown_nice_task.ta_context = (void *)(uintptr_t)howto;
343 		taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task);
344 	} else {
345 		/*
346 		 * No init(8) running, or scheduler would not allow it
347 		 * to run, so simply reboot.
348 		 */
349 		kern_reboot(howto | RB_NOSYNC);
350 	}
351 }
352 
353 static void
354 print_uptime(void)
355 {
356 	int f;
357 	struct timespec ts;
358 
359 	getnanouptime(&ts);
360 	printf("Uptime: ");
361 	f = 0;
362 	if (ts.tv_sec >= 86400) {
363 		printf("%ldd", (long)ts.tv_sec / 86400);
364 		ts.tv_sec %= 86400;
365 		f = 1;
366 	}
367 	if (f || ts.tv_sec >= 3600) {
368 		printf("%ldh", (long)ts.tv_sec / 3600);
369 		ts.tv_sec %= 3600;
370 		f = 1;
371 	}
372 	if (f || ts.tv_sec >= 60) {
373 		printf("%ldm", (long)ts.tv_sec / 60);
374 		ts.tv_sec %= 60;
375 		f = 1;
376 	}
377 	printf("%lds\n", (long)ts.tv_sec);
378 }
379 
380 int
381 doadump(boolean_t textdump)
382 {
383 	boolean_t coredump;
384 	int error;
385 
386 	error = 0;
387 	if (dumping)
388 		return (EBUSY);
389 	if (TAILQ_EMPTY(&dumper_configs))
390 		return (ENXIO);
391 
392 	savectx(&dumppcb);
393 	dumptid = curthread->td_tid;
394 	dumping++;
395 
396 	coredump = TRUE;
397 #ifdef DDB
398 	if (textdump && textdump_pending) {
399 		coredump = FALSE;
400 		textdump_dumpsys(TAILQ_FIRST(&dumper_configs));
401 	}
402 #endif
403 	if (coredump) {
404 		struct dumperinfo *di;
405 
406 		TAILQ_FOREACH(di, &dumper_configs, di_next) {
407 			error = dumpsys(di);
408 			if (error == 0)
409 				break;
410 		}
411 	}
412 
413 	dumping--;
414 	return (error);
415 }
416 
/*
 * Shutdown the system cleanly to prepare for reboot, halt, or power off.
 *
 * 'howto' is a mask of RB_* flags.  The sequence is: drop Giant, bind to the
 * first CPU (SMP only), run the shutdown_pre_sync handlers, sync the
 * filesystems, run the shutdown_post_sync handlers, optionally write a dump,
 * and finally run the shutdown_final handlers.  The function ends in an
 * infinite loop and so never returns to its caller.
 */
void
kern_reboot(int howto)
{
	/* Ensures bufshutdown() is attempted at most once across calls. */
	static int once = 0;

	/*
	 * Normal paths here don't hold Giant, but we can wind up here
	 * unexpectedly with it held.  Drop it now so we don't have to
	 * drop and pick it up elsewhere. The paths it is locking will
	 * never be returned to, and it is preferable to preclude
	 * deadlock than to lock against code that won't ever
	 * continue.
	 */
	while (mtx_owned(&Giant))
		mtx_unlock(&Giant);

#if defined(SMP)
	/*
	 * Bind us to the first CPU so that all shutdown code runs there.  Some
	 * systems don't shutdown properly (i.e., ACPI power off) if we
	 * run on another processor.
	 */
	if (!SCHEDULER_STOPPED()) {
		thread_lock(curthread);
		sched_bind(curthread, CPU_FIRST());
		thread_unlock(curthread);
		KASSERT(PCPU_GET(cpuid) == CPU_FIRST(),
		    ("boot: not running on cpu 0"));
	}
#endif
	/* We're in the process of rebooting. */
	rebooting = 1;

	/* We are out of the debugger now. */
	kdb_active = 0;

	/*
	 * Do any callouts that should be done BEFORE syncing the filesystems.
	 */
	EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);

	/*
	 * Now sync filesystems.  Skipped while still cold, when the caller
	 * asked for RB_NOSYNC, or when a previous call already tried.
	 */
	if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) {
		once = 1;
		bufshutdown(show_busybufs);
	}

	print_uptime();

	cngrab();

	/*
	 * Ok, now do things that assume all filesystem activity has
	 * been completed.
	 */
	EVENTHANDLER_INVOKE(shutdown_post_sync, howto);

	/*
	 * Dump only when RB_DUMP is set without RB_HALT, the system is past
	 * cold boot, and no dump is already in progress.
	 */
	if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
		doadump(TRUE);

	/* Now that we're going to really halt the system... */
	EVENTHANDLER_INVOKE(shutdown_final, howto);

	for(;;) ;	/* safety against shutdown_reset not working */
	/* NOTREACHED */
}
488 
/*
 * The system call that results in changing the rootfs.
 *
 * Only init (initproc) may call this; any other caller gets EPERM.  The
 * filesystem holding the running executable and the one referenced by
 * rootdevmp are kept, everything else is unmounted, and a new root is
 * mounted through vfs_mountroot().
 *
 * Returns 0 on success, the vn_lock() error if the text vnode cannot be
 * locked, or ENOENT if its filesystem cannot be busied or the vnode was
 * doomed while the lock was dropped.
 */
static int
kern_reroot(void)
{
	struct vnode *oldrootvnode, *vp;
	struct mount *mp, *devmp;
	int error;

	if (curproc != initproc)
		return (EPERM);

	/*
	 * Mark the filesystem containing currently-running executable
	 * (the temporary copy of init(8)) busy.
	 */
	vp = curproc->p_textvp;
	error = vn_lock(vp, LK_SHARED);
	if (error != 0)
		return (error);
	mp = vp->v_mount;
	error = vfs_busy(mp, MBF_NOWAIT);
	if (error != 0) {
		/*
		 * The non-blocking attempt failed.  Hold a reference on the
		 * mount, drop the vnode lock, and retry with a blocking
		 * vfs_busy(); then retake the vnode lock and re-check that
		 * the vnode is still valid.
		 */
		vfs_ref(mp);
		VOP_UNLOCK(vp);
		error = vfs_busy(mp, 0);
		vn_lock(vp, LK_SHARED | LK_RETRY);
		vfs_rel(mp);
		if (error != 0) {
			VOP_UNLOCK(vp);
			return (ENOENT);
		}
		if (VN_IS_DOOMED(vp)) {
			VOP_UNLOCK(vp);
			vfs_unbusy(mp);
			return (ENOENT);
		}
	}
	VOP_UNLOCK(vp);

	/*
	 * Remove the filesystem containing currently-running executable
	 * from the mount list, to prevent it from being unmounted
	 * by vfs_unmountall(), and to avoid confusing vfs_mountroot().
	 *
	 * Also preserve /dev - forcibly unmounting it could cause driver
	 * reinitialization.
	 */

	/* Clear rootdevmp while /dev is off the list; restored further down. */
	vfs_ref(rootdevmp);
	devmp = rootdevmp;
	rootdevmp = NULL;

	mtx_lock(&mountlist_mtx);
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	TAILQ_REMOVE(&mountlist, devmp, mnt_list);
	mtx_unlock(&mountlist_mtx);

	/* Remember the old root so references to it can be updated later. */
	oldrootvnode = rootvnode;

	/*
	 * Unmount everything except for the two filesystems preserved above.
	 */
	vfs_unmountall();

	/*
	 * Add /dev back; vfs_mountroot() will move it into its new place.
	 */
	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	rootdevmp = devmp;
	vfs_rel(rootdevmp);

	/*
	 * Mount the new rootfs.
	 */
	vfs_mountroot();

	/*
	 * Update all references to the old rootvnode.
	 */
	mountcheckdirs(oldrootvnode, rootvnode);

	/*
	 * Add the temporary filesystem back and unbusy it.
	 */
	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	vfs_unbusy(mp);

	return (0);
}
584 
585 /*
586  * If the shutdown was a clean halt, behave accordingly.
587  */
588 static void
589 shutdown_halt(void *junk, int howto)
590 {
591 
592 	if (howto & RB_HALT) {
593 		printf("\n");
594 		printf("The operating system has halted.\n");
595 		printf("Please press any key to reboot.\n\n");
596 
597 		wdog_kern_pat(WD_TO_NEVER);
598 
599 		switch (cngetc()) {
600 		case -1:		/* No console, just die */
601 			cpu_halt();
602 			/* NOTREACHED */
603 		default:
604 			break;
605 		}
606 	}
607 }
608 
609 /*
610  * Check to see if the system paniced, pause and then reboot
611  * according to the specified delay.
612  */
613 static void
614 shutdown_panic(void *junk, int howto)
615 {
616 	int loop;
617 
618 	if (howto & RB_DUMP) {
619 		if (panic_reboot_wait_time != 0) {
620 			if (panic_reboot_wait_time != -1) {
621 				printf("Automatic reboot in %d seconds - "
622 				       "press a key on the console to abort\n",
623 					panic_reboot_wait_time);
624 				for (loop = panic_reboot_wait_time * 10;
625 				     loop > 0; --loop) {
626 					DELAY(1000 * 100); /* 1/10th second */
627 					/* Did user type a key? */
628 					if (cncheckc() != -1)
629 						break;
630 				}
631 				if (!loop)
632 					return;
633 			}
634 		} else { /* zero time specified - reboot NOW */
635 			return;
636 		}
637 		printf("--> Press a key on the console to reboot,\n");
638 		printf("--> or switch off the system now.\n");
639 		cngetc();
640 	}
641 }
642 
/*
 * Everything done, now reset.
 *
 * shutdown_final handler registered with the highest priority value in
 * shutdown_conf().  Neither 'junk' nor 'howto' is used.
 */
static void
shutdown_reset(void *junk, int howto)
{

	printf("Rebooting...\n");
	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */

	/*
	 * Acquiring smp_ipi_mtx here has a double effect:
	 * - it disables interrupts avoiding CPU0 preemption
	 *   by fast handlers (thus deadlocking  against other CPUs)
	 * - it avoids deadlocks against smp_rendezvous() or, more
	 *   generally, threads busy-waiting, with this spinlock held,
	 *   and waiting for responses by threads on other CPUs
	 *   (ie. smp_tlb_shootdown()).
	 *
	 * For the !SMP case it just needs to handle the former problem.
	 */
#ifdef SMP
	mtx_lock_spin(&smp_ipi_mtx);
#else
	spinlock_enter();
#endif

	/* cpu_boot(howto); */ /* doesn't do anything at the moment */
	cpu_reset();
	/* NOTREACHED */ /* assuming reset worked */
}
674 
675 #if defined(WITNESS) || defined(INVARIANT_SUPPORT)
/*
 * Tunables controlling how kassert_panic() reacts to a failed KASSERT.
 * Each is exported under debug.kassert below.
 */
static int kassert_warn_only = 0;
#ifdef KDB
static int kassert_do_kdb = 0;
#endif
#ifdef KTR
static int kassert_do_ktr = 0;
#endif
static int kassert_do_log = 1;
static int kassert_log_pps_limit = 4;
static int kassert_log_mute_at = 0;
static int kassert_log_panic_at = 0;
static int kassert_suppress_in_panic = 0;
/* Count of warnings issued so far; incremented by kassert_panic(). */
static int kassert_warnings = 0;

SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options");

/*
 * The knobs are writable at run time only when the kernel is built with
 * KASSERT_PANIC_OPTIONAL; otherwise they are read-only tunables.
 */
#ifdef KASSERT_PANIC_OPTIONAL
#define KASSERT_RWTUN	CTLFLAG_RWTUN
#else
#define KASSERT_RWTUN	CTLFLAG_RDTUN
#endif

SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN,
    &kassert_warn_only, 0,
    "KASSERT triggers a panic (0) or just a warning (1)");

#ifdef KDB
SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN,
    &kassert_do_kdb, 0, "KASSERT will enter the debugger");
#endif

#ifdef KTR
SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN,
    &kassert_do_ktr, 0,
    "KASSERT does a KTR, set this to the KTRMASK you want");
#endif

SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN,
    &kassert_do_log, 0,
    "If warn_only is enabled, log (1) or do not log (0) assertion violations");

SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS,
    &kassert_warnings, 0, "number of KASSERTs that have been triggered");

SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN,
    &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic");

SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN,
    &kassert_log_pps_limit, 0, "limit number of log messages per second");

SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN,
    &kassert_log_mute_at, 0, "max number of KASSERTS to log");

SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN,
    &kassert_suppress_in_panic, 0,
    "KASSERTs will be suppressed while handling a panic");
#undef KASSERT_RWTUN

static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS);

/* debug.kassert.kassert: writing any value fires a test KASSERT. */
SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
    kassert_sysctl_kassert, "I", "set to trigger a test kassert");
739 
740 static int
741 kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS)
742 {
743 	int error, i;
744 
745 	error = sysctl_wire_old_buffer(req, sizeof(int));
746 	if (error == 0) {
747 		i = 0;
748 		error = sysctl_handle_int(oidp, &i, 0, req);
749 	}
750 	if (error != 0 || req->newptr == NULL)
751 		return (error);
752 	KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i));
753 	return (0);
754 }
755 
#ifdef KASSERT_PANIC_OPTIONAL
/*
 * Called by KASSERT, this decides if we will panic
 * or if we will log via printf and/or ktr.
 *
 * 'fmt' and the variadic arguments form the printf-style assertion message.
 * The function returns only when the failure is handled as a warning;
 * otherwise control passes to vpanic().
 */
void
kassert_panic(const char *fmt, ...)
{
	/* Formatted message; static, so it is shared by all callers. */
	static char buf[256];
	va_list ap;

	va_start(ap, fmt);
	(void)vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/*
	 * If we are suppressing secondary panics, log the warning but do not
	 * re-enter panic/kdb.
	 */
	if (panicstr != NULL && kassert_suppress_in_panic) {
		if (kassert_do_log) {
			printf("KASSERT failed: %s\n", buf);
#ifdef KDB
			if (trace_all_panics && trace_on_panic)
				kdb_backtrace();
#endif
		}
		return;
	}

	/*
	 * panic if we're not just warning, or if we've exceeded
	 * kassert_log_panic_at warnings.
	 */
	if (!kassert_warn_only ||
	    (kassert_log_panic_at > 0 &&
	     kassert_warnings >= kassert_log_panic_at)) {
		/*
		 * ap was ended above; restart it so that vpanic() formats
		 * the original arguments itself.
		 */
		va_start(ap, fmt);
		vpanic(fmt, ap);
		/* NORETURN */
	}
#ifdef KTR
	if (kassert_do_ktr)
		CTR0(ktr_mask, buf);
#endif /* KTR */
	/*
	 * log if we've not yet met the mute limit.
	 */
	if (kassert_do_log &&
	    (kassert_log_mute_at == 0 ||
	     kassert_warnings < kassert_log_mute_at)) {
		/* Rate-limiter state for ppsratecheck(), kept across calls. */
		static  struct timeval lasterr;
		static  int curerr;

		if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) {
			printf("KASSERT failed: %s\n", buf);
			kdb_backtrace();
		}
	}
#ifdef KDB
	if (kassert_do_kdb) {
		kdb_enter(KDB_WHY_KASSERT, buf);
	}
#endif
	/* Count this warning only after the limit checks above have run. */
	atomic_add_int(&kassert_warnings, 1);
}
#endif /* KASSERT_PANIC_OPTIONAL */
823 #endif
824 
/*
 * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
 * and then reboots.  If we are called twice, then we avoid trying to sync
 * the disks as this often leads to recursive panics.
 *
 * This is the variadic front end: it packages its printf-style arguments
 * into a va_list and hands them to vpanic(), which does the actual work.
 */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
}
838 
/*
 * va_list form of panic(): print the panic message, optionally trace and
 * enter the debugger, then hand over to kern_reboot().
 *
 * The first panic records its formatted message in panicstr, sets
 * 'panicked' and requests a dump (RB_DUMP); later panics only print their
 * message and force RB_NOSYNC.
 */
void
vpanic(const char *fmt, va_list ap)
{
#ifdef SMP
	cpuset_t other_cpus;
#endif
	struct thread *td = curthread;
	int bootopt, newpanic;
	/* Holds the first panic message; panicstr is pointed at it below. */
	static char buf[256];

	spinlock_enter();

#ifdef SMP
	/*
	 * stop_cpus_hard(other_cpus) should prevent multiple CPUs from
	 * concurrently entering panic.  Only the winner will proceed
	 * further.
	 */
	if (panicstr == NULL && !kdb_active) {
		other_cpus = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
		stop_cpus_hard(other_cpus);
	}
#endif

	/*
	 * Ensure that the scheduler is stopped while panicking, even if panic
	 * has been entered from kdb.
	 */
	td->td_stopsched = 1;

	bootopt = RB_AUTOBOOT;
	newpanic = 0;
	if (panicstr)
		bootopt |= RB_NOSYNC;
	else {
		bootopt |= RB_DUMP;
		/* Set to fmt for now; replaced by the formatted text below. */
		panicstr = fmt;
		panicked = true;
		newpanic = 1;
	}

	if (newpanic) {
		(void)vsnprintf(buf, sizeof(buf), fmt, ap);
		panicstr = buf;
		cngrab();
		printf("panic: %s\n", buf);
	} else {
		/* Secondary panic: print directly and leave buf untouched. */
		printf("panic: ");
		vprintf(fmt, ap);
		printf("\n");
	}
#ifdef SMP
	printf("cpuid = %d\n", PCPU_GET(cpuid));
#endif
	printf("time = %jd\n", (intmax_t )time_second);
#ifdef KDB
	if ((newpanic || trace_all_panics) && trace_on_panic)
		kdb_backtrace();
	if (debugger_on_panic)
		kdb_enter(KDB_WHY_PANIC, "panic");
#endif
	/*thread_lock(td); */
	td->td_flags |= TDF_INPANIC;
	/* thread_unlock(td); */
	/* Apply the kern.*_on_panic policy knobs to the reboot flags. */
	if (!sync_on_panic)
		bootopt |= RB_NOSYNC;
	if (poweroff_on_panic)
		bootopt |= RB_POWEROFF;
	if (powercycle_on_panic)
		bootopt |= RB_POWERCYCLE;
	kern_reboot(bootopt);
}
912 
913 /*
914  * Support for poweroff delay.
915  *
916  * Please note that setting this delay too short might power off your machine
917  * before the write cache on your hard disk has been flushed, leading to
918  * soft-updates inconsistencies.
919  */
920 #ifndef POWEROFF_DELAY
921 # define POWEROFF_DELAY 5000
922 #endif
923 static int poweroff_delay = POWEROFF_DELAY;
924 
925 SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
926     &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)");
927 
928 static void
929 poweroff_wait(void *junk, int howto)
930 {
931 
932 	if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0)
933 		return;
934 	DELAY(poweroff_delay * 1000);
935 }
936 
937 /*
938  * Some system processes (e.g. syncer) need to be stopped at appropriate
939  * points in their main loops prior to a system shutdown, so that they
940  * won't interfere with the shutdown process (e.g. by holding a disk buf
941  * to cause sync to fail).  For each of these system processes, register
942  * shutdown_kproc() as a handler for one of shutdown events.
943  */
944 static int kproc_shutdown_wait = 60;
945 SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
946     &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process");
947 
948 void
949 kproc_shutdown(void *arg, int howto)
950 {
951 	struct proc *p;
952 	int error;
953 
954 	if (panicstr)
955 		return;
956 
957 	p = (struct proc *)arg;
958 	printf("Waiting (max %d seconds) for system process `%s' to stop... ",
959 	    kproc_shutdown_wait, p->p_comm);
960 	error = kproc_suspend(p, kproc_shutdown_wait * hz);
961 
962 	if (error == EWOULDBLOCK)
963 		printf("timed out\n");
964 	else
965 		printf("done\n");
966 }
967 
968 void
969 kthread_shutdown(void *arg, int howto)
970 {
971 	struct thread *td;
972 	int error;
973 
974 	if (panicstr)
975 		return;
976 
977 	td = (struct thread *)arg;
978 	printf("Waiting (max %d seconds) for system thread `%s' to stop... ",
979 	    kproc_shutdown_wait, td->td_name);
980 	error = kthread_suspend(td, kproc_shutdown_wait * hz);
981 
982 	if (error == EWOULDBLOCK)
983 		printf("timed out\n");
984 	else
985 		printf("done\n");
986 }
987 
988 static int
989 dumpdevname_sysctl_handler(SYSCTL_HANDLER_ARGS)
990 {
991 	char buf[256];
992 	struct dumperinfo *di;
993 	struct sbuf sb;
994 	int error;
995 
996 	error = sysctl_wire_old_buffer(req, 0);
997 	if (error != 0)
998 		return (error);
999 
1000 	sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req);
1001 
1002 	mtx_lock(&dumpconf_list_lk);
1003 	TAILQ_FOREACH(di, &dumper_configs, di_next) {
1004 		if (di != TAILQ_FIRST(&dumper_configs))
1005 			sbuf_putc(&sb, ',');
1006 		sbuf_cat(&sb, di->di_devname);
1007 	}
1008 	mtx_unlock(&dumpconf_list_lk);
1009 
1010 	error = sbuf_finish(&sb);
1011 	sbuf_delete(&sb);
1012 	return (error);
1013 }
1014 SYSCTL_PROC(_kern_shutdown, OID_AUTO, dumpdevname, CTLTYPE_STRING | CTLFLAG_RD,
1015     &dumper_configs, 0, dumpdevname_sysctl_handler, "A",
1016     "Device(s) for kernel dumps");
1017 
/* Forward declaration; being static, the function is defined in this file. */
static int	_dump_append(struct dumperinfo *di, void *virtual,
		    vm_offset_t physical, size_t length);
1020 
1021 #ifdef EKCD
1022 static struct kerneldumpcrypto *
1023 kerneldumpcrypto_create(size_t blocksize, uint8_t encryption,
1024     const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey)
1025 {
1026 	struct kerneldumpcrypto *kdc;
1027 	struct kerneldumpkey *kdk;
1028 	uint32_t dumpkeysize;
1029 
1030 	dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize);
1031 	kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO);
1032 
1033 	arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0);
1034 
1035 	kdc->kdc_encryption = encryption;
1036 	switch (kdc->kdc_encryption) {
1037 	case KERNELDUMP_ENC_AES_256_CBC:
1038 		if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0)
1039 			goto failed;
1040 		break;
1041 	case KERNELDUMP_ENC_CHACHA20:
1042 		chacha_keysetup(&kdc->kdc_chacha, key, 256);
1043 		break;
1044 	default:
1045 		goto failed;
1046 	}
1047 
1048 	kdc->kdc_dumpkeysize = dumpkeysize;
1049 	kdk = kdc->kdc_dumpkey;
1050 	kdk->kdk_encryption = kdc->kdc_encryption;
1051 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
1052 	kdk->kdk_encryptedkeysize = htod32(encryptedkeysize);
1053 	memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize);
1054 
1055 	return (kdc);
1056 failed:
1057 	explicit_bzero(kdc, sizeof(*kdc) + dumpkeysize);
1058 	free(kdc, M_EKCD);
1059 	return (NULL);
1060 }
1061 
1062 static int
1063 kerneldumpcrypto_init(struct kerneldumpcrypto *kdc)
1064 {
1065 	uint8_t hash[SHA256_DIGEST_LENGTH];
1066 	SHA256_CTX ctx;
1067 	struct kerneldumpkey *kdk;
1068 	int error;
1069 
1070 	error = 0;
1071 
1072 	if (kdc == NULL)
1073 		return (0);
1074 
1075 	/*
1076 	 * When a user enters ddb it can write a crash dump multiple times.
1077 	 * Each time it should be encrypted using a different IV.
1078 	 */
1079 	SHA256_Init(&ctx);
1080 	SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv));
1081 	SHA256_Final(hash, &ctx);
1082 	bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv));
1083 
1084 	switch (kdc->kdc_encryption) {
1085 	case KERNELDUMP_ENC_AES_256_CBC:
1086 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
1087 		    kdc->kdc_iv) <= 0) {
1088 			error = EINVAL;
1089 			goto out;
1090 		}
1091 		break;
1092 	case KERNELDUMP_ENC_CHACHA20:
1093 		chacha_ivsetup(&kdc->kdc_chacha, kdc->kdc_iv, NULL);
1094 		break;
1095 	default:
1096 		error = EINVAL;
1097 		goto out;
1098 	}
1099 
1100 	kdk = kdc->kdc_dumpkey;
1101 	memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv));
1102 out:
1103 	explicit_bzero(hash, sizeof(hash));
1104 	return (error);
1105 }
1106 
1107 static uint32_t
1108 kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc)
1109 {
1110 
1111 	if (kdc == NULL)
1112 		return (0);
1113 	return (kdc->kdc_dumpkeysize);
1114 }
1115 #endif /* EKCD */
1116 
1117 static struct kerneldumpcomp *
1118 kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression)
1119 {
1120 	struct kerneldumpcomp *kdcomp;
1121 	int format;
1122 
1123 	switch (compression) {
1124 	case KERNELDUMP_COMP_GZIP:
1125 		format = COMPRESS_GZIP;
1126 		break;
1127 	case KERNELDUMP_COMP_ZSTD:
1128 		format = COMPRESS_ZSTD;
1129 		break;
1130 	default:
1131 		return (NULL);
1132 	}
1133 
1134 	kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO);
1135 	kdcomp->kdc_format = compression;
1136 	kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb,
1137 	    format, di->maxiosize, kerneldump_gzlevel, di);
1138 	if (kdcomp->kdc_stream == NULL) {
1139 		free(kdcomp, M_DUMPER);
1140 		return (NULL);
1141 	}
1142 	kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP);
1143 	return (kdcomp);
1144 }
1145 
1146 static void
1147 kerneldumpcomp_destroy(struct dumperinfo *di)
1148 {
1149 	struct kerneldumpcomp *kdcomp;
1150 
1151 	kdcomp = di->kdcomp;
1152 	if (kdcomp == NULL)
1153 		return;
1154 	compressor_fini(kdcomp->kdc_stream);
1155 	explicit_bzero(kdcomp->kdc_buf, di->maxiosize);
1156 	free(kdcomp->kdc_buf, M_DUMPER);
1157 	free(kdcomp, M_DUMPER);
1158 }
1159 
1160 /*
1161  * Must not be present on global list.
1162  */
1163 static void
1164 free_single_dumper(struct dumperinfo *di)
1165 {
1166 
1167 	if (di == NULL)
1168 		return;
1169 
1170 	if (di->blockbuf != NULL) {
1171 		explicit_bzero(di->blockbuf, di->blocksize);
1172 		free(di->blockbuf, M_DUMPER);
1173 	}
1174 
1175 	kerneldumpcomp_destroy(di);
1176 
1177 #ifdef EKCD
1178 	if (di->kdcrypto != NULL) {
1179 		explicit_bzero(di->kdcrypto, sizeof(*di->kdcrypto) +
1180 		    di->kdcrypto->kdc_dumpkeysize);
1181 		free(di->kdcrypto, M_EKCD);
1182 	}
1183 #endif
1184 
1185 	explicit_bzero(di, sizeof(*di));
1186 	free(di, M_DUMPER);
1187 }
1188 
1189 /* Registration of dumpers */
1190 int
1191 dumper_insert(const struct dumperinfo *di_template, const char *devname,
1192     const struct diocskerneldump_arg *kda)
1193 {
1194 	struct dumperinfo *newdi, *listdi;
1195 	bool inserted;
1196 	uint8_t index;
1197 	int error;
1198 
1199 	index = kda->kda_index;
1200 	MPASS(index != KDA_REMOVE && index != KDA_REMOVE_DEV &&
1201 	    index != KDA_REMOVE_ALL);
1202 
1203 	error = priv_check(curthread, PRIV_SETDUMPER);
1204 	if (error != 0)
1205 		return (error);
1206 
1207 	newdi = malloc(sizeof(*newdi) + strlen(devname) + 1, M_DUMPER, M_WAITOK
1208 	    | M_ZERO);
1209 	memcpy(newdi, di_template, sizeof(*newdi));
1210 	newdi->blockbuf = NULL;
1211 	newdi->kdcrypto = NULL;
1212 	newdi->kdcomp = NULL;
1213 	strcpy(newdi->di_devname, devname);
1214 
1215 	if (kda->kda_encryption != KERNELDUMP_ENC_NONE) {
1216 #ifdef EKCD
1217 		newdi->kdcrypto = kerneldumpcrypto_create(di_template->blocksize,
1218 		    kda->kda_encryption, kda->kda_key,
1219 		    kda->kda_encryptedkeysize, kda->kda_encryptedkey);
1220 		if (newdi->kdcrypto == NULL) {
1221 			error = EINVAL;
1222 			goto cleanup;
1223 		}
1224 #else
1225 		error = EOPNOTSUPP;
1226 		goto cleanup;
1227 #endif
1228 	}
1229 	if (kda->kda_compression != KERNELDUMP_COMP_NONE) {
1230 		/*
1231 		 * We can't support simultaneous unpadded block cipher
1232 		 * encryption and compression because there is no guarantee the
1233 		 * length of the compressed result is exactly a multiple of the
1234 		 * cipher block size.
1235 		 */
1236 		if (kda->kda_encryption == KERNELDUMP_ENC_AES_256_CBC) {
1237 			error = EOPNOTSUPP;
1238 			goto cleanup;
1239 		}
1240 		newdi->kdcomp = kerneldumpcomp_create(newdi,
1241 		    kda->kda_compression);
1242 		if (newdi->kdcomp == NULL) {
1243 			error = EINVAL;
1244 			goto cleanup;
1245 		}
1246 	}
1247 
1248 	newdi->blockbuf = malloc(newdi->blocksize, M_DUMPER, M_WAITOK | M_ZERO);
1249 
1250 	/* Add the new configuration to the queue */
1251 	mtx_lock(&dumpconf_list_lk);
1252 	inserted = false;
1253 	TAILQ_FOREACH(listdi, &dumper_configs, di_next) {
1254 		if (index == 0) {
1255 			TAILQ_INSERT_BEFORE(listdi, newdi, di_next);
1256 			inserted = true;
1257 			break;
1258 		}
1259 		index--;
1260 	}
1261 	if (!inserted)
1262 		TAILQ_INSERT_TAIL(&dumper_configs, newdi, di_next);
1263 	mtx_unlock(&dumpconf_list_lk);
1264 
1265 	return (0);
1266 
1267 cleanup:
1268 	free_single_dumper(newdi);
1269 	return (error);
1270 }
1271 
1272 #ifdef DDB
1273 void
1274 dumper_ddb_insert(struct dumperinfo *newdi)
1275 {
1276 	TAILQ_INSERT_HEAD(&dumper_configs, newdi, di_next);
1277 }
1278 
1279 void
1280 dumper_ddb_remove(struct dumperinfo *di)
1281 {
1282 	TAILQ_REMOVE(&dumper_configs, di, di_next);
1283 }
1284 #endif
1285 
1286 static bool
1287 dumper_config_match(const struct dumperinfo *di, const char *devname,
1288     const struct diocskerneldump_arg *kda)
1289 {
1290 	if (kda->kda_index == KDA_REMOVE_ALL)
1291 		return (true);
1292 
1293 	if (strcmp(di->di_devname, devname) != 0)
1294 		return (false);
1295 
1296 	/*
1297 	 * Allow wildcard removal of configs matching a device on g_dev_orphan.
1298 	 */
1299 	if (kda->kda_index == KDA_REMOVE_DEV)
1300 		return (true);
1301 
1302 	if (di->kdcomp != NULL) {
1303 		if (di->kdcomp->kdc_format != kda->kda_compression)
1304 			return (false);
1305 	} else if (kda->kda_compression != KERNELDUMP_COMP_NONE)
1306 		return (false);
1307 #ifdef EKCD
1308 	if (di->kdcrypto != NULL) {
1309 		if (di->kdcrypto->kdc_encryption != kda->kda_encryption)
1310 			return (false);
1311 		/*
1312 		 * Do we care to verify keys match to delete?  It seems weird
1313 		 * to expect multiple fallback dump configurations on the same
1314 		 * device that only differ in crypto key.
1315 		 */
1316 	} else
1317 #endif
1318 		if (kda->kda_encryption != KERNELDUMP_ENC_NONE)
1319 			return (false);
1320 
1321 	return (true);
1322 }
1323 
1324 int
1325 dumper_remove(const char *devname, const struct diocskerneldump_arg *kda)
1326 {
1327 	struct dumperinfo *di, *sdi;
1328 	bool found;
1329 	int error;
1330 
1331 	error = priv_check(curthread, PRIV_SETDUMPER);
1332 	if (error != 0)
1333 		return (error);
1334 
1335 	/*
1336 	 * Try to find a matching configuration, and kill it.
1337 	 *
1338 	 * NULL 'kda' indicates remove any configuration matching 'devname',
1339 	 * which may remove multiple configurations in atypical configurations.
1340 	 */
1341 	found = false;
1342 	mtx_lock(&dumpconf_list_lk);
1343 	TAILQ_FOREACH_SAFE(di, &dumper_configs, di_next, sdi) {
1344 		if (dumper_config_match(di, devname, kda)) {
1345 			found = true;
1346 			TAILQ_REMOVE(&dumper_configs, di, di_next);
1347 			free_single_dumper(di);
1348 		}
1349 	}
1350 	mtx_unlock(&dumpconf_list_lk);
1351 
1352 	/* Only produce ENOENT if a more targeted match didn't match. */
1353 	if (!found && kda->kda_index == KDA_REMOVE)
1354 		return (ENOENT);
1355 	return (0);
1356 }
1357 
1358 static int
1359 dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length)
1360 {
1361 
1362 	if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset ||
1363 	    offset - di->mediaoffset + length > di->mediasize)) {
1364 		if (di->kdcomp != NULL && offset >= di->mediaoffset) {
1365 			printf(
1366 		    "Compressed dump failed to fit in device boundaries.\n");
1367 			return (E2BIG);
1368 		}
1369 
1370 		printf("Attempt to write outside dump device boundaries.\n"
1371 	    "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
1372 		    (intmax_t)offset, (intmax_t)di->mediaoffset,
1373 		    (uintmax_t)length, (intmax_t)di->mediasize);
1374 		return (ENOSPC);
1375 	}
1376 	if (length % di->blocksize != 0) {
1377 		printf("Attempt to write partial block of length %ju.\n",
1378 		    (uintmax_t)length);
1379 		return (EINVAL);
1380 	}
1381 	if (offset % di->blocksize != 0) {
1382 		printf("Attempt to write at unaligned offset %jd.\n",
1383 		    (intmax_t)offset);
1384 		return (EINVAL);
1385 	}
1386 
1387 	return (0);
1388 }
1389 
1390 #ifdef EKCD
1391 static int
1392 dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size)
1393 {
1394 
1395 	switch (kdc->kdc_encryption) {
1396 	case KERNELDUMP_ENC_AES_256_CBC:
1397 		if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf,
1398 		    8 * size, buf) <= 0) {
1399 			return (EIO);
1400 		}
1401 		if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC,
1402 		    buf + size - 16 /* IV size for AES-256-CBC */) <= 0) {
1403 			return (EIO);
1404 		}
1405 		break;
1406 	case KERNELDUMP_ENC_CHACHA20:
1407 		chacha_encrypt_bytes(&kdc->kdc_chacha, buf, buf, size);
1408 		break;
1409 	default:
1410 		return (EINVAL);
1411 	}
1412 
1413 	return (0);
1414 }
1415 
1416 /* Encrypt data and call dumper. */
1417 static int
1418 dump_encrypted_write(struct dumperinfo *di, void *virtual,
1419     vm_offset_t physical, off_t offset, size_t length)
1420 {
1421 	static uint8_t buf[KERNELDUMP_BUFFER_SIZE];
1422 	struct kerneldumpcrypto *kdc;
1423 	int error;
1424 	size_t nbytes;
1425 
1426 	kdc = di->kdcrypto;
1427 
1428 	while (length > 0) {
1429 		nbytes = MIN(length, sizeof(buf));
1430 		bcopy(virtual, buf, nbytes);
1431 
1432 		if (dump_encrypt(kdc, buf, nbytes) != 0)
1433 			return (EIO);
1434 
1435 		error = dump_write(di, buf, physical, offset, nbytes);
1436 		if (error != 0)
1437 			return (error);
1438 
1439 		offset += nbytes;
1440 		virtual = (void *)((uint8_t *)virtual + nbytes);
1441 		length -= nbytes;
1442 	}
1443 
1444 	return (0);
1445 }
1446 #endif /* EKCD */
1447 
1448 static int
1449 kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg)
1450 {
1451 	struct dumperinfo *di;
1452 	size_t resid, rlength;
1453 	int error;
1454 
1455 	di = arg;
1456 
1457 	if (length % di->blocksize != 0) {
1458 		/*
1459 		 * This must be the final write after flushing the compression
1460 		 * stream. Write as many full blocks as possible and stash the
1461 		 * residual data in the dumper's block buffer. It will be
1462 		 * padded and written in dump_finish().
1463 		 */
1464 		rlength = rounddown(length, di->blocksize);
1465 		if (rlength != 0) {
1466 			error = _dump_append(di, base, 0, rlength);
1467 			if (error != 0)
1468 				return (error);
1469 		}
1470 		resid = length - rlength;
1471 		memmove(di->blockbuf, (uint8_t *)base + rlength, resid);
1472 		di->kdcomp->kdc_resid = resid;
1473 		return (EAGAIN);
1474 	}
1475 	return (_dump_append(di, base, 0, length));
1476 }
1477 
1478 /*
1479  * Write kernel dump headers at the beginning and end of the dump extent.
1480  * Write the kernel dump encryption key after the leading header if we were
1481  * configured to do so.
1482  */
1483 static int
1484 dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh)
1485 {
1486 #ifdef EKCD
1487 	struct kerneldumpcrypto *kdc;
1488 #endif
1489 	void *buf, *key;
1490 	size_t hdrsz;
1491 	uint64_t extent;
1492 	uint32_t keysize;
1493 	int error;
1494 
1495 	hdrsz = sizeof(*kdh);
1496 	if (hdrsz > di->blocksize)
1497 		return (ENOMEM);
1498 
1499 #ifdef EKCD
1500 	kdc = di->kdcrypto;
1501 	key = kdc->kdc_dumpkey;
1502 	keysize = kerneldumpcrypto_dumpkeysize(kdc);
1503 #else
1504 	key = NULL;
1505 	keysize = 0;
1506 #endif
1507 
1508 	/*
1509 	 * If the dump device has special handling for headers, let it take care
1510 	 * of writing them out.
1511 	 */
1512 	if (di->dumper_hdr != NULL)
1513 		return (di->dumper_hdr(di, kdh, key, keysize));
1514 
1515 	if (hdrsz == di->blocksize)
1516 		buf = kdh;
1517 	else {
1518 		buf = di->blockbuf;
1519 		memset(buf, 0, di->blocksize);
1520 		memcpy(buf, kdh, hdrsz);
1521 	}
1522 
1523 	extent = dtoh64(kdh->dumpextent);
1524 #ifdef EKCD
1525 	if (kdc != NULL) {
1526 		error = dump_write(di, kdc->kdc_dumpkey, 0,
1527 		    di->mediaoffset + di->mediasize - di->blocksize - extent -
1528 		    keysize, keysize);
1529 		if (error != 0)
1530 			return (error);
1531 	}
1532 #endif
1533 
1534 	error = dump_write(di, buf, 0,
1535 	    di->mediaoffset + di->mediasize - 2 * di->blocksize - extent -
1536 	    keysize, di->blocksize);
1537 	if (error == 0)
1538 		error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize -
1539 		    di->blocksize, di->blocksize);
1540 	return (error);
1541 }
1542 
1543 /*
1544  * Don't touch the first SIZEOF_METADATA bytes on the dump device.  This is to
1545  * protect us from metadata and metadata from us.
1546  */
1547 #define	SIZEOF_METADATA		(64 * 1024)
1548 
1549 /*
1550  * Do some preliminary setup for a kernel dump: initialize state for encryption,
1551  * if requested, and make sure that we have enough space on the dump device.
1552  *
1553  * We set things up so that the dump ends before the last sector of the dump
1554  * device, at which the trailing header is written.
1555  *
1556  *     +-----------+------+-----+----------------------------+------+
1557  *     |           | lhdr | key |    ... kernel dump ...     | thdr |
1558  *     +-----------+------+-----+----------------------------+------+
1559  *                   1 blk  opt <------- dump extent --------> 1 blk
1560  *
1561  * Dumps written using dump_append() start at the beginning of the extent.
1562  * Uncompressed dumps will use the entire extent, but compressed dumps typically
1563  * will not. The true length of the dump is recorded in the leading and trailing
1564  * headers once the dump has been completed.
1565  *
1566  * The dump device may provide a callback, in which case it will initialize
1567  * dumpoff and take care of laying out the headers.
1568  */
1569 int
1570 dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh)
1571 {
1572 	uint64_t dumpextent, span;
1573 	uint32_t keysize;
1574 	int error;
1575 
1576 #ifdef EKCD
1577 	error = kerneldumpcrypto_init(di->kdcrypto);
1578 	if (error != 0)
1579 		return (error);
1580 	keysize = kerneldumpcrypto_dumpkeysize(di->kdcrypto);
1581 #else
1582 	error = 0;
1583 	keysize = 0;
1584 #endif
1585 
1586 	if (di->dumper_start != NULL) {
1587 		error = di->dumper_start(di);
1588 	} else {
1589 		dumpextent = dtoh64(kdh->dumpextent);
1590 		span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize +
1591 		    keysize;
1592 		if (di->mediasize < span) {
1593 			if (di->kdcomp == NULL)
1594 				return (E2BIG);
1595 
1596 			/*
1597 			 * We don't yet know how much space the compressed dump
1598 			 * will occupy, so try to use the whole swap partition
1599 			 * (minus the first 64KB) in the hope that the
1600 			 * compressed dump will fit. If that doesn't turn out to
1601 			 * be enough, the bounds checking in dump_write()
1602 			 * will catch us and cause the dump to fail.
1603 			 */
1604 			dumpextent = di->mediasize - span + dumpextent;
1605 			kdh->dumpextent = htod64(dumpextent);
1606 		}
1607 
1608 		/*
1609 		 * The offset at which to begin writing the dump.
1610 		 */
1611 		di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize -
1612 		    dumpextent;
1613 	}
1614 	di->origdumpoff = di->dumpoff;
1615 	return (error);
1616 }
1617 
1618 static int
1619 _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical,
1620     size_t length)
1621 {
1622 	int error;
1623 
1624 #ifdef EKCD
1625 	if (di->kdcrypto != NULL)
1626 		error = dump_encrypted_write(di, virtual, physical, di->dumpoff,
1627 		    length);
1628 	else
1629 #endif
1630 		error = dump_write(di, virtual, physical, di->dumpoff, length);
1631 	if (error == 0)
1632 		di->dumpoff += length;
1633 	return (error);
1634 }
1635 
1636 /*
1637  * Write to the dump device starting at dumpoff. When compression is enabled,
1638  * writes to the device will be performed using a callback that gets invoked
1639  * when the compression stream's output buffer is full.
1640  */
1641 int
1642 dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical,
1643     size_t length)
1644 {
1645 	void *buf;
1646 
1647 	if (di->kdcomp != NULL) {
1648 		/* Bounce through a buffer to avoid CRC errors. */
1649 		if (length > di->maxiosize)
1650 			return (EINVAL);
1651 		buf = di->kdcomp->kdc_buf;
1652 		memmove(buf, virtual, length);
1653 		return (compressor_write(di->kdcomp->kdc_stream, buf, length));
1654 	}
1655 	return (_dump_append(di, virtual, physical, length));
1656 }
1657 
1658 /*
1659  * Write to the dump device at the specified offset.
1660  */
1661 int
1662 dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
1663     off_t offset, size_t length)
1664 {
1665 	int error;
1666 
1667 	error = dump_check_bounds(di, offset, length);
1668 	if (error != 0)
1669 		return (error);
1670 	return (di->dumper(di->priv, virtual, physical, offset, length));
1671 }
1672 
1673 /*
1674  * Perform kernel dump finalization: flush the compression stream, if necessary,
1675  * write the leading and trailing kernel dump headers now that we know the true
1676  * length of the dump, and optionally write the encryption key following the
1677  * leading header.
1678  */
1679 int
1680 dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh)
1681 {
1682 	int error;
1683 
1684 	if (di->kdcomp != NULL) {
1685 		error = compressor_flush(di->kdcomp->kdc_stream);
1686 		if (error == EAGAIN) {
1687 			/* We have residual data in di->blockbuf. */
1688 			error = dump_write(di, di->blockbuf, 0, di->dumpoff,
1689 			    di->blocksize);
1690 			di->dumpoff += di->kdcomp->kdc_resid;
1691 			di->kdcomp->kdc_resid = 0;
1692 		}
1693 		if (error != 0)
1694 			return (error);
1695 
1696 		/*
1697 		 * We now know the size of the compressed dump, so update the
1698 		 * header accordingly and recompute parity.
1699 		 */
1700 		kdh->dumplength = htod64(di->dumpoff - di->origdumpoff);
1701 		kdh->parity = 0;
1702 		kdh->parity = kerneldump_parity(kdh);
1703 
1704 		compressor_reset(di->kdcomp->kdc_stream);
1705 	}
1706 
1707 	error = dump_write_headers(di, kdh);
1708 	if (error != 0)
1709 		return (error);
1710 
1711 	(void)dump_write(di, NULL, 0, 0, 0);
1712 	return (0);
1713 }
1714 
1715 void
1716 dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh,
1717     char *magic, uint32_t archver, uint64_t dumplen)
1718 {
1719 	size_t dstsize;
1720 
1721 	bzero(kdh, sizeof(*kdh));
1722 	strlcpy(kdh->magic, magic, sizeof(kdh->magic));
1723 	strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
1724 	kdh->version = htod32(KERNELDUMPVERSION);
1725 	kdh->architectureversion = htod32(archver);
1726 	kdh->dumplength = htod64(dumplen);
1727 	kdh->dumpextent = kdh->dumplength;
1728 	kdh->dumptime = htod64(time_second);
1729 #ifdef EKCD
1730 	kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto));
1731 #else
1732 	kdh->dumpkeysize = 0;
1733 #endif
1734 	kdh->blocksize = htod32(di->blocksize);
1735 	strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
1736 	dstsize = sizeof(kdh->versionstring);
1737 	if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize)
1738 		kdh->versionstring[dstsize - 2] = '\n';
1739 	if (panicstr != NULL)
1740 		strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
1741 	if (di->kdcomp != NULL)
1742 		kdh->compression = di->kdcomp->kdc_format;
1743 	kdh->parity = kerneldump_parity(kdh);
1744 }
1745 
1746 #ifdef DDB
1747 DB_SHOW_COMMAND(panic, db_show_panic)
1748 {
1749 
1750 	if (panicstr == NULL)
1751 		db_printf("panicstr not set\n");
1752 	else
1753 		db_printf("panic: %s\n", panicstr);
1754 }
1755 #endif
1756