1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2013 Joyent, Inc. All rights reserved.
26 * Copyright 2024 Oxide Computer Company
27 */
28
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/errno.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/swap.h>
37 #include <sys/file.h>
38 #include <sys/proc.h>
39 #include <sys/var.h>
40 #include <sys/uadmin.h>
41 #include <sys/signal.h>
42 #include <sys/time.h>
43 #include <vm/seg_kmem.h>
44 #include <sys/modctl.h>
45 #include <sys/callb.h>
46 #include <sys/dumphdr.h>
47 #include <sys/debug.h>
48 #include <sys/ftrace.h>
49 #include <sys/cmn_err.h>
50 #include <sys/panic.h>
51 #include <sys/ddi.h>
52 #include <sys/ddi_periodic.h>
53 #include <sys/sunddi.h>
54 #include <sys/policy.h>
55 #include <sys/zone.h>
56 #include <sys/condvar.h>
57 #include <sys/thread.h>
58 #include <sys/sdt.h>
59
60 /*
61 * Administrivia system call. We provide this in two flavors: one for calling
62 * from the system call path (uadmin), and the other for calling from elsewhere
63 * within the kernel (kadmin). Callers must beware that certain uadmin cmd
64 * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
65 */
66
/*
 * Semaphore acquired by kadmin() on the A_SHUTDOWN path so that fsflush
 * cannot run again while the filesystems are unmounted and synced.
 */
extern ksema_t fsflush_sema;
/*
 * ualock and uacond serialize the A_SHUTDOWN, A_REBOOT, A_REMOUNT and
 * A_CONFIG commands in kadmin().  ua_shutdown_thread is the thread currently
 * performing one of them, or NULL when none is in progress; other callers
 * sleep on uacond until it is cleared.
 */
kmutex_t ualock;
kcondvar_t uacond;
kthread_t *ua_shutdown_thread = NULL;

/* Set to 1 by kadmin() once an A_SHUTDOWN is under way. */
int sys_shutdown = 0;
/*
 * NOTE(review): not referenced by the code visible in this file; presumably
 * consumed by the fast reboot support elsewhere -- confirm.
 */
volatile int fastreboot_dryrun = 0;
74
75 /*
76 * Kill all user processes in said zone. A special argument of ALL_ZONES is
77 * passed in when the system as a whole is shutting down. The lack of per-zone
78 * process lists is likely to make the following a performance bottleneck on a
79 * system with many zones.
80 */
81 void
killall(zoneid_t zoneid)82 killall(zoneid_t zoneid)
83 {
84 proc_t *p;
85
86 ASSERT(zoneid != GLOBAL_ZONEID);
87 /*
88 * Kill all processes except kernel daemons and ourself.
89 * Make a first pass to stop all processes so they won't
90 * be trying to restart children as we kill them.
91 */
92 mutex_enter(&pidlock);
93 for (p = practive; p != NULL; p = p->p_next) {
94 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
95 p->p_exec != NULLVP && /* kernel daemons */
96 p->p_as != &kas &&
97 p->p_stat != SZOMB) {
98 mutex_enter(&p->p_lock);
99 p->p_flag |= SNOWAIT;
100 sigtoproc(p, NULL, SIGSTOP);
101 mutex_exit(&p->p_lock);
102 }
103 }
104 p = practive;
105 while (p != NULL) {
106 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
107 p->p_exec != NULLVP && /* kernel daemons */
108 p->p_as != &kas &&
109 p->p_stat != SIDL &&
110 p->p_stat != SZOMB) {
111 mutex_enter(&p->p_lock);
112 if (sigismember(&p->p_sig, SIGKILL)) {
113 mutex_exit(&p->p_lock);
114 p = p->p_next;
115 } else {
116 sigtoproc(p, NULL, SIGKILL);
117 mutex_exit(&p->p_lock);
118 (void) cv_reltimedwait(&p->p_srwchan_cv,
119 &pidlock, hz, TR_CLOCK_TICK);
120 p = practive;
121 }
122 } else {
123 p = p->p_next;
124 }
125 }
126 mutex_exit(&pidlock);
127 }
128
129 /*
130 * Emits an SDT probe (sdt:::test) with 7 arguments. This is used to test
131 * arguments are passed properly, whether via registers or on the stack.
132 */
133 static void
sdt_test_args(void)134 sdt_test_args(void)
135 {
136 DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
137 int, 6, int, 7);
138 }
139
140 /*
141 * Same as above, but with the probe called as a tail call.
142 * Unfortunately, gcc doesn't yet have a [[musttail]] attribute that would
143 * either generate a tail call or error at compile time if it can't. Instead
144 * we use a separate function along with a optimize pragma. On x86, this does
145 * indeed generate a tail call as written.
146 */
147 #pragma GCC push_options
148 #pragma GCC optimize("optimize-sibling-calls")
149 static void
sdt_test_args_tail_call(int a,int b,int c,int d,int e,int f,int g)150 sdt_test_args_tail_call(int a, int b, int c, int d, int e, int f, int g)
151 {
152 DTRACE_PROBE7(test, int, a, int, b, int, c, int, d,
153 int, e, int, f, int, g);
154 }
155 #pragma GCC pop_options
156
157 int
kadmin(int cmd,int fcn,void * mdep,cred_t * credp)158 kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
159 {
160 int error = 0;
161 char *buf;
162 size_t buflen = 0;
163 boolean_t invoke_cb = B_FALSE;
164
165 /*
166 * We might be called directly by the kernel's fault-handling code, so
167 * we can't assert that the caller is in the global zone.
168 */
169
170 /*
171 * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
172 * and that we have appropriate privileges for this action.
173 */
174 switch (cmd) {
175 case A_FTRACE:
176 case A_SHUTDOWN:
177 case A_REBOOT:
178 case A_REMOUNT:
179 case A_FREEZE:
180 case A_DUMP:
181 case A_SDTTEST:
182 case A_CONFIG:
183 if (secpolicy_sys_config(credp, B_FALSE) != 0)
184 return (EPERM);
185 break;
186
187 default:
188 return (EINVAL);
189 }
190
191 /*
192 * Serialize these operations on ualock. If it is held, the
193 * system should shutdown, reboot, or remount shortly, unless there is
194 * an error. We need a cv rather than just a mutex because proper
195 * functioning of A_REBOOT relies on being able to interrupt blocked
196 * userland callers.
197 *
198 * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG.
199 * Other commands should never return.
200 */
201 if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT ||
202 cmd == A_CONFIG) {
203 mutex_enter(&ualock);
204 while (ua_shutdown_thread != NULL) {
205 if (cv_wait_sig(&uacond, &ualock) == 0) {
206 /*
207 * If we were interrupted, leave, and handle
208 * the signal (or exit, depending on what
209 * happened)
210 */
211 mutex_exit(&ualock);
212 return (EINTR);
213 }
214 }
215 ua_shutdown_thread = curthread;
216 mutex_exit(&ualock);
217 }
218
219 switch (cmd) {
220 case A_SHUTDOWN:
221 {
222 proc_t *p = ttoproc(curthread);
223
224 /*
225 * Release (almost) all of our own resources if we are called
226 * from a user context, however if we are calling kadmin() from
227 * a kernel context then we do not release these resources.
228 */
229 if (p != &p0) {
230 proc_is_exiting(p);
231 if ((error = exitlwps(0)) != 0) {
232 /*
233 * Another thread in this process also called
234 * exitlwps().
235 */
236 mutex_enter(&ualock);
237 ua_shutdown_thread = NULL;
238 cv_signal(&uacond);
239 mutex_exit(&ualock);
240 return (error);
241 }
242 mutex_enter(&p->p_lock);
243 p->p_flag |= SNOWAIT;
244 sigfillset(&p->p_ignore);
245 curthread->t_lwp->lwp_cursig = 0;
246 curthread->t_lwp->lwp_extsig = 0;
247 if (p->p_exec) {
248 vnode_t *exec_vp = p->p_exec;
249 p->p_exec = NULLVP;
250 mutex_exit(&p->p_lock);
251 VN_RELE(exec_vp);
252 } else {
253 mutex_exit(&p->p_lock);
254 }
255
256 pollcleanup();
257 closeall(P_FINFO(curproc));
258 relvm();
259
260 } else {
261 /*
262 * Reset t_cred if not set because much of the
263 * filesystem code depends on CRED() being valid.
264 */
265 if (curthread->t_cred == NULL)
266 curthread->t_cred = kcred;
267 }
268
269 /* indicate shutdown in progress */
270 sys_shutdown = 1;
271
272 /*
273 * Communcate that init shouldn't be restarted.
274 */
275 zone_shutdown_global();
276
277 killall(ALL_ZONES);
278 /*
279 * If we are calling kadmin() from a kernel context then we
280 * do not release these resources.
281 */
282 if (ttoproc(curthread) != &p0) {
283 VN_RELE(PTOU(curproc)->u_cdir);
284 if (PTOU(curproc)->u_rdir)
285 VN_RELE(PTOU(curproc)->u_rdir);
286 if (PTOU(curproc)->u_cwd)
287 refstr_rele(PTOU(curproc)->u_cwd);
288
289 PTOU(curproc)->u_cdir = rootdir;
290 PTOU(curproc)->u_rdir = NULL;
291 PTOU(curproc)->u_cwd = NULL;
292 }
293
294 /*
295 * Allow the reboot/halt/poweroff code a chance to do
296 * anything it needs to whilst we still have filesystems
297 * mounted, like loading any modules necessary for later
298 * performing the actual poweroff.
299 */
300 if ((mdep != NULL) && (*(char *)mdep == '/')) {
301 buf = i_convert_boot_device_name(mdep, NULL, &buflen);
302 mdpreboot(cmd, fcn, buf);
303 } else
304 mdpreboot(cmd, fcn, mdep);
305
306 /*
307 * Allow fsflush to finish running and then prevent it
308 * from ever running again so that vfs_unmountall() and
309 * vfs_syncall() can acquire the vfs locks they need.
310 */
311 sema_p(&fsflush_sema);
312 (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, 0);
313
314 vfs_unmountall();
315 (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
316 vfs_syncall();
317
318 /*
319 * Check for (and unregister) any DDI periodic handlers that
320 * still exist, as they most likely constitute resource leaks:
321 */
322 ddi_periodic_fini();
323
324 dump_ereports();
325 dump_messages();
326
327 invoke_cb = B_TRUE;
328 }
329 /* FALLTHROUGH */
330
331 case A_REBOOT:
332 if ((mdep != NULL) && (*(char *)mdep == '/')) {
333 buf = i_convert_boot_device_name(mdep, NULL, &buflen);
334 mdboot(cmd, fcn, buf, invoke_cb);
335 } else
336 mdboot(cmd, fcn, mdep, invoke_cb);
337 /* no return expected */
338 break;
339
340 case A_CONFIG:
341 switch (fcn) {
342 case AD_UPDATE_BOOT_CONFIG:
343 #ifndef __sparc
344 {
345 extern void fastboot_update_config(const char *);
346
347 fastboot_update_config(mdep);
348 }
349 #endif
350
351 break;
352 }
353 /* Let other threads enter the shutdown path now */
354 mutex_enter(&ualock);
355 ua_shutdown_thread = NULL;
356 cv_signal(&uacond);
357 mutex_exit(&ualock);
358 break;
359
360 case A_REMOUNT:
361 (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
362 /* Let other threads enter the shutdown path now */
363 mutex_enter(&ualock);
364 ua_shutdown_thread = NULL;
365 cv_signal(&uacond);
366 mutex_exit(&ualock);
367 break;
368
369 case A_FREEZE:
370 {
371 /*
372 * This is the entrypoint for all suspend/resume actions.
373 */
374 extern int cpr(int, void *);
375
376 if (modload("misc", "cpr") == -1)
377 return (ENOTSUP);
378 /* Let the CPR module decide what to do with mdep */
379 error = cpr(fcn, mdep);
380 break;
381 }
382
383 case A_FTRACE:
384 {
385 switch (fcn) {
386 case AD_FTRACE_START:
387 (void) FTRACE_START();
388 break;
389 case AD_FTRACE_STOP:
390 (void) FTRACE_STOP();
391 break;
392 default:
393 error = EINVAL;
394 }
395 break;
396 }
397
398 case A_DUMP:
399 {
400 if (fcn == AD_NOSYNC) {
401 in_sync = 1;
402 break;
403 }
404
405 panic_bootfcn = fcn;
406 panic_forced = 1;
407
408 if ((mdep != NULL) && (*(char *)mdep == '/')) {
409 panic_bootstr = i_convert_boot_device_name(mdep,
410 NULL, &buflen);
411 } else
412 panic_bootstr = mdep;
413
414 #ifndef __sparc
415 extern void fastboot_update_and_load(int, char *);
416
417 fastboot_update_and_load(fcn, mdep);
418 #endif
419
420 panic("forced crash dump initiated at user request");
421 /*NOTREACHED*/
422 }
423
424 case A_SDTTEST:
425 {
426 sdt_test_args();
427 sdt_test_args_tail_call(1, 2, 3, 4, 5, 6, 7);
428 break;
429 }
430
431 default:
432 error = EINVAL;
433 }
434
435 return (error);
436 }
437
438 int
uadmin(int cmd,int fcn,uintptr_t mdep)439 uadmin(int cmd, int fcn, uintptr_t mdep)
440 {
441 int error = 0, rv = 0;
442 size_t nbytes = 0;
443 cred_t *credp = CRED();
444 char *bootargs = NULL;
445 int reset_status = 0;
446
447 if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) {
448 ddi_walk_devs(ddi_root_node(), check_driver_quiesce,
449 &reset_status);
450 if (reset_status != 0)
451 return (EIO);
452 else
453 return (0);
454 }
455
456 /*
457 * The swapctl system call doesn't have its own entry point: it uses
458 * uadmin as a wrapper so we just call it directly from here.
459 */
460 if (cmd == A_SWAPCTL) {
461 if (get_udatamodel() == DATAMODEL_NATIVE)
462 error = swapctl(fcn, (void *)mdep, &rv);
463 #if defined(_SYSCALL32_IMPL)
464 else
465 error = swapctl32(fcn, (void *)mdep, &rv);
466 #endif /* _SYSCALL32_IMPL */
467 return (error ? set_errno(error) : rv);
468 }
469
470 /*
471 * Certain subcommands intepret a non-NULL mdep value as a pointer to
472 * a boot string. We pull that in as bootargs, if applicable.
473 */
474 if (mdep != (uintptr_t)NULL &&
475 (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP ||
476 cmd == A_FREEZE || cmd == A_CONFIG)) {
477 bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
478 if ((error = copyinstr((const char *)mdep, bootargs,
479 BOOTARGS_MAX, &nbytes)) != 0) {
480 kmem_free(bootargs, BOOTARGS_MAX);
481 return (set_errno(error));
482 }
483 }
484
485 /*
486 * Invoke the appropriate kadmin() routine.
487 */
488 if (getzoneid() != GLOBAL_ZONEID)
489 error = zone_kadmin(cmd, fcn, bootargs, credp);
490 else
491 error = kadmin(cmd, fcn, bootargs, credp);
492
493 if (bootargs != NULL)
494 kmem_free(bootargs, BOOTARGS_MAX);
495 return (error ? set_errno(error) : 0);
496 }
497