xref: /illumos-gate/usr/src/uts/common/syscall/uadmin.c (revision 8119dad84d6416f13557b0ba8e2aaf9064cbcfd3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2013 Joyent, Inc.  All rights reserved.
26  * Copyright 2024 Oxide Computer Company
27  */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/errno.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/swap.h>
37 #include <sys/file.h>
38 #include <sys/proc.h>
39 #include <sys/var.h>
40 #include <sys/uadmin.h>
41 #include <sys/signal.h>
42 #include <sys/time.h>
43 #include <vm/seg_kmem.h>
44 #include <sys/modctl.h>
45 #include <sys/callb.h>
46 #include <sys/dumphdr.h>
47 #include <sys/debug.h>
48 #include <sys/ftrace.h>
49 #include <sys/cmn_err.h>
50 #include <sys/panic.h>
51 #include <sys/ddi.h>
52 #include <sys/ddi_periodic.h>
53 #include <sys/sunddi.h>
54 #include <sys/policy.h>
55 #include <sys/zone.h>
56 #include <sys/condvar.h>
57 #include <sys/thread.h>
58 #include <sys/sdt.h>
59 
60 /*
61  * Administrivia system call.  We provide this in two flavors: one for calling
62  * from the system call path (uadmin), and the other for calling from elsewhere
63  * within the kernel (kadmin).  Callers must beware that certain uadmin cmd
64  * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
65  */
66 
67 extern ksema_t fsflush_sema;
68 kmutex_t ualock;
69 kcondvar_t uacond;
70 kthread_t *ua_shutdown_thread = NULL;
71 
72 int sys_shutdown = 0;
73 volatile int fastreboot_dryrun = 0;
74 
75 /*
76  * Kill all user processes in said zone.  A special argument of ALL_ZONES is
77  * passed in when the system as a whole is shutting down.  The lack of per-zone
78  * process lists is likely to make the following a performance bottleneck on a
79  * system with many zones.
80  */
81 void
82 killall(zoneid_t zoneid)
83 {
84 	proc_t *p;
85 
86 	ASSERT(zoneid != GLOBAL_ZONEID);
87 	/*
88 	 * Kill all processes except kernel daemons and ourself.
89 	 * Make a first pass to stop all processes so they won't
90 	 * be trying to restart children as we kill them.
91 	 */
92 	mutex_enter(&pidlock);
93 	for (p = practive; p != NULL; p = p->p_next) {
94 		if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
95 		    p->p_exec != NULLVP &&	/* kernel daemons */
96 		    p->p_as != &kas &&
97 		    p->p_stat != SZOMB) {
98 			mutex_enter(&p->p_lock);
99 			p->p_flag |= SNOWAIT;
100 			sigtoproc(p, NULL, SIGSTOP);
101 			mutex_exit(&p->p_lock);
102 		}
103 	}
104 	p = practive;
105 	while (p != NULL) {
106 		if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
107 		    p->p_exec != NULLVP &&	/* kernel daemons */
108 		    p->p_as != &kas &&
109 		    p->p_stat != SIDL &&
110 		    p->p_stat != SZOMB) {
111 			mutex_enter(&p->p_lock);
112 			if (sigismember(&p->p_sig, SIGKILL)) {
113 				mutex_exit(&p->p_lock);
114 				p = p->p_next;
115 			} else {
116 				sigtoproc(p, NULL, SIGKILL);
117 				mutex_exit(&p->p_lock);
118 				(void) cv_reltimedwait(&p->p_srwchan_cv,
119 				    &pidlock, hz, TR_CLOCK_TICK);
120 				p = practive;
121 			}
122 		} else {
123 			p = p->p_next;
124 		}
125 	}
126 	mutex_exit(&pidlock);
127 }
128 
129 /*
130  * Emits an SDT probe (sdt:::test) with 7 arguments.  This is used to test
131  * arguments are passed properly, whether via registers or on the stack.
132  */
133 static void
134 sdt_test_args(void)
135 {
136 	DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
137 	    int, 6, int, 7);
138 }
139 
140 /*
141  * Same as above, but with the probe called as a tail call.
142  * Unfortunately, gcc doesn't yet have a [[musttail]] attribute that would
143  * either generate a tail call or error at compile time if it can't.  Instead
144  * we use a separate function along with a optimize pragma.  On x86, this does
145  * indeed generate a tail call as written.
146  */
147 #pragma GCC push_options
148 #pragma GCC optimize("optimize-sibling-calls")
149 static void
150 sdt_test_args_tail_call(int a, int b, int c, int d, int e, int f, int g)
151 {
152 	DTRACE_PROBE7(test, int, a, int, b, int, c, int, d,
153 	    int, e, int, f, int, g);
154 }
155 #pragma GCC pop_options
156 
157 int
158 kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
159 {
160 	int error = 0;
161 	char *buf;
162 	size_t buflen = 0;
163 	boolean_t invoke_cb = B_FALSE;
164 
165 	/*
166 	 * We might be called directly by the kernel's fault-handling code, so
167 	 * we can't assert that the caller is in the global zone.
168 	 */
169 
170 	/*
171 	 * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
172 	 * and that we have appropriate privileges for this action.
173 	 */
174 	switch (cmd) {
175 	case A_FTRACE:
176 	case A_SHUTDOWN:
177 	case A_REBOOT:
178 	case A_REMOUNT:
179 	case A_FREEZE:
180 	case A_DUMP:
181 	case A_SDTTEST:
182 	case A_CONFIG:
183 		if (secpolicy_sys_config(credp, B_FALSE) != 0)
184 			return (EPERM);
185 		break;
186 
187 	default:
188 		return (EINVAL);
189 	}
190 
191 	/*
192 	 * Serialize these operations on ualock.  If it is held, the
193 	 * system should shutdown, reboot, or remount shortly, unless there is
194 	 * an error.  We need a cv rather than just a mutex because proper
195 	 * functioning of A_REBOOT relies on being able to interrupt blocked
196 	 * userland callers.
197 	 *
198 	 * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG.
199 	 * Other commands should never return.
200 	 */
201 	if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT ||
202 	    cmd == A_CONFIG) {
203 		mutex_enter(&ualock);
204 		while (ua_shutdown_thread != NULL) {
205 			if (cv_wait_sig(&uacond, &ualock) == 0) {
206 				/*
207 				 * If we were interrupted, leave, and handle
208 				 * the signal (or exit, depending on what
209 				 * happened)
210 				 */
211 				mutex_exit(&ualock);
212 				return (EINTR);
213 			}
214 		}
215 		ua_shutdown_thread = curthread;
216 		mutex_exit(&ualock);
217 	}
218 
219 	switch (cmd) {
220 	case A_SHUTDOWN:
221 	{
222 		proc_t *p = ttoproc(curthread);
223 
224 		/*
225 		 * Release (almost) all of our own resources if we are called
226 		 * from a user context, however if we are calling kadmin() from
227 		 * a kernel context then we do not release these resources.
228 		 */
229 		if (p != &p0) {
230 			proc_is_exiting(p);
231 			if ((error = exitlwps(0)) != 0) {
232 				/*
233 				 * Another thread in this process also called
234 				 * exitlwps().
235 				 */
236 				mutex_enter(&ualock);
237 				ua_shutdown_thread = NULL;
238 				cv_signal(&uacond);
239 				mutex_exit(&ualock);
240 				return (error);
241 			}
242 			mutex_enter(&p->p_lock);
243 			p->p_flag |= SNOWAIT;
244 			sigfillset(&p->p_ignore);
245 			curthread->t_lwp->lwp_cursig = 0;
246 			curthread->t_lwp->lwp_extsig = 0;
247 			if (p->p_exec) {
248 				vnode_t *exec_vp = p->p_exec;
249 				p->p_exec = NULLVP;
250 				mutex_exit(&p->p_lock);
251 				VN_RELE(exec_vp);
252 			} else {
253 				mutex_exit(&p->p_lock);
254 			}
255 
256 			pollcleanup();
257 			closeall(P_FINFO(curproc));
258 			relvm();
259 
260 		} else {
261 			/*
262 			 * Reset t_cred if not set because much of the
263 			 * filesystem code depends on CRED() being valid.
264 			 */
265 			if (curthread->t_cred == NULL)
266 				curthread->t_cred = kcred;
267 		}
268 
269 		/* indicate shutdown in progress */
270 		sys_shutdown = 1;
271 
272 		/*
273 		 * Communcate that init shouldn't be restarted.
274 		 */
275 		zone_shutdown_global();
276 
277 		killall(ALL_ZONES);
278 		/*
279 		 * If we are calling kadmin() from a kernel context then we
280 		 * do not release these resources.
281 		 */
282 		if (ttoproc(curthread) != &p0) {
283 			VN_RELE(PTOU(curproc)->u_cdir);
284 			if (PTOU(curproc)->u_rdir)
285 				VN_RELE(PTOU(curproc)->u_rdir);
286 			if (PTOU(curproc)->u_cwd)
287 				refstr_rele(PTOU(curproc)->u_cwd);
288 
289 			PTOU(curproc)->u_cdir = rootdir;
290 			PTOU(curproc)->u_rdir = NULL;
291 			PTOU(curproc)->u_cwd = NULL;
292 		}
293 
294 		/*
295 		 * Allow the reboot/halt/poweroff code a chance to do
296 		 * anything it needs to whilst we still have filesystems
297 		 * mounted, like loading any modules necessary for later
298 		 * performing the actual poweroff.
299 		 */
300 		if ((mdep != NULL) && (*(char *)mdep == '/')) {
301 			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
302 			mdpreboot(cmd, fcn, buf);
303 		} else
304 			mdpreboot(cmd, fcn, mdep);
305 
306 		/*
307 		 * Allow fsflush to finish running and then prevent it
308 		 * from ever running again so that vfs_unmountall() and
309 		 * vfs_syncall() can acquire the vfs locks they need.
310 		 */
311 		sema_p(&fsflush_sema);
312 		(void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, 0);
313 
314 		vfs_unmountall();
315 		(void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
316 		vfs_syncall();
317 
318 		/*
319 		 * Check for (and unregister) any DDI periodic handlers that
320 		 * still exist, as they most likely constitute resource leaks:
321 		 */
322 		ddi_periodic_fini();
323 
324 		dump_ereports();
325 		dump_messages();
326 
327 		invoke_cb = B_TRUE;
328 	}
329 	/* FALLTHROUGH */
330 
331 	case A_REBOOT:
332 		if ((mdep != NULL) && (*(char *)mdep == '/')) {
333 			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
334 			mdboot(cmd, fcn, buf, invoke_cb);
335 		} else
336 			mdboot(cmd, fcn, mdep, invoke_cb);
337 		/* no return expected */
338 		break;
339 
340 	case A_CONFIG:
341 		switch (fcn) {
342 		case AD_UPDATE_BOOT_CONFIG:
343 #ifndef	__sparc
344 		{
345 			extern void fastboot_update_config(const char *);
346 
347 			fastboot_update_config(mdep);
348 		}
349 #endif
350 
351 			break;
352 		}
353 		/* Let other threads enter the shutdown path now */
354 		mutex_enter(&ualock);
355 		ua_shutdown_thread = NULL;
356 		cv_signal(&uacond);
357 		mutex_exit(&ualock);
358 		break;
359 
360 	case A_REMOUNT:
361 		(void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
362 		/* Let other threads enter the shutdown path now */
363 		mutex_enter(&ualock);
364 		ua_shutdown_thread = NULL;
365 		cv_signal(&uacond);
366 		mutex_exit(&ualock);
367 		break;
368 
369 	case A_FREEZE:
370 	{
371 		/*
372 		 * This is the entrypoint for all suspend/resume actions.
373 		 */
374 		extern int cpr(int, void *);
375 
376 		if (modload("misc", "cpr") == -1)
377 			return (ENOTSUP);
378 		/* Let the CPR module decide what to do with mdep */
379 		error = cpr(fcn, mdep);
380 		break;
381 	}
382 
383 	case A_FTRACE:
384 	{
385 		switch (fcn) {
386 		case AD_FTRACE_START:
387 			(void) FTRACE_START();
388 			break;
389 		case AD_FTRACE_STOP:
390 			(void) FTRACE_STOP();
391 			break;
392 		default:
393 			error = EINVAL;
394 		}
395 		break;
396 	}
397 
398 	case A_DUMP:
399 	{
400 		if (fcn == AD_NOSYNC) {
401 			in_sync = 1;
402 			break;
403 		}
404 
405 		panic_bootfcn = fcn;
406 		panic_forced = 1;
407 
408 		if ((mdep != NULL) && (*(char *)mdep == '/')) {
409 			panic_bootstr = i_convert_boot_device_name(mdep,
410 			    NULL, &buflen);
411 		} else
412 			panic_bootstr = mdep;
413 
414 #ifndef	__sparc
415 		extern void fastboot_update_and_load(int, char *);
416 
417 		fastboot_update_and_load(fcn, mdep);
418 #endif
419 
420 		panic("forced crash dump initiated at user request");
421 		/*NOTREACHED*/
422 	}
423 
424 	case A_SDTTEST:
425 	{
426 		sdt_test_args();
427 		sdt_test_args_tail_call(1, 2, 3, 4, 5, 6, 7);
428 		break;
429 	}
430 
431 	default:
432 		error = EINVAL;
433 	}
434 
435 	return (error);
436 }
437 
438 int
439 uadmin(int cmd, int fcn, uintptr_t mdep)
440 {
441 	int error = 0, rv = 0;
442 	size_t nbytes = 0;
443 	cred_t *credp = CRED();
444 	char *bootargs = NULL;
445 	int reset_status = 0;
446 
447 	if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) {
448 		ddi_walk_devs(ddi_root_node(), check_driver_quiesce,
449 		    &reset_status);
450 		if (reset_status != 0)
451 			return (EIO);
452 		else
453 			return (0);
454 	}
455 
456 	/*
457 	 * The swapctl system call doesn't have its own entry point: it uses
458 	 * uadmin as a wrapper so we just call it directly from here.
459 	 */
460 	if (cmd == A_SWAPCTL) {
461 		if (get_udatamodel() == DATAMODEL_NATIVE)
462 			error = swapctl(fcn, (void *)mdep, &rv);
463 #if defined(_SYSCALL32_IMPL)
464 		else
465 			error = swapctl32(fcn, (void *)mdep, &rv);
466 #endif /* _SYSCALL32_IMPL */
467 		return (error ? set_errno(error) : rv);
468 	}
469 
470 	/*
471 	 * Certain subcommands intepret a non-NULL mdep value as a pointer to
472 	 * a boot string.  We pull that in as bootargs, if applicable.
473 	 */
474 	if (mdep != (uintptr_t)NULL &&
475 	    (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP ||
476 	    cmd == A_FREEZE || cmd == A_CONFIG)) {
477 		bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
478 		if ((error = copyinstr((const char *)mdep, bootargs,
479 		    BOOTARGS_MAX, &nbytes)) != 0) {
480 			kmem_free(bootargs, BOOTARGS_MAX);
481 			return (set_errno(error));
482 		}
483 	}
484 
485 	/*
486 	 * Invoke the appropriate kadmin() routine.
487 	 */
488 	if (getzoneid() != GLOBAL_ZONEID)
489 		error = zone_kadmin(cmd, fcn, bootargs, credp);
490 	else
491 		error = kadmin(cmd, fcn, bootargs, credp);
492 
493 	if (bootargs != NULL)
494 		kmem_free(bootargs, BOOTARGS_MAX);
495 	return (error ? set_errno(error) : 0);
496 }
497