1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2013 Joyent, Inc. All rights reserved. 26 * Copyright 2024 Oxide Computer Company 27 */ 28 29 #include <sys/param.h> 30 #include <sys/types.h> 31 #include <sys/sysmacros.h> 32 #include <sys/systm.h> 33 #include <sys/errno.h> 34 #include <sys/vfs.h> 35 #include <sys/vnode.h> 36 #include <sys/swap.h> 37 #include <sys/file.h> 38 #include <sys/proc.h> 39 #include <sys/var.h> 40 #include <sys/uadmin.h> 41 #include <sys/signal.h> 42 #include <sys/time.h> 43 #include <vm/seg_kmem.h> 44 #include <sys/modctl.h> 45 #include <sys/callb.h> 46 #include <sys/dumphdr.h> 47 #include <sys/debug.h> 48 #include <sys/ftrace.h> 49 #include <sys/cmn_err.h> 50 #include <sys/panic.h> 51 #include <sys/ddi.h> 52 #include <sys/ddi_periodic.h> 53 #include <sys/sunddi.h> 54 #include <sys/policy.h> 55 #include <sys/zone.h> 56 #include <sys/condvar.h> 57 #include <sys/thread.h> 58 #include <sys/sdt.h> 59 60 /* 61 * Administrivia system call. We provide this in two flavors: one for calling 62 * from the system call path (uadmin), and the other for calling from elsewhere 63 * within the kernel (kadmin). Callers must beware that certain uadmin cmd 64 * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin. 65 */ 66 67 extern ksema_t fsflush_sema; 68 kmutex_t ualock; 69 kcondvar_t uacond; 70 kthread_t *ua_shutdown_thread = NULL; 71 72 int sys_shutdown = 0; 73 volatile int fastreboot_dryrun = 0; 74 75 /* 76 * Kill all user processes in said zone. A special argument of ALL_ZONES is 77 * passed in when the system as a whole is shutting down. The lack of per-zone 78 * process lists is likely to make the following a performance bottleneck on a 79 * system with many zones. 80 */ 81 void 82 killall(zoneid_t zoneid) 83 { 84 proc_t *p; 85 86 ASSERT(zoneid != GLOBAL_ZONEID); 87 /* 88 * Kill all processes except kernel daemons and ourself. 89 * Make a first pass to stop all processes so they won't 90 * be trying to restart children as we kill them. 91 */ 92 mutex_enter(&pidlock); 93 for (p = practive; p != NULL; p = p->p_next) { 94 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) && 95 p->p_exec != NULLVP && /* kernel daemons */ 96 p->p_as != &kas && 97 p->p_stat != SZOMB) { 98 mutex_enter(&p->p_lock); 99 p->p_flag |= SNOWAIT; 100 sigtoproc(p, NULL, SIGSTOP); 101 mutex_exit(&p->p_lock); 102 } 103 } 104 p = practive; 105 while (p != NULL) { 106 if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) && 107 p->p_exec != NULLVP && /* kernel daemons */ 108 p->p_as != &kas && 109 p->p_stat != SIDL && 110 p->p_stat != SZOMB) { 111 mutex_enter(&p->p_lock); 112 if (sigismember(&p->p_sig, SIGKILL)) { 113 mutex_exit(&p->p_lock); 114 p = p->p_next; 115 } else { 116 sigtoproc(p, NULL, SIGKILL); 117 mutex_exit(&p->p_lock); 118 (void) cv_reltimedwait(&p->p_srwchan_cv, 119 &pidlock, hz, TR_CLOCK_TICK); 120 p = practive; 121 } 122 } else { 123 p = p->p_next; 124 } 125 } 126 mutex_exit(&pidlock); 127 } 128 129 /* 130 * Emits an SDT probe (sdt:::test) with 7 arguments. This is used to test 131 * arguments are passed properly, whether via registers or on the stack. 132 */ 133 static void 134 sdt_test_args(void) 135 { 136 DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5, 137 int, 6, int, 7); 138 } 139 140 /* 141 * Same as above, but with the probe called as a tail call. 142 * Unfortunately, gcc doesn't yet have a [[musttail]] attribute that would 143 * either generate a tail call or error at compile time if it can't. Instead 144 * we use a separate function along with a optimize pragma. On x86, this does 145 * indeed generate a tail call as written. 146 */ 147 #pragma GCC push_options 148 #pragma GCC optimize("optimize-sibling-calls") 149 static void 150 sdt_test_args_tail_call(int a, int b, int c, int d, int e, int f, int g) 151 { 152 DTRACE_PROBE7(test, int, a, int, b, int, c, int, d, 153 int, e, int, f, int, g); 154 } 155 #pragma GCC pop_options 156 157 int 158 kadmin(int cmd, int fcn, void *mdep, cred_t *credp) 159 { 160 int error = 0; 161 char *buf; 162 size_t buflen = 0; 163 boolean_t invoke_cb = B_FALSE; 164 165 /* 166 * We might be called directly by the kernel's fault-handling code, so 167 * we can't assert that the caller is in the global zone. 168 */ 169 170 /* 171 * Make sure that cmd is one of the valid <sys/uadmin.h> command codes 172 * and that we have appropriate privileges for this action. 173 */ 174 switch (cmd) { 175 case A_FTRACE: 176 case A_SHUTDOWN: 177 case A_REBOOT: 178 case A_REMOUNT: 179 case A_FREEZE: 180 case A_DUMP: 181 case A_SDTTEST: 182 case A_CONFIG: 183 if (secpolicy_sys_config(credp, B_FALSE) != 0) 184 return (EPERM); 185 break; 186 187 default: 188 return (EINVAL); 189 } 190 191 /* 192 * Serialize these operations on ualock. If it is held, the 193 * system should shutdown, reboot, or remount shortly, unless there is 194 * an error. We need a cv rather than just a mutex because proper 195 * functioning of A_REBOOT relies on being able to interrupt blocked 196 * userland callers. 197 * 198 * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG. 199 * Other commands should never return. 200 */ 201 if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT || 202 cmd == A_CONFIG) { 203 mutex_enter(&ualock); 204 while (ua_shutdown_thread != NULL) { 205 if (cv_wait_sig(&uacond, &ualock) == 0) { 206 /* 207 * If we were interrupted, leave, and handle 208 * the signal (or exit, depending on what 209 * happened) 210 */ 211 mutex_exit(&ualock); 212 return (EINTR); 213 } 214 } 215 ua_shutdown_thread = curthread; 216 mutex_exit(&ualock); 217 } 218 219 switch (cmd) { 220 case A_SHUTDOWN: 221 { 222 proc_t *p = ttoproc(curthread); 223 224 /* 225 * Release (almost) all of our own resources if we are called 226 * from a user context, however if we are calling kadmin() from 227 * a kernel context then we do not release these resources. 228 */ 229 if (p != &p0) { 230 proc_is_exiting(p); 231 if ((error = exitlwps(0)) != 0) { 232 /* 233 * Another thread in this process also called 234 * exitlwps(). 235 */ 236 mutex_enter(&ualock); 237 ua_shutdown_thread = NULL; 238 cv_signal(&uacond); 239 mutex_exit(&ualock); 240 return (error); 241 } 242 mutex_enter(&p->p_lock); 243 p->p_flag |= SNOWAIT; 244 sigfillset(&p->p_ignore); 245 curthread->t_lwp->lwp_cursig = 0; 246 curthread->t_lwp->lwp_extsig = 0; 247 if (p->p_exec) { 248 vnode_t *exec_vp = p->p_exec; 249 p->p_exec = NULLVP; 250 mutex_exit(&p->p_lock); 251 VN_RELE(exec_vp); 252 } else { 253 mutex_exit(&p->p_lock); 254 } 255 256 pollcleanup(); 257 closeall(P_FINFO(curproc)); 258 relvm(); 259 260 } else { 261 /* 262 * Reset t_cred if not set because much of the 263 * filesystem code depends on CRED() being valid. 264 */ 265 if (curthread->t_cred == NULL) 266 curthread->t_cred = kcred; 267 } 268 269 /* indicate shutdown in progress */ 270 sys_shutdown = 1; 271 272 /* 273 * Communcate that init shouldn't be restarted. 274 */ 275 zone_shutdown_global(); 276 277 killall(ALL_ZONES); 278 /* 279 * If we are calling kadmin() from a kernel context then we 280 * do not release these resources. 281 */ 282 if (ttoproc(curthread) != &p0) { 283 VN_RELE(PTOU(curproc)->u_cdir); 284 if (PTOU(curproc)->u_rdir) 285 VN_RELE(PTOU(curproc)->u_rdir); 286 if (PTOU(curproc)->u_cwd) 287 refstr_rele(PTOU(curproc)->u_cwd); 288 289 PTOU(curproc)->u_cdir = rootdir; 290 PTOU(curproc)->u_rdir = NULL; 291 PTOU(curproc)->u_cwd = NULL; 292 } 293 294 /* 295 * Allow the reboot/halt/poweroff code a chance to do 296 * anything it needs to whilst we still have filesystems 297 * mounted, like loading any modules necessary for later 298 * performing the actual poweroff. 299 */ 300 if ((mdep != NULL) && (*(char *)mdep == '/')) { 301 buf = i_convert_boot_device_name(mdep, NULL, &buflen); 302 mdpreboot(cmd, fcn, buf); 303 } else 304 mdpreboot(cmd, fcn, mdep); 305 306 /* 307 * Allow fsflush to finish running and then prevent it 308 * from ever running again so that vfs_unmountall() and 309 * vfs_syncall() can acquire the vfs locks they need. 310 */ 311 sema_p(&fsflush_sema); 312 (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, 0); 313 314 vfs_unmountall(); 315 (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT); 316 vfs_syncall(); 317 318 /* 319 * Check for (and unregister) any DDI periodic handlers that 320 * still exist, as they most likely constitute resource leaks: 321 */ 322 ddi_periodic_fini(); 323 324 dump_ereports(); 325 dump_messages(); 326 327 invoke_cb = B_TRUE; 328 } 329 /* FALLTHROUGH */ 330 331 case A_REBOOT: 332 if ((mdep != NULL) && (*(char *)mdep == '/')) { 333 buf = i_convert_boot_device_name(mdep, NULL, &buflen); 334 mdboot(cmd, fcn, buf, invoke_cb); 335 } else 336 mdboot(cmd, fcn, mdep, invoke_cb); 337 /* no return expected */ 338 break; 339 340 case A_CONFIG: 341 switch (fcn) { 342 case AD_UPDATE_BOOT_CONFIG: 343 #ifndef __sparc 344 { 345 extern void fastboot_update_config(const char *); 346 347 fastboot_update_config(mdep); 348 } 349 #endif 350 351 break; 352 } 353 /* Let other threads enter the shutdown path now */ 354 mutex_enter(&ualock); 355 ua_shutdown_thread = NULL; 356 cv_signal(&uacond); 357 mutex_exit(&ualock); 358 break; 359 360 case A_REMOUNT: 361 (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT); 362 /* Let other threads enter the shutdown path now */ 363 mutex_enter(&ualock); 364 ua_shutdown_thread = NULL; 365 cv_signal(&uacond); 366 mutex_exit(&ualock); 367 break; 368 369 case A_FREEZE: 370 { 371 /* 372 * This is the entrypoint for all suspend/resume actions. 373 */ 374 extern int cpr(int, void *); 375 376 if (modload("misc", "cpr") == -1) 377 return (ENOTSUP); 378 /* Let the CPR module decide what to do with mdep */ 379 error = cpr(fcn, mdep); 380 break; 381 } 382 383 case A_FTRACE: 384 { 385 switch (fcn) { 386 case AD_FTRACE_START: 387 (void) FTRACE_START(); 388 break; 389 case AD_FTRACE_STOP: 390 (void) FTRACE_STOP(); 391 break; 392 default: 393 error = EINVAL; 394 } 395 break; 396 } 397 398 case A_DUMP: 399 { 400 if (fcn == AD_NOSYNC) { 401 in_sync = 1; 402 break; 403 } 404 405 panic_bootfcn = fcn; 406 panic_forced = 1; 407 408 if ((mdep != NULL) && (*(char *)mdep == '/')) { 409 panic_bootstr = i_convert_boot_device_name(mdep, 410 NULL, &buflen); 411 } else 412 panic_bootstr = mdep; 413 414 #ifndef __sparc 415 extern void fastboot_update_and_load(int, char *); 416 417 fastboot_update_and_load(fcn, mdep); 418 #endif 419 420 panic("forced crash dump initiated at user request"); 421 /*NOTREACHED*/ 422 } 423 424 case A_SDTTEST: 425 { 426 sdt_test_args(); 427 sdt_test_args_tail_call(1, 2, 3, 4, 5, 6, 7); 428 break; 429 } 430 431 default: 432 error = EINVAL; 433 } 434 435 return (error); 436 } 437 438 int 439 uadmin(int cmd, int fcn, uintptr_t mdep) 440 { 441 int error = 0, rv = 0; 442 size_t nbytes = 0; 443 cred_t *credp = CRED(); 444 char *bootargs = NULL; 445 int reset_status = 0; 446 447 if (cmd == A_SHUTDOWN && fcn == AD_FASTREBOOT_DRYRUN) { 448 ddi_walk_devs(ddi_root_node(), check_driver_quiesce, 449 &reset_status); 450 if (reset_status != 0) 451 return (EIO); 452 else 453 return (0); 454 } 455 456 /* 457 * The swapctl system call doesn't have its own entry point: it uses 458 * uadmin as a wrapper so we just call it directly from here. 459 */ 460 if (cmd == A_SWAPCTL) { 461 if (get_udatamodel() == DATAMODEL_NATIVE) 462 error = swapctl(fcn, (void *)mdep, &rv); 463 #if defined(_SYSCALL32_IMPL) 464 else 465 error = swapctl32(fcn, (void *)mdep, &rv); 466 #endif /* _SYSCALL32_IMPL */ 467 return (error ? set_errno(error) : rv); 468 } 469 470 /* 471 * Certain subcommands intepret a non-NULL mdep value as a pointer to 472 * a boot string. We pull that in as bootargs, if applicable. 473 */ 474 if (mdep != (uintptr_t)NULL && 475 (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP || 476 cmd == A_FREEZE || cmd == A_CONFIG)) { 477 bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP); 478 if ((error = copyinstr((const char *)mdep, bootargs, 479 BOOTARGS_MAX, &nbytes)) != 0) { 480 kmem_free(bootargs, BOOTARGS_MAX); 481 return (set_errno(error)); 482 } 483 } 484 485 /* 486 * Invoke the appropriate kadmin() routine. 487 */ 488 if (getzoneid() != GLOBAL_ZONEID) 489 error = zone_kadmin(cmd, fcn, bootargs, credp); 490 else 491 error = kadmin(cmd, fcn, bootargs, credp); 492 493 if (bootargs != NULL) 494 kmem_free(bootargs, BOOTARGS_MAX); 495 return (error ? set_errno(error) : 0); 496 } 497