/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
 */
#define	Bug_4154263

/*
 * A CPR derivative specifically for sunfire
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/ddi.h>
#define	SUNDDI_IMPL
#include <sys/sunddi.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <nfs/lm.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/callb.h>
#include <sys/clock.h>
#include <sys/x_call.h>
#include <sys/cpuvar.h>
#include <sys/epm.h>
#include <sys/vfs.h>
#include <sys/fhc.h>
#include <sys/sysctrl.h>
#include <sys/promif.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/cyclic.h>
#include <sys/sunndi.h>

static enum sysctrl_suspend_state {
        SYSC_STATE_BEGIN = 0,
        SYSC_STATE_USER,
        SYSC_STATE_DAEMON,
        SYSC_STATE_DRIVER,
        SYSC_STATE_FULL
} suspend_state;

static int pstate_save;
static uint_t sysctrl_gate[NCPU];
int sysctrl_quiesce_debug = FALSE;
static int sysctrl_skip_kernel_threads = TRUE;

/*
 * sysctrl_skip_user_threads is used to control whether user threads
 * should be suspended.  If sysctrl_skip_user_threads is true, the
 * remaining flags are not used; if it is false,
 * sysctrl_check_user_stop_result controls whether we need to check the
 * suspend result, and sysctrl_allow_blocked_threads controls whether
 * suspend may continue when there are blocked threads.  We allow all
 * combinations of sysctrl_check_user_stop_result and
 * sysctrl_allow_blocked_threads, even though disallowing blocked
 * threads makes little sense when we don't even check the stop result.
 */
static int sysctrl_skip_user_threads = 0;	/* default to FALSE */
static int sysctrl_check_user_stop_result = 1;	/* default to TRUE */
static int sysctrl_allow_blocked_threads = 1;	/* default to TRUE */

static int sysc_watchdog_suspended;

extern int sysctrl_enable_detach_suspend;
static int sysc_lastval;

#define	DEBUGP(p)	{ if (sysctrl_quiesce_debug) p; }
#define	errp		prom_printf

#define	SYSC_CPU_LOOP_MSEC	1000
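/*
 * Quiesce all other cpus: cross-call them into sysctrl_freeze()
 * (defined elsewhere), which is expected to check in through
 * sysctrl_gate[] and spin until this cpu's gate entry is set by
 * sysctrl_release_cpus().  Each cpu gets SYSC_CPU_LOOP_MSEC to check
 * in; a cpu that misses the deadline panics the system unless we are
 * already panicking.  On return, our own vector interrupts are
 * disabled as well.
 */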
static void
sysctrl_grab_cpus(void)
{
        int i;
        cpuset_t others;
        extern cpuset_t cpu_ready_set;
        extern void sysctrl_freeze(void);
        uint64_t sysc_tick_limit;
        uint64_t sysc_current_tick;
        uint64_t sysc_tick_deadline;

        extern u_longlong_t gettick(void);

        for (i = 0; i < NCPU; i++)
                sysctrl_gate[i] = 0;

        /* tell other cpus to go quiet and wait for continue signal */
        others = cpu_ready_set;
        CPUSET_DEL(others, CPU->cpu_id);
        xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate,
            (uint64_t)(&sysctrl_gate[CPU->cpu_id]));

        sysc_tick_limit =
            ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000;

        /* wait for each cpu to check in */
        for (i = 0; i < NCPU; i++) {
                if (!CPU_IN_SET(others, i))
                        continue;

                /*
                 * Get the current tick value and calculate the deadline tick
                 */
                sysc_current_tick = gettick();
                sysc_tick_deadline = sysc_current_tick + sysc_tick_limit;

                while (sysctrl_gate[i] == 0) {
                        /* if the system is panicking, just bail out */
                        if (panicstr)
                                break;

                        /* panic if the cpu has not responded by the deadline */
                        sysc_current_tick = gettick();
                        if (sysc_current_tick >= sysc_tick_deadline) {
                                cmn_err(CE_PANIC, "sysctrl: cpu %d not "
                                    "responding to quiesce command", i);
                        }
                }
        }

        /* now even our interrupts are disabled -- really quiet now */
        pstate_save = disable_vec_intr();
}

/*
 * Release the cpus held in sysctrl_grab_cpus() and restore our own
 * interrupt state.
 */
static void
sysctrl_release_cpus(void)
{
        /* let the other cpus go */
        sysctrl_gate[CPU->cpu_id] = 1;

        /* restore our interrupts too */
        enable_vec_intr(pstate_save);
}

/*
 * Suspend the cyclic subsystem.  cpu_lock remains held and preemption
 * remains disabled until sysctrl_enable_intr() undoes all of this.
 */
static void
sysctrl_stop_intr(void)
{
        mutex_enter(&cpu_lock);
        kpreempt_disable();
        cyclic_suspend();
}

static void
sysctrl_enable_intr(void)
{
        cyclic_resume();
        (void) spl0();
        kpreempt_enable();
        mutex_exit(&cpu_lock);
}
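/*
 * Decide whether a dip needs a suspend/resume cycle: nodes without a
 * bound driver are skipped, the pm flags can force the answer either
 * way, and otherwise any node with a "reg" property is treated as a
 * real device.
 */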
static int
sysctrl_is_real_device(dev_info_t *dip)
{
        struct regspec *regbuf;
        int length;
        int rc;

        if (ddi_get_driver(dip) == NULL)
                return (FALSE);

        if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
                return (TRUE);
        if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
                return (FALSE);

        /*
         * now the general case
         */
        rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
            (caddr_t)&regbuf, &length);
        ASSERT(rc != DDI_PROP_NO_MEMORY);
        if (rc != DDI_PROP_SUCCESS) {
                return (FALSE);
        } else {
                kmem_free(regbuf, length);
                return (TRUE);
        }
}

static dev_info_t *failed_driver;
static char device_path[MAXPATHLEN];

/*
 * Walk the device tree depth-first, suspending each real device with
 * DDI_SUSPEND.  On failure, the offending dip is held and recorded in
 * failed_driver so that sysctrl_resume_devices() knows where to stop.
 */
static int
sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt)
{
        ASSERT(dip == NULL || ddi_get_parent(dip) == NULL ||
            DEVI_BUSY_OWNED(ddi_get_parent(dip)));

        failed_driver = NULL;
        for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
                /*
                 * Hold parent busy while walking child list
                 */
                ndi_devi_enter(dip);
                if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) {
                        ndi_devi_exit(dip);
                        return (ENXIO);
                }
                ndi_devi_exit(dip);

                if (!sysctrl_is_real_device(dip))
                        continue;

                /*
                 * Safe to call ddi_pathname() as parent is held busy
                 */
                (void) ddi_pathname(dip, device_path);
                DEBUGP(errp(" suspending device %s\n", device_path));
                if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
                        DEBUGP(errp(" unable to suspend device %s\n",
                            device_path));

                        (void) strncpy(pkt->errbuf, device_path,
                            SYSC_OUTPUT_LEN);
                        SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND);
                        ndi_hold_devi(dip);
                        failed_driver = dip;
                        return (ENXIO);
                }
        }

        return (DDI_SUCCESS);
}

/*
 * Walk the device tree in reverse sibling order, resuming the devices
 * that were suspended and releasing the dip recorded by a failed
 * suspend.
 */
static void
sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt)
{
        dev_info_t *dip, *next, *last = NULL;

        ASSERT(start == NULL || ddi_get_parent(start) == NULL ||
            DEVI_BUSY_OWNED(ddi_get_parent(start)));

        /* attach in reverse device tree order */
        while (last != start) {
                dip = start;
                next = ddi_get_next_sibling(dip);
                while (next != last && dip != failed_driver) {
                        dip = next;
                        next = ddi_get_next_sibling(dip);
                }
                if (dip == failed_driver) {
                        failed_driver = NULL;
                        ndi_rele_devi(dip);
                } else if (sysctrl_is_real_device(dip) &&
                    failed_driver == NULL) {
                        /*
                         * Parent dip is held busy, so ddi_pathname() can
                         * be safely called.
                         */
                        (void) ddi_pathname(dip, device_path);
                        DEBUGP(errp(" resuming device %s\n", device_path));
                        if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) {
                                /*
                                 * XXX - if in the future we decide not to
                                 * panic the system, we need to set the error
                                 * SYSC_ERR_RESUME here and also change the
                                 * cfgadm platform library.
                                 */
                                cmn_err(CE_PANIC, "Unable to resume device %s",
                                    device_path);
                        }
                }
                ndi_devi_enter(dip);
                sysctrl_resume_devices(ddi_get_child(dip), pkt);
                ndi_devi_exit(dip);

                last = dip;
        }
}

/*
 * True if thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from the DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 */
#define	SYSCTRL_VSTOPPED(t)			\
	((t)->t_state == TS_SLEEP &&		\
	(t)->t_wchan != NULL &&			\
	(t)->t_astflag &&			\
	((t)->t_proc_flag & TP_CHKPT))
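/*
 * Stop all user threads.  Each pass flags the threads with TP_CHKPT,
 * posts an AST so they will notice, and wakes or pokes them as needed;
 * we then wait for them to check in.  A few passes, with growing
 * timeouts, are made to get past fork and friends before we give up
 * and identify the holdout in the error buffer.
 */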
static int
sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
{
        int count;
        char cache_psargs[PSARGSZ];
        kthread_id_t cache_tp;
        uint_t cache_t_state;
        int bailout;
        pid_t pid;

        extern void add_one_utstop();
        extern void utstop_timedwait(clock_t);
        extern void utstop_init(void);

#define	SYSCTRL_UTSTOP_RETRY	4
#define	SYSCTRL_UTSTOP_WAIT	hz

        if (sysctrl_skip_user_threads)
                return (DDI_SUCCESS);

        utstop_init();

        /* we need to try a few times to get past fork, etc. */
        for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) {
                kthread_id_t tp;

                /* walk the entire threadlist */
                mutex_enter(&pidlock);
                for (tp = curthread->t_next; tp != curthread;
                    tp = tp->t_next) {
                        proc_t *p = ttoproc(tp);

                        /* handle kernel threads separately */
                        if (p->p_as == &kas || p->p_stat == SZOMB)
                                continue;

                        mutex_enter(&p->p_lock);
                        thread_lock(tp);

                        if (tp->t_state == TS_STOPPED) {
                                /* add another reason to stop this thread */
                                tp->t_schedflag &= ~TS_RESUME;
                        } else {
                                tp->t_proc_flag |= TP_CHKPT;

                                thread_unlock(tp);
                                mutex_exit(&p->p_lock);
                                add_one_utstop();
                                mutex_enter(&p->p_lock);
                                thread_lock(tp);

                                aston(tp);

                                if (ISWAKEABLE(tp) || ISWAITING(tp)) {
                                        setrun_locked(tp);
                                }
                        }

                        /* grab thread if needed */
                        if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
                                poke_cpu(tp->t_cpu->cpu_id);

                        thread_unlock(tp);
                        mutex_exit(&p->p_lock);
                }
                mutex_exit(&pidlock);

                /* let everything catch up */
                utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT);

                /* now, walk the threadlist again to see if we are done */
                mutex_enter(&pidlock);
                for (tp = curthread->t_next, bailout = 0;
                    bailout == 0 && tp != curthread; tp = tp->t_next) {
                        proc_t *p = ttoproc(tp);

                        /* handle kernel threads separately */
                        if (p->p_as == &kas || p->p_stat == SZOMB)
                                continue;

                        /*
                         * If this thread didn't stop, and we don't allow
                         * unstopped blocked threads, bail.
                         */
                        thread_lock(tp);
                        if (!CPR_ISTOPPED(tp) &&
                            !(sysctrl_allow_blocked_threads &&
                            SYSCTRL_VSTOPPED(tp))) {
                                /* nope, cache the details for later */
                                bcopy(p->p_user.u_psargs, cache_psargs,
                                    sizeof (cache_psargs));
                                cache_tp = tp;
                                cache_t_state = tp->t_state;
                                bailout = 1;
                                pid = p->p_pidp->pid_id;
                        }
                        thread_unlock(tp);
                }
                mutex_exit(&pidlock);

                /* were all the threads stopped? */
                if (!bailout)
                        break;
        }

        /* were we unable to stop all threads after a few tries? */
        if (bailout) {
                (void) sprintf(pkt->errbuf, "process: %s id: %d state: %x"
                    " thread descriptor: %p", cache_psargs, (int)pid,
                    cache_t_state, (void *)cache_tp);

                SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD);

                return (ESRCH);
        }

        return (DDI_SUCCESS);
}
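/*
 * Ask the CPR daemon callbacks to checkpoint their kernel threads,
 * then verify that every kernel thread other than interrupt threads
 * is stopped in a callback.  On failure the offender's name is copied
 * into the error buffer and EBUSY is returned.
 */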
static int
sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt)
{
        caddr_t name;
        kthread_id_t tp;

        if (sysctrl_skip_kernel_threads) {
                return (DDI_SUCCESS);
        }

        /*
         * Note: we unlock the table in resume.
         * We only need to lock the callback table if we are actually
         * suspending kernel threads.
         */
        callb_lock_table();
        if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
            CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {

                (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
                SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
                return (EBUSY);
        }

        /*
         * Verify that all threads are accounted for
         */
        mutex_enter(&pidlock);
        for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
                proc_t *p = ttoproc(tp);

                if (p->p_as != &kas)
                        continue;

                if (tp->t_flag & T_INTR_THREAD)
                        continue;

                if (!callb_is_stopped(tp, &name)) {
                        mutex_exit(&pidlock);
                        (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
                        SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
                        return (EBUSY);
                }
        }

        mutex_exit(&pidlock);
        return (DDI_SUCCESS);
}

/*
 * Release the user threads stopped by sysctrl_stop_user_threads():
 * clear TP_CHKPT and put any CPR-stopped threads back on a run queue.
 */
static void
sysctrl_start_user_threads(void)
{
        kthread_id_t tp;

        mutex_enter(&pidlock);

        /* walk all threads and release them */
        for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
                proc_t *p = ttoproc(tp);

                /* skip kernel threads */
                if (p->p_as == &kas)
                        continue;

                mutex_enter(&p->p_lock);
                tp->t_proc_flag &= ~TP_CHKPT;
                mutex_exit(&p->p_lock);

                thread_lock(tp);
                if (CPR_ISTOPPED(tp)) {
                        /* back on the runq */
                        tp->t_schedflag |= TS_RESUME;
                        setrun_locked(tp);
                }
                thread_unlock(tp);
        }

        mutex_exit(&pidlock);
}

/*
 * Post the given signal to every user process except init and
 * ourselves, then pause briefly to let the signals be processed.
 */
static void
sysctrl_signal_user(int sig)
{
        struct proc *p;

        mutex_enter(&pidlock);

        for (p = practive; p != NULL; p = p->p_next) {
                /* only user threads */
                if (p->p_exec == NULL || p->p_stat == SZOMB ||
                    p == proc_init || p == ttoproc(curthread))
                        continue;

                mutex_enter(&p->p_lock);
                sigtoproc(p, NULL, sig);
                mutex_exit(&p->p_lock);
        }

        mutex_exit(&pidlock);

        /* add a bit of delay */
        delay(hz);
}
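/*
 * Undo a suspend, starting from whatever point sysctrl_suspend()
 * reached.  suspend_state records how far the suspend got; each case
 * below falls through to the next so that exactly the completed
 * suspend steps are unwound, in reverse order.
 */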
void
sysctrl_resume(sysc_cfga_pkt_t *pkt)
{
#ifndef Bug_4154263
        DEBUGP(errp("resume system...\n"));
#endif
        switch (suspend_state) {
        case SYSC_STATE_FULL:
                /*
                 * release all the other cpus
                 */
#ifndef Bug_4154263
                DEBUGP(errp("release cpus..."));
#endif
                /*
                 * Prevent a false alarm in tod_validate() due to the tod
                 * value change between suspend and resume
                 */
                mutex_enter(&tod_lock);
                tod_status_set(TOD_DR_RESUME_DONE);
                mutex_exit(&tod_lock);

                sysctrl_release_cpus();
                DEBUGP(errp("cpus resumed...\n"));

                /*
                 * If we suspended the hw watchdog at suspend,
                 * re-enable it now.
                 */
                if (sysc_watchdog_suspended) {
                        mutex_enter(&tod_lock);
                        (void) tod_ops.tod_set_watchdog_timer(
                            watchdog_timeout_seconds);
                        mutex_exit(&tod_lock);
                }

                /*
                 * resume rpc services and the callout table
                 */
                (void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME);
                (void) callb_execute_class(CB_CL_CPR_CALLOUT,
                    CB_CODE_CPR_RESUME);
                sysctrl_enable_intr();
                /* FALLTHROUGH */

        case SYSC_STATE_DRIVER:
                /*
                 * resume drivers
                 */
                DEBUGP(errp("resume drivers..."));
                sysctrl_resume_devices(ddi_root_node(), pkt);
                DEBUGP(errp("done\n"));

                /*
                 * resume the lock manager
                 */
                lm_cprresume();

                /* FALLTHROUGH */

        case SYSC_STATE_DAEMON:
                /*
                 * resume kernel daemons
                 */
                if (!sysctrl_skip_kernel_threads) {
                        DEBUGP(errp("starting kernel daemons..."));
                        (void) callb_execute_class(CB_CL_CPR_DAEMON,
                            CB_CODE_CPR_RESUME);
                        callb_unlock_table();
                }
                DEBUGP(errp("done\n"));

                /* FALLTHROUGH */

        case SYSC_STATE_USER:
                /*
                 * finally, resume user threads
                 */
                if (!sysctrl_skip_user_threads) {
                        DEBUGP(errp("starting user threads..."));
                        sysctrl_start_user_threads();
                        DEBUGP(errp("done\n"));
                }
                /* FALLTHROUGH */

        case SYSC_STATE_BEGIN:
        default:
                /*
                 * let those who care know that we've just resumed
                 */
                DEBUGP(errp("sending SIGTHAW..."));
                sysctrl_signal_user(SIGTHAW);
                DEBUGP(errp("done\n"));
                break;
        }

        /* Restore sysctrl detach/suspend to its original value */
        sysctrl_enable_detach_suspend = sysc_lastval;

        DEBUGP(errp("system state restored\n"));
}

void
sysctrl_suspend_prepare(void)
{
        /*
         * We use a function, lm_cprsuspend(), in the suspend flow that
         * is redirected to a module through the modstubs mechanism.
         * If the module is currently not loaded, modstubs attempts
         * the modload.  The context this happens in below causes the
         * module load to block forever, so this function must be called
         * in the normal system call context ahead of time.
         */
        (void) modload("misc", "klmmod");
}
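/*
 * Quiesce the system for dynamic reconfiguration: stop user threads,
 * stop kernel daemons, sync and suspend the filesystems and lock
 * manager, suspend drivers, checkpoint the callout table, and finally
 * grab all cpus.  Any failure along the way unwinds the completed
 * steps via sysctrl_resume().
 */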
int
sysctrl_suspend(sysc_cfga_pkt_t *pkt)
{
        int rc = DDI_SUCCESS;

        /* enable the sysctrl detach/suspend function */
        sysc_lastval = sysctrl_enable_detach_suspend;
        sysctrl_enable_detach_suspend = 1;

        /*
         * first, stop all user threads
         */
        DEBUGP(errp("\nstopping user threads..."));
        suspend_state = SYSC_STATE_USER;
        if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) &&
            sysctrl_check_user_stop_result) {
                sysctrl_resume(pkt);
                return (rc);
        }
        DEBUGP(errp("done\n"));

        /*
         * now stop daemon activities
         */
        DEBUGP(errp("stopping kernel daemons..."));
        suspend_state = SYSC_STATE_DAEMON;
        if ((rc = sysctrl_stop_kernel_threads(pkt)) != DDI_SUCCESS) {
                sysctrl_resume(pkt);
                return (rc);
        }
        DEBUGP(errp("done\n"));

        /*
         * This sync swaps out all user pages
         */
        vfs_sync(SYNC_ALL);

        /*
         * special treatment for lock manager
         */
        lm_cprsuspend();

        /*
         * sync the file system in case we never make it back
         */
        sync();

        /*
         * now suspend drivers
         */
        DEBUGP(errp("suspending drivers..."));
        suspend_state = SYSC_STATE_DRIVER;
        if ((rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) !=
            DDI_SUCCESS) {
                sysctrl_resume(pkt);
                return (rc);
        }
        DEBUGP(errp("done\n"));

        /*
         * handle the callout table
         */
        sysctrl_stop_intr();

        (void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT);

        /*
         * if the watchdog was activated, disable it
         */
        if (watchdog_activated) {
                mutex_enter(&tod_lock);
                (void) tod_ops.tod_clear_watchdog_timer();
                mutex_exit(&tod_lock);
                sysc_watchdog_suspended = 1;
        } else {
                sysc_watchdog_suspended = 0;
        }

        /*
         * finally, grab all cpus
         */
        DEBUGP(errp("freezing all cpus...\n"));
        suspend_state = SYSC_STATE_FULL;
        sysctrl_grab_cpus();
#ifndef Bug_4154263
        DEBUGP(errp("done\n"));

        DEBUGP(errp("system is quiesced\n"));
#endif

        return (rc);
}