1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This workaround inhibits prom_printf after the cpus are grabbed. 31 * This can be removed when 4154263 is corrected. 32 */ 33 #define Bug_4154263 34 35 /* 36 * A CPR derivative specifically for sunfire 37 */ 38 39 #include <sys/types.h> 40 #include <sys/systm.h> 41 #include <sys/machparam.h> 42 #include <sys/machsystm.h> 43 #include <sys/ddi.h> 44 #define SUNDDI_IMPL 45 #include <sys/sunddi.h> 46 #include <sys/time.h> 47 #include <sys/kmem.h> 48 #include <nfs/lm.h> 49 #include <sys/ddi_impldefs.h> 50 #include <sys/obpdefs.h> 51 #include <sys/cmn_err.h> 52 #include <sys/debug.h> 53 #include <sys/errno.h> 54 #include <sys/callb.h> 55 #include <sys/clock.h> 56 #include <sys/x_call.h> 57 #include <sys/cpuvar.h> 58 #include <sys/epm.h> 59 #include <sys/vfs.h> 60 #include <sys/fhc.h> 61 #include <sys/sysctrl.h> 62 #include <sys/promif.h> 63 #include <sys/conf.h> 64 #include <sys/modctl.h> 65 #include <sys/cyclic.h> 66 #include <sys/sunndi.h> 67 #include <sys/machsystm.h> 68 69 static enum sysctrl_suspend_state { 70 SYSC_STATE_BEGIN = 0, 71 SYSC_STATE_USER, 72 SYSC_STATE_DAEMON, 73 SYSC_STATE_DRIVER, 74 SYSC_STATE_FULL } suspend_state; 75 76 static int pstate_save; 77 static uint_t sysctrl_gate[NCPU]; 78 int sysctrl_quiesce_debug = FALSE; 79 static int sysctrl_skip_kernel_threads = TRUE; 80 81 /* 82 * sysctrl_skip_user_threads is used to control if user threads should 83 * be suspended. If sysctrl_skip_user_threads is true, the rest of the 84 * flags are not used; if it is false, sysctrl_check_user_stop_result 85 * will be used to control whether or not we need to check suspend 86 * result, and sysctrl_allow_blocked_threads will be used to control 87 * whether or not we allow suspend to continue if there are blocked 88 * threads. We allow all combinations of sysctrl_check_user_stop_result 89 * and sysctrl_allow_block_threads, even though it might not make much 90 * sense to not allow block threads when we don't even check stop 91 * result. 92 */ 93 static int sysctrl_skip_user_threads = 0; /* default to FALSE */ 94 static int sysctrl_check_user_stop_result = 1; /* default to TRUE */ 95 static int sysctrl_allow_blocked_threads = 1; /* default to TRUE */ 96 97 static int sysc_watchdog_suspended; 98 99 extern int sysctrl_enable_detach_suspend; 100 static int sysc_lastval; 101 102 #define DEBUGP(p) { if (sysctrl_quiesce_debug) p; } 103 #define errp prom_printf 104 105 #define SYSC_CPU_LOOP_MSEC 1000 106 107 static void 108 sysctrl_grab_cpus(void) 109 { 110 int i; 111 cpuset_t others; 112 extern cpuset_t cpu_ready_set; 113 extern void sysctrl_freeze(void); 114 uint64_t sysc_tick_limit; 115 uint64_t sysc_current_tick; 116 uint64_t sysc_tick_deadline; 117 118 extern u_longlong_t gettick(void); 119 120 for (i = 0; i < NCPU; i++) 121 sysctrl_gate[i] = 0; 122 123 /* tell other cpus to go quiet and wait for continue signal */ 124 others = cpu_ready_set; 125 CPUSET_DEL(others, CPU->cpu_id); 126 xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate, 127 (uint64_t)(&sysctrl_gate[CPU->cpu_id])); 128 129 sysc_tick_limit = 130 ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000; 131 132 /* wait for each cpu to check in */ 133 for (i = 0; i < NCPU; i++) { 134 if (!CPU_IN_SET(others, i)) 135 continue; 136 137 /* 138 * Get current tick value and calculate the deadline tick 139 */ 140 sysc_current_tick = gettick(); 141 sysc_tick_deadline = sysc_current_tick + sysc_tick_limit; 142 143 while (sysctrl_gate[i] == 0) { 144 /* If in panic, we just return */ 145 if (panicstr) 146 break; 147 148 /* Panic the system if cpu not responsed by deadline */ 149 sysc_current_tick = gettick(); 150 if (sysc_current_tick >= sysc_tick_deadline) { 151 cmn_err(CE_PANIC, "sysctrl: cpu %d not " 152 "responding to quiesce command", i); 153 } 154 } 155 } 156 157 /* now even our interrupts are disabled -- really quiet now */ 158 pstate_save = disable_vec_intr(); 159 } 160 161 static void 162 sysctrl_release_cpus(void) 163 { 164 /* let the other cpus go */ 165 sysctrl_gate[CPU->cpu_id] = 1; 166 167 /* restore our interrupts too */ 168 enable_vec_intr(pstate_save); 169 } 170 171 static void 172 sysctrl_stop_intr(void) 173 { 174 mutex_enter(&cpu_lock); 175 kpreempt_disable(); 176 cyclic_suspend(); 177 } 178 179 static void 180 sysctrl_enable_intr(void) 181 { 182 cyclic_resume(); 183 (void) spl0(); 184 kpreempt_enable(); 185 mutex_exit(&cpu_lock); 186 } 187 188 static int 189 sysctrl_is_real_device(dev_info_t *dip) 190 { 191 struct regspec *regbuf; 192 int length; 193 int rc; 194 195 if (ddi_get_driver(dip) == NULL) 196 return (FALSE); 197 198 if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR)) 199 return (TRUE); 200 if (DEVI(dip)->devi_pm_flags & PMC_NO_SR) 201 return (FALSE); 202 203 /* 204 * now the general case 205 */ 206 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", 207 (caddr_t)®buf, &length); 208 ASSERT(rc != DDI_PROP_NO_MEMORY); 209 if (rc != DDI_PROP_SUCCESS) { 210 return (FALSE); 211 } else { 212 kmem_free(regbuf, length); 213 return (TRUE); 214 } 215 } 216 217 static dev_info_t *failed_driver; 218 static char device_path[MAXPATHLEN]; 219 220 static int 221 sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt) 222 { 223 int circ; 224 225 ASSERT(dip == NULL || ddi_get_parent(dip) == NULL || 226 DEVI_BUSY_OWNED(ddi_get_parent(dip))); 227 228 failed_driver = NULL; 229 for (; dip != NULL; dip = ddi_get_next_sibling(dip)) { 230 /* 231 * Hold parent busy while walking child list 232 */ 233 ndi_devi_enter(dip, &circ); 234 if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) { 235 ndi_devi_exit(dip, circ); 236 return (ENXIO); 237 } 238 ndi_devi_exit(dip, circ); 239 240 if (!sysctrl_is_real_device(dip)) 241 continue; 242 243 /* 244 * Safe to call ddi_pathname() as parent is held busy 245 */ 246 (void) ddi_pathname(dip, device_path); 247 DEBUGP(errp(" suspending device %s\n", device_path)); 248 if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) { 249 DEBUGP(errp(" unable to suspend device %s\n", 250 device_path)); 251 252 (void) strncpy(pkt->errbuf, device_path, 253 SYSC_OUTPUT_LEN); 254 SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND); 255 ndi_hold_devi(dip); 256 failed_driver = dip; 257 return (ENXIO); 258 } 259 } 260 261 return (DDI_SUCCESS); 262 } 263 264 static void 265 sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt) 266 { 267 int circ; 268 dev_info_t *dip, *next, *last = NULL; 269 270 ASSERT(start == NULL || ddi_get_parent(start) == NULL || 271 DEVI_BUSY_OWNED(ddi_get_parent(start))); 272 273 /* attach in reverse device tree order */ 274 while (last != start) { 275 dip = start; 276 next = ddi_get_next_sibling(dip); 277 while (next != last && dip != failed_driver) { 278 dip = next; 279 next = ddi_get_next_sibling(dip); 280 } 281 if (dip == failed_driver) { 282 failed_driver = NULL; 283 ndi_rele_devi(dip); 284 } else if (sysctrl_is_real_device(dip) && 285 failed_driver == NULL) { 286 /* 287 * Parent dip is held busy, so ddi_pathname() can 288 * be safely called. 289 */ 290 (void) ddi_pathname(dip, device_path); 291 DEBUGP(errp(" resuming device %s\n", device_path)); 292 if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) { 293 /* 294 * XXX - if in the future we decide not to 295 * panic the system, we need to set the error 296 * SYSC_ERR_RESUME here and also change the 297 * cfgadm platform library. 298 */ 299 cmn_err(CE_PANIC, "Unable to resume device %s", 300 device_path); 301 } 302 } 303 ndi_devi_enter(dip, &circ); 304 sysctrl_resume_devices(ddi_get_child(dip), pkt); 305 ndi_devi_exit(dip, circ); 306 307 last = dip; 308 } 309 } 310 311 /* 312 * True if thread is virtually stopped. Similar to CPR_VSTOPPED 313 * but from DR point of view. These user threads are waiting in 314 * the kernel. Once they complete in the kernel, they will process 315 * the stop signal and stop. 316 */ 317 #define SYSCTRL_VSTOPPED(t) \ 318 ((t)->t_state == TS_SLEEP && \ 319 (t)->t_wchan != NULL && \ 320 (t)->t_astflag && \ 321 ((t)->t_proc_flag & TP_CHKPT)) 322 323 static int 324 sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt) 325 { 326 int count; 327 char cache_psargs[PSARGSZ]; 328 kthread_id_t cache_tp; 329 uint_t cache_t_state; 330 int bailout; 331 pid_t pid; 332 333 extern void add_one_utstop(); 334 extern void utstop_timedwait(clock_t); 335 extern void utstop_init(void); 336 337 #define SYSCTRL_UTSTOP_RETRY 4 338 #define SYSCTRL_UTSTOP_WAIT hz 339 340 if (sysctrl_skip_user_threads) 341 return (DDI_SUCCESS); 342 343 utstop_init(); 344 345 /* we need to try a few times to get past fork, etc. */ 346 for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) { 347 kthread_id_t tp; 348 349 /* walk the entire threadlist */ 350 mutex_enter(&pidlock); 351 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 352 proc_t *p = ttoproc(tp); 353 354 /* handle kernel threads separately */ 355 if (p->p_as == &kas || p->p_stat == SZOMB) 356 continue; 357 358 mutex_enter(&p->p_lock); 359 thread_lock(tp); 360 361 if (tp->t_state == TS_STOPPED) { 362 /* add another reason to stop this thread */ 363 tp->t_schedflag &= ~TS_RESUME; 364 } else { 365 tp->t_proc_flag |= TP_CHKPT; 366 367 thread_unlock(tp); 368 mutex_exit(&p->p_lock); 369 add_one_utstop(); 370 mutex_enter(&p->p_lock); 371 thread_lock(tp); 372 373 aston(tp); 374 375 if (tp->t_state == TS_SLEEP && 376 (tp->t_flag & T_WAKEABLE)) { 377 setrun_locked(tp); 378 } 379 380 } 381 382 /* grab thread if needed */ 383 if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU) 384 poke_cpu(tp->t_cpu->cpu_id); 385 386 387 thread_unlock(tp); 388 mutex_exit(&p->p_lock); 389 } 390 mutex_exit(&pidlock); 391 392 393 /* let everything catch up */ 394 utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT); 395 396 397 /* now, walk the threadlist again to see if we are done */ 398 mutex_enter(&pidlock); 399 for (tp = curthread->t_next, bailout = 0; 400 bailout == 0 && tp != curthread; tp = tp->t_next) { 401 proc_t *p = ttoproc(tp); 402 403 /* handle kernel threads separately */ 404 if (p->p_as == &kas || p->p_stat == SZOMB) 405 continue; 406 407 /* 408 * If this thread didn't stop, and we don't allow 409 * unstopped blocked threads, bail. 410 */ 411 /* did this thread stop? */ 412 thread_lock(tp); 413 if (!CPR_ISTOPPED(tp) && 414 !(sysctrl_allow_blocked_threads && 415 SYSCTRL_VSTOPPED(tp))) { 416 417 /* nope, cache the details for later */ 418 bcopy(p->p_user.u_psargs, cache_psargs, 419 sizeof (cache_psargs)); 420 cache_tp = tp; 421 cache_t_state = tp->t_state; 422 bailout = 1; 423 pid = p->p_pidp->pid_id; 424 } 425 thread_unlock(tp); 426 } 427 mutex_exit(&pidlock); 428 429 /* were all the threads stopped? */ 430 if (!bailout) 431 break; 432 } 433 434 /* were we unable to stop all threads after a few tries? */ 435 if (bailout) { 436 (void) sprintf(pkt->errbuf, "process: %s id: %d state: %x" 437 " thread descriptor: %p", 438 cache_psargs, (int)pid, cache_t_state, 439 (void *)cache_tp); 440 441 SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD); 442 443 return (ESRCH); 444 } 445 446 return (DDI_SUCCESS); 447 } 448 449 static int 450 sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt) 451 { 452 caddr_t name; 453 kthread_id_t tp; 454 455 if (sysctrl_skip_kernel_threads) { 456 return (DDI_SUCCESS); 457 } 458 459 /* 460 * Note: we unlock the table in resume. 461 * We only need to lock the callback table if we are actually 462 * suspending kernel threads. 463 */ 464 callb_lock_table(); 465 if ((name = callb_execute_class(CB_CL_CPR_DAEMON, 466 CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) { 467 468 (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); 469 SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); 470 return (EBUSY); 471 } 472 473 /* 474 * Verify that all threads are accounted for 475 */ 476 mutex_enter(&pidlock); 477 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 478 proc_t *p = ttoproc(tp); 479 480 if (p->p_as != &kas) 481 continue; 482 483 if (tp->t_flag & T_INTR_THREAD) 484 continue; 485 486 if (!callb_is_stopped(tp, &name)) { 487 mutex_exit(&pidlock); 488 (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); 489 SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); 490 return (EBUSY); 491 } 492 } 493 494 mutex_exit(&pidlock); 495 return (DDI_SUCCESS); 496 } 497 498 static void 499 sysctrl_start_user_threads(void) 500 { 501 kthread_id_t tp; 502 503 mutex_enter(&pidlock); 504 505 /* walk all threads and release them */ 506 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 507 proc_t *p = ttoproc(tp); 508 509 /* skip kernel threads */ 510 if (ttoproc(tp)->p_as == &kas) 511 continue; 512 513 mutex_enter(&p->p_lock); 514 tp->t_proc_flag &= ~TP_CHKPT; 515 mutex_exit(&p->p_lock); 516 517 thread_lock(tp); 518 if (CPR_ISTOPPED(tp)) { 519 /* back on the runq */ 520 tp->t_schedflag |= TS_RESUME; 521 setrun_locked(tp); 522 } 523 thread_unlock(tp); 524 } 525 526 mutex_exit(&pidlock); 527 } 528 529 static void 530 sysctrl_signal_user(int sig) 531 { 532 struct proc *p; 533 534 mutex_enter(&pidlock); 535 536 for (p = practive; p != NULL; p = p->p_next) { 537 /* only user threads */ 538 if (p->p_exec == NULL || p->p_stat == SZOMB || 539 p == proc_init || p == ttoproc(curthread)) 540 continue; 541 542 mutex_enter(&p->p_lock); 543 sigtoproc(p, NULL, sig); 544 mutex_exit(&p->p_lock); 545 } 546 547 mutex_exit(&pidlock); 548 549 /* add a bit of delay */ 550 delay(hz); 551 } 552 553 void 554 sysctrl_resume(sysc_cfga_pkt_t *pkt) 555 { 556 #ifndef Bug_4154263 557 DEBUGP(errp("resume system...\n")); 558 #endif 559 switch (suspend_state) { 560 case SYSC_STATE_FULL: 561 /* 562 * release all the other cpus 563 */ 564 #ifndef Bug_4154263 565 DEBUGP(errp("release cpus...")); 566 #endif 567 /* 568 * Prevent false alarm in tod_validate() due to tod 569 * value change between suspend and resume 570 */ 571 mutex_enter(&tod_lock); 572 tod_fault_reset(); 573 mutex_exit(&tod_lock); 574 575 sysctrl_release_cpus(); 576 DEBUGP(errp("cpus resumed...\n")); 577 578 /* 579 * If we suspended hw watchdog at suspend, 580 * re-enable it now. 581 */ 582 if (sysc_watchdog_suspended) { 583 mutex_enter(&tod_lock); 584 tod_ops.tod_set_watchdog_timer( 585 watchdog_timeout_seconds); 586 mutex_exit(&tod_lock); 587 } 588 589 /* 590 * resume callout 591 */ 592 (void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME); 593 (void) callb_execute_class(CB_CL_CPR_CALLOUT, 594 CB_CODE_CPR_RESUME); 595 sysctrl_enable_intr(); 596 /* FALLTHROUGH */ 597 598 case SYSC_STATE_DRIVER: 599 /* 600 * resume drivers 601 */ 602 DEBUGP(errp("resume drivers...")); 603 sysctrl_resume_devices(ddi_root_node(), pkt); 604 DEBUGP(errp("done\n")); 605 606 /* 607 * resume the lock manager 608 */ 609 lm_cprresume(); 610 611 /* FALLTHROUGH */ 612 613 case SYSC_STATE_DAEMON: 614 /* 615 * resume kernel daemons 616 */ 617 if (!sysctrl_skip_kernel_threads) { 618 DEBUGP(errp("starting kernel daemons...")); 619 (void) callb_execute_class(CB_CL_CPR_DAEMON, 620 CB_CODE_CPR_RESUME); 621 callb_unlock_table(); 622 } 623 DEBUGP(errp("done\n")); 624 625 /* FALLTHROUGH */ 626 627 case SYSC_STATE_USER: 628 /* 629 * finally, resume user threads 630 */ 631 if (!sysctrl_skip_user_threads) { 632 DEBUGP(errp("starting user threads...")); 633 sysctrl_start_user_threads(); 634 DEBUGP(errp("done\n")); 635 } 636 /* FALLTHROUGH */ 637 638 case SYSC_STATE_BEGIN: 639 default: 640 /* 641 * let those who care know that we've just resumed 642 */ 643 DEBUGP(errp("sending SIGTHAW...")); 644 sysctrl_signal_user(SIGTHAW); 645 DEBUGP(errp("done\n")); 646 break; 647 } 648 649 /* Restore sysctrl detach/suspend to its original value */ 650 sysctrl_enable_detach_suspend = sysc_lastval; 651 652 DEBUGP(errp("system state restored\n")); 653 } 654 655 void 656 sysctrl_suspend_prepare(void) 657 { 658 /* 659 * We use a function, lm_cprsuspend(), in the suspend flow that 660 * is redirected to a module through the modstubs mechanism. 661 * If the module is currently not loaded, modstubs attempts 662 * the modload. The context this happens in below causes the 663 * module load to block forever, so this function must be called 664 * in the normal system call context ahead of time. 665 */ 666 (void) modload("misc", "klmmod"); 667 } 668 669 int 670 sysctrl_suspend(sysc_cfga_pkt_t *pkt) 671 { 672 int rc = DDI_SUCCESS; 673 674 /* enable sysctrl detach/suspend function */ 675 sysc_lastval = sysctrl_enable_detach_suspend; 676 sysctrl_enable_detach_suspend = 1; 677 678 /* 679 * first, stop all user threads 680 */ 681 DEBUGP(errp("\nstopping user threads...")); 682 suspend_state = SYSC_STATE_USER; 683 if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) && 684 sysctrl_check_user_stop_result) { 685 sysctrl_resume(pkt); 686 return (rc); 687 } 688 DEBUGP(errp("done\n")); 689 690 /* 691 * now stop daemon activities 692 */ 693 DEBUGP(errp("stopping kernel daemons...")); 694 suspend_state = SYSC_STATE_DAEMON; 695 if (rc = sysctrl_stop_kernel_threads(pkt)) { 696 sysctrl_resume(pkt); 697 return (rc); 698 } 699 DEBUGP(errp("done\n")); 700 701 /* 702 * This sync swap out all user pages 703 */ 704 vfs_sync(SYNC_ALL); 705 706 /* 707 * special treatment for lock manager 708 */ 709 lm_cprsuspend(); 710 711 /* 712 * sync the file system in case we never make it back 713 */ 714 sync(); 715 716 /* 717 * now suspend drivers 718 */ 719 DEBUGP(errp("suspending drivers...")); 720 suspend_state = SYSC_STATE_DRIVER; 721 if (rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) { 722 sysctrl_resume(pkt); 723 return (rc); 724 } 725 DEBUGP(errp("done\n")); 726 727 /* 728 * handle the callout table 729 */ 730 sysctrl_stop_intr(); 731 732 (void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT); 733 734 /* 735 * if watchdog was activated, disable it 736 */ 737 if (watchdog_activated) { 738 mutex_enter(&tod_lock); 739 tod_ops.tod_clear_watchdog_timer(); 740 mutex_exit(&tod_lock); 741 sysc_watchdog_suspended = 1; 742 } else { 743 sysc_watchdog_suspended = 0; 744 } 745 746 /* 747 * finally, grab all cpus 748 */ 749 DEBUGP(errp("freezing all cpus...\n")); 750 suspend_state = SYSC_STATE_FULL; 751 sysctrl_grab_cpus(); 752 #ifndef Bug_4154263 753 DEBUGP(errp("done\n")); 754 755 DEBUGP(errp("system is quiesced\n")); 756 #endif 757 758 return (rc); 759 } 760