/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
 */
#define	Bug_4154263

/*
 * A CPR derivative specifically for sunfire
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/ddi.h>
#define	SUNDDI_IMPL
#include <sys/sunddi.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <nfs/lm.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/callb.h>
#include <sys/clock.h>
#include <sys/x_call.h>
#include <sys/cpuvar.h>
#include <sys/epm.h>
#include <sys/vfs.h>
#include <sys/fhc.h>
#include <sys/sysctrl.h>
#include <sys/promif.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/cyclic.h>
#include <sys/sunndi.h>

static enum sysctrl_suspend_state {
	SYSC_STATE_BEGIN = 0,
	SYSC_STATE_USER,
	SYSC_STATE_DAEMON,
	SYSC_STATE_DRIVER,
	SYSC_STATE_FULL } suspend_state;

static int pstate_save;
static uint_t sysctrl_gate[NCPU];
int sysctrl_quiesce_debug = FALSE;
static int sysctrl_skip_kernel_threads = TRUE;

/*
 * sysctrl_skip_user_threads is used to control if user threads should
 * be suspended.  If sysctrl_skip_user_threads is true, the rest of the
 * flags are not used; if it is false, sysctrl_check_user_stop_result
 * will be used to control whether or not we need to check the suspend
 * result, and sysctrl_allow_blocked_threads will be used to control
 * whether or not we allow suspend to continue if there are blocked
 * threads.  We allow all combinations of sysctrl_check_user_stop_result
 * and sysctrl_allow_blocked_threads, even though it might not make much
 * sense to not allow blocked threads when we don't even check the stop
 * result.
 */
static int sysctrl_skip_user_threads = 0;		/* default to FALSE */
static int sysctrl_check_user_stop_result = 1;		/* default to TRUE */
static int sysctrl_allow_blocked_threads = 1;		/* default to TRUE */

static int sysc_watchdog_suspended;

extern int sysctrl_enable_detach_suspend;
static int sysc_lastval;

#define	DEBUGP(p) { if (sysctrl_quiesce_debug) p; }
#define	errp	prom_printf

#define	SYSC_CPU_LOOP_MSEC	1000
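/*
 * Quiesce the other cpus.  Each target cpu is expected to acknowledge
 * in sysctrl_freeze() by setting its own sysctrl_gate[] entry and then
 * to spin on this cpu's gate entry, which sysctrl_release_cpus() later
 * sets to let them go.  Any cpu that fails to check in within
 * SYSC_CPU_LOOP_MSEC panics the system.
 */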
static void
sysctrl_grab_cpus(void)
{
	int i;
	cpuset_t others;
	extern cpuset_t cpu_ready_set;
	extern void sysctrl_freeze(void);
	uint64_t sysc_tick_limit;
	uint64_t sysc_current_tick;
	uint64_t sysc_tick_deadline;

	extern u_longlong_t gettick(void);

	for (i = 0; i < NCPU; i++)
		sysctrl_gate[i] = 0;

	/* tell other cpus to go quiet and wait for continue signal */
	others = cpu_ready_set;
	CPUSET_DEL(others, CPU->cpu_id);
	xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate,
	    (uint64_t)(&sysctrl_gate[CPU->cpu_id]));

	sysc_tick_limit =
	    ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000;

	/* wait for each cpu to check in */
	for (i = 0; i < NCPU; i++) {
		if (!CPU_IN_SET(others, i))
			continue;

		/*
		 * Get current tick value and calculate the deadline tick
		 */
		sysc_current_tick = gettick();
		sysc_tick_deadline = sysc_current_tick + sysc_tick_limit;

		while (sysctrl_gate[i] == 0) {
			/* If in panic, we just return */
			if (panicstr)
				break;

			/* Panic if the cpu has not responded by deadline */
			sysc_current_tick = gettick();
			if (sysc_current_tick >= sysc_tick_deadline) {
				cmn_err(CE_PANIC, "sysctrl: cpu %d not "
				    "responding to quiesce command", i);
			}
		}
	}

	/* now even our interrupts are disabled -- really quiet now */
	pstate_save = disable_vec_intr();
}

static void
sysctrl_release_cpus(void)
{
	/* let the other cpus go */
	sysctrl_gate[CPU->cpu_id] = 1;

	/* restore our interrupts too */
	enable_vec_intr(pstate_save);
}

static void
sysctrl_stop_intr(void)
{
	mutex_enter(&cpu_lock);
	kpreempt_disable();
	cyclic_suspend();
}

static void
sysctrl_enable_intr(void)
{
	cyclic_resume();
	(void) spl0();
	kpreempt_enable();
	mutex_exit(&cpu_lock);
}

static int
sysctrl_is_real_device(dev_info_t *dip)
{
	struct regspec *regbuf;
	int length;
	int rc;

	if (ddi_get_driver(dip) == NULL)
		return (FALSE);

	if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
		return (TRUE);
	if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
		return (FALSE);

	/*
	 * now the general case
	 */
	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
	    (caddr_t)&regbuf, &length);
	ASSERT(rc != DDI_PROP_NO_MEMORY);
	if (rc != DDI_PROP_SUCCESS) {
		return (FALSE);
	} else {
		kmem_free(regbuf, length);
		return (TRUE);
	}
}

static dev_info_t *failed_driver;
static char device_path[MAXPATHLEN];
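/*
 * Recursively suspend the device tree, children before parents.  Each
 * parent is held busy while its child list is walked, so device_path
 * stays valid for error reporting.  On failure, the dip that would not
 * suspend is held and remembered in failed_driver so that
 * sysctrl_resume_devices() can release the hold and resume only the
 * devices that were actually suspended.
 */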
static int
sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt)
{
	int circ;

	ASSERT(dip == NULL || ddi_get_parent(dip) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(dip)));

	failed_driver = NULL;
	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		/*
		 * Hold parent busy while walking child list
		 */
		ndi_devi_enter(dip, &circ);
		if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) {
			ndi_devi_exit(dip, circ);
			return (ENXIO);
		}
		ndi_devi_exit(dip, circ);

		if (!sysctrl_is_real_device(dip))
			continue;

		/*
		 * Safe to call ddi_pathname() as parent is held busy
		 */
		(void) ddi_pathname(dip, device_path);
		DEBUGP(errp(" suspending device %s\n", device_path));
		if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
			DEBUGP(errp(" unable to suspend device %s\n",
			    device_path));

			(void) strncpy(pkt->errbuf, device_path,
			    SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND);
			ndi_hold_devi(dip);
			failed_driver = dip;
			return (ENXIO);
		}
	}

	return (DDI_SUCCESS);
}

static void
sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt)
{
	int circ;
	dev_info_t *dip, *next, *last = NULL;

	ASSERT(start == NULL || ddi_get_parent(start) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(start)));

	/* attach in reverse device tree order */
	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last && dip != failed_driver) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}
		if (dip == failed_driver) {
			failed_driver = NULL;
			ndi_rele_devi(dip);
		} else if (sysctrl_is_real_device(dip) &&
		    failed_driver == NULL) {
			/*
			 * Parent dip is held busy, so ddi_pathname() can
			 * be safely called.
			 */
			(void) ddi_pathname(dip, device_path);
			DEBUGP(errp(" resuming device %s\n", device_path));
			if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) {
				/*
				 * XXX - if in the future we decide not to
				 * panic the system, we need to set the error
				 * SYSC_ERR_RESUME here and also change the
				 * cfgadm platform library.
				 */
				cmn_err(CE_PANIC, "Unable to resume device %s",
				    device_path);
			}
		}
		ndi_devi_enter(dip, &circ);
		sysctrl_resume_devices(ddi_get_child(dip), pkt);
		ndi_devi_exit(dip, circ);

		last = dip;
	}
}

/*
 * True if thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from the DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 */
#define	SYSCTRL_VSTOPPED(t)			\
	((t)->t_state == TS_SLEEP &&		\
	(t)->t_wchan != NULL &&			\
	(t)->t_astflag &&			\
	((t)->t_proc_flag & TP_CHKPT))
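/*
 * Ask every user thread to stop, then wait with a quadratically growing
 * backoff (count * count * hz) for up to SYSCTRL_UTSTOP_RETRY passes.
 * A thread that is stopped, or virtually stopped per SYSCTRL_VSTOPPED
 * when blocked threads are allowed, counts as quiesced; otherwise the
 * offending thread's details are recorded in pkt->errbuf and ESRCH is
 * returned.
 */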
static int
sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
{
	int count;
	char cache_psargs[PSARGSZ];
	kthread_id_t cache_tp;
	uint_t cache_t_state;
	int bailout;
	pid_t pid;

	extern void add_one_utstop();
	extern void utstop_timedwait(clock_t);
	extern void utstop_init(void);

#define	SYSCTRL_UTSTOP_RETRY	4
#define	SYSCTRL_UTSTOP_WAIT	hz

	if (sysctrl_skip_user_threads)
		return (DDI_SUCCESS);

	utstop_init();

	/* we need to try a few times to get past fork, etc. */
	for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) {
		kthread_id_t tp;

		/* walk the entire threadlist */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			mutex_enter(&p->p_lock);
			thread_lock(tp);

			if (tp->t_state == TS_STOPPED) {
				/* add another reason to stop this thread */
				tp->t_schedflag &= ~TS_RESUME;
			} else {
				tp->t_proc_flag |= TP_CHKPT;

				thread_unlock(tp);
				mutex_exit(&p->p_lock);
				add_one_utstop();
				mutex_enter(&p->p_lock);
				thread_lock(tp);

				aston(tp);

				if (ISWAKEABLE(tp) || ISWAITING(tp)) {
					setrun_locked(tp);
				}
			}

			/* grab thread if needed */
			if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
				poke_cpu(tp->t_cpu->cpu_id);

			thread_unlock(tp);
			mutex_exit(&p->p_lock);
		}
		mutex_exit(&pidlock);

		/* let everything catch up */
		utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT);

		/* now, walk the threadlist again to see if we are done */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next, bailout = 0;
		    bailout == 0 && tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			/*
			 * If this thread didn't stop, and we don't allow
			 * unstopped blocked threads, bail.
			 */
			thread_lock(tp);
			if (!CPR_ISTOPPED(tp) &&
			    !(sysctrl_allow_blocked_threads &&
			    SYSCTRL_VSTOPPED(tp))) {
				/* nope, cache the details for later */
				bcopy(p->p_user.u_psargs, cache_psargs,
				    sizeof (cache_psargs));
				cache_tp = tp;
				cache_t_state = tp->t_state;
				bailout = 1;
				pid = p->p_pidp->pid_id;
			}
			thread_unlock(tp);
		}
		mutex_exit(&pidlock);

		/* were all the threads stopped? */
		if (!bailout)
			break;
	}

	/* were we unable to stop all threads after a few tries? */
	if (bailout) {
		(void) sprintf(pkt->errbuf, "process: %s id: %d state: %x"
		    " thread descriptor: %p", cache_psargs, (int)pid,
		    cache_t_state, (void *)cache_tp);

		SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD);

		return (ESRCH);
	}

	return (DDI_SUCCESS);
}
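/*
 * Checkpoint CPR-aware kernel daemons through the callback table, then
 * verify that every kernel thread other than interrupt threads is
 * accounted for.  On success the callback table stays locked;
 * sysctrl_resume() unlocks it after the daemons are restarted.
 */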
static int
sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt)
{
	caddr_t name;
	kthread_id_t tp;

	if (sysctrl_skip_kernel_threads) {
		return (DDI_SUCCESS);
	}

	/*
	 * Note: we unlock the table in resume.
	 * We only need to lock the callback table if we are actually
	 * suspending kernel threads.
	 */
	callb_lock_table();
	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {

		(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
		SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
		return (EBUSY);
	}

	/*
	 * Verify that all threads are accounted for
	 */
	mutex_enter(&pidlock);
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		if (p->p_as != &kas)
			continue;

		if (tp->t_flag & T_INTR_THREAD)
			continue;

		if (!callb_is_stopped(tp, &name)) {
			mutex_exit(&pidlock);
			(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
			return (EBUSY);
		}
	}

	mutex_exit(&pidlock);
	return (DDI_SUCCESS);
}

static void
sysctrl_start_user_threads(void)
{
	kthread_id_t tp;

	mutex_enter(&pidlock);

	/* walk all threads and release them */
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		/* skip kernel threads */
		if (p->p_as == &kas)
			continue;

		mutex_enter(&p->p_lock);
		tp->t_proc_flag &= ~TP_CHKPT;
		mutex_exit(&p->p_lock);

		thread_lock(tp);
		if (CPR_ISTOPPED(tp)) {
			/* back on the runq */
			tp->t_schedflag |= TS_RESUME;
			setrun_locked(tp);
		}
		thread_unlock(tp);
	}

	mutex_exit(&pidlock);
}

static void
sysctrl_signal_user(int sig)
{
	struct proc *p;

	mutex_enter(&pidlock);

	for (p = practive; p != NULL; p = p->p_next) {
		/* only user threads */
		if (p->p_exec == NULL || p->p_stat == SZOMB ||
		    p == proc_init || p == ttoproc(curthread))
			continue;

		mutex_enter(&p->p_lock);
		sigtoproc(p, NULL, sig);
		mutex_exit(&p->p_lock);
	}

	mutex_exit(&pidlock);

	/* add a bit of delay */
	delay(hz);
}
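/*
 * Undo the suspend in reverse order.  suspend_state records how far
 * sysctrl_suspend() got, and each case falls through to the next so a
 * partially completed suspend is unwound from exactly that point.
 */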
void
sysctrl_resume(sysc_cfga_pkt_t *pkt)
{
#ifndef Bug_4154263
	DEBUGP(errp("resume system...\n"));
#endif
	switch (suspend_state) {
	case SYSC_STATE_FULL:
		/*
		 * release all the other cpus
		 */
#ifndef Bug_4154263
		DEBUGP(errp("release cpus..."));
#endif
		/*
		 * Prevent false alarm in tod_validate() due to tod
		 * value change between suspend and resume
		 */
		mutex_enter(&tod_lock);
		tod_fault_reset();
		mutex_exit(&tod_lock);

		sysctrl_release_cpus();
		DEBUGP(errp("cpus resumed...\n"));

		/*
		 * If we suspended the hw watchdog at suspend,
		 * re-enable it now.
		 */
		if (sysc_watchdog_suspended) {
			mutex_enter(&tod_lock);
			tod_ops.tod_set_watchdog_timer(
			    watchdog_timeout_seconds);
			mutex_exit(&tod_lock);
		}

		/*
		 * resume callout
		 */
		(void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME);
		(void) callb_execute_class(CB_CL_CPR_CALLOUT,
		    CB_CODE_CPR_RESUME);
		sysctrl_enable_intr();
		/* FALLTHROUGH */

	case SYSC_STATE_DRIVER:
		/*
		 * resume drivers
		 */
		DEBUGP(errp("resume drivers..."));
		sysctrl_resume_devices(ddi_root_node(), pkt);
		DEBUGP(errp("done\n"));

		/*
		 * resume the lock manager
		 */
		lm_cprresume();

		/* FALLTHROUGH */

	case SYSC_STATE_DAEMON:
		/*
		 * resume kernel daemons
		 */
		if (!sysctrl_skip_kernel_threads) {
			DEBUGP(errp("starting kernel daemons..."));
			(void) callb_execute_class(CB_CL_CPR_DAEMON,
			    CB_CODE_CPR_RESUME);
			callb_unlock_table();
		}
		DEBUGP(errp("done\n"));

		/* FALLTHROUGH */

	case SYSC_STATE_USER:
		/*
		 * finally, resume user threads
		 */
		if (!sysctrl_skip_user_threads) {
			DEBUGP(errp("starting user threads..."));
			sysctrl_start_user_threads();
			DEBUGP(errp("done\n"));
		}
		/* FALLTHROUGH */

	case SYSC_STATE_BEGIN:
	default:
		/*
		 * let those who care know that we've just resumed
		 */
		DEBUGP(errp("sending SIGTHAW..."));
		sysctrl_signal_user(SIGTHAW);
		DEBUGP(errp("done\n"));
		break;
	}

	/* Restore sysctrl detach/suspend to its original value */
	sysctrl_enable_detach_suspend = sysc_lastval;

	DEBUGP(errp("system state restored\n"));
}

void
sysctrl_suspend_prepare(void)
{
	/*
	 * We use a function, lm_cprsuspend(), in the suspend flow that
	 * is redirected to a module through the modstubs mechanism.
	 * If the module is currently not loaded, modstubs attempts
	 * the modload.  The context this happens in below causes the
	 * module load to block forever, so this function must be called
	 * in the normal system call context ahead of time.
	 */
	(void) modload("misc", "klmmod");
}
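/*
 * Quiesce the system for dynamic reconfiguration: stop user threads,
 * then kernel daemons, sync and suspend the drivers, and finally freeze
 * the callout table and the remaining cpus.  On any failure the system
 * is resumed from the recorded suspend_state and the error returned.
 */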
int
sysctrl_suspend(sysc_cfga_pkt_t *pkt)
{
	int rc = DDI_SUCCESS;

	/* enable the sysctrl detach/suspend function */
	sysc_lastval = sysctrl_enable_detach_suspend;
	sysctrl_enable_detach_suspend = 1;

	/*
	 * first, stop all user threads
	 */
	DEBUGP(errp("\nstopping user threads..."));
	suspend_state = SYSC_STATE_USER;
	if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) &&
	    sysctrl_check_user_stop_result) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * now stop daemon activities
	 */
	DEBUGP(errp("stopping kernel daemons..."));
	suspend_state = SYSC_STATE_DAEMON;
	if ((rc = sysctrl_stop_kernel_threads(pkt)) != 0) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * This sync swaps out all user pages
	 */
	vfs_sync(SYNC_ALL);

	/*
	 * special treatment for lock manager
	 */
	lm_cprsuspend();

	/*
	 * sync the file system in case we never make it back
	 */
	sync();

	/*
	 * now suspend drivers
	 */
	DEBUGP(errp("suspending drivers..."));
	suspend_state = SYSC_STATE_DRIVER;
	if ((rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) != 0) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * handle the callout table
	 */
	sysctrl_stop_intr();

	(void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT);

	/*
	 * if the watchdog was activated, disable it
	 */
	if (watchdog_activated) {
		mutex_enter(&tod_lock);
		tod_ops.tod_clear_watchdog_timer();
		mutex_exit(&tod_lock);
		sysc_watchdog_suspended = 1;
	} else {
		sysc_watchdog_suspended = 0;
	}

	/*
	 * finally, grab all cpus
	 */
	DEBUGP(errp("freezing all cpus...\n"));
	suspend_state = SYSC_STATE_FULL;
	sysctrl_grab_cpus();
#ifndef Bug_4154263
	DEBUGP(errp("done\n"));

	DEBUGP(errp("system is quiesced\n"));
#endif

	return (rc);
}