/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
 */
#define	Bug_4154263

/*
 * A CPR derivative specifically for sunfire
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/ddi.h>
#define	SUNDDI_IMPL
#include <sys/sunddi.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <nfs/lm.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/callb.h>
#include <sys/clock.h>
#include <sys/x_call.h>
#include <sys/cpuvar.h>
#include <sys/epm.h>
#include <sys/vfs.h>
#include <sys/fhc.h>
#include <sys/sysctrl.h>
#include <sys/promif.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/cyclic.h>
#include <sys/sunndi.h>

static enum sysctrl_suspend_state {
	SYSC_STATE_BEGIN = 0,
	SYSC_STATE_USER,
	SYSC_STATE_DAEMON,
	SYSC_STATE_DRIVER,
	SYSC_STATE_FULL } suspend_state;

static int	pstate_save;
static uint_t	sysctrl_gate[NCPU];
int		sysctrl_quiesce_debug = FALSE;
static int	sysctrl_skip_kernel_threads = TRUE;
/*
 * sysctrl_skip_user_threads is used to control whether user threads
 * should be suspended.  If it is true, the remaining flags are not
 * used; if it is false, sysctrl_check_user_stop_result controls
 * whether we check the result of the stop attempt, and
 * sysctrl_allow_blocked_threads controls whether suspend may proceed
 * while some threads remain blocked.  All combinations of
 * sysctrl_check_user_stop_result and sysctrl_allow_blocked_threads
 * are allowed, even though it makes little sense to disallow blocked
 * threads when we don't check the stop result at all.
 */
static int	sysctrl_skip_user_threads = 0;		/* default to FALSE */
static int	sysctrl_check_user_stop_result = 1;	/* default to TRUE */
static int	sysctrl_allow_blocked_threads = 1;	/* default to TRUE */

static int	sysc_watchdog_suspended;

extern int	sysctrl_enable_detach_suspend;
static int	sysc_lastval;

#define	DEBUGP(p)	{ if (sysctrl_quiesce_debug) p; }
#define	errp		prom_printf

#define	SYSC_CPU_LOOP_MSEC	1000

static void
sysctrl_grab_cpus(void)
{
	int i;
	cpuset_t others;
	extern cpuset_t cpu_ready_set;
	extern void sysctrl_freeze(void);
	uint64_t sysc_tick_limit;
	uint64_t sysc_current_tick;
	uint64_t sysc_tick_deadline;

	extern u_longlong_t gettick(void);

	for (i = 0; i < NCPU; i++)
		sysctrl_gate[i] = 0;

	/* tell other cpus to go quiet and wait for continue signal */
	others = cpu_ready_set;
	CPUSET_DEL(others, CPU->cpu_id);
	xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate,
	    (uint64_t)(&sysctrl_gate[CPU->cpu_id]));

	sysc_tick_limit =
	    ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000;

	/* wait for each cpu to check in */
	for (i = 0; i < NCPU; i++) {
		if (!CPU_IN_SET(others, i))
			continue;

		/*
		 * Get the current tick value and calculate the deadline tick
		 */
		sysc_current_tick = gettick();
		sysc_tick_deadline = sysc_current_tick + sysc_tick_limit;

		while (sysctrl_gate[i] == 0) {
			/* If in panic, we just return */
			if (panicstr)
				break;

			/*
			 * Panic the system if the cpu has not responded
			 * by the deadline
			 */
			sysc_current_tick = gettick();
			if (sysc_current_tick >= sysc_tick_deadline) {
				cmn_err(CE_PANIC, "sysctrl: cpu %d not "
				    "responding to quiesce command", i);
			}
		}
	}

	/* now even our interrupts are disabled -- really quiet now */
	pstate_save = disable_vec_intr();
}

static void
sysctrl_release_cpus(void)
{
	/* let the other cpus go */
	sysctrl_gate[CPU->cpu_id] = 1;

	/* restore our interrupts too */
	enable_vec_intr(pstate_save);
}
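
/*
 * A note on the gate protocol above, inferred from the xt_some() call
 * signature (sysctrl_freeze() itself is platform code that is not in
 * this file): each target cpu checks in by setting its own slot in
 * sysctrl_gate[] and then spins until the master's slot, passed as the
 * second argument, becomes nonzero.  sysctrl_grab_cpus() polls the
 * check-in flags under a tick deadline, and sysctrl_release_cpus()
 * opens the gate by setting the master's slot.
 */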
static void
sysctrl_stop_intr(void)
{
	mutex_enter(&cpu_lock);
	kpreempt_disable();
	cyclic_suspend();
}

static void
sysctrl_enable_intr(void)
{
	cyclic_resume();
	(void) spl0();
	kpreempt_enable();
	mutex_exit(&cpu_lock);
}

static int
sysctrl_is_real_device(dev_info_t *dip)
{
	struct regspec *regbuf;
	int length;
	int rc;

	if (ddi_get_driver(dip) == NULL)
		return (FALSE);

	if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR))
		return (TRUE);
	if (DEVI(dip)->devi_pm_flags & PMC_NO_SR)
		return (FALSE);

	/*
	 * now the general case
	 */
	rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg",
	    (caddr_t)&regbuf, &length);
	ASSERT(rc != DDI_PROP_NO_MEMORY);
	if (rc != DDI_PROP_SUCCESS) {
		return (FALSE);
	} else {
		kmem_free(regbuf, length);
		return (TRUE);
	}
}

static dev_info_t *failed_driver;
static char device_path[MAXPATHLEN];

static int
sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt)
{
	int circ;

	ASSERT(dip == NULL || ddi_get_parent(dip) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(dip)));

	failed_driver = NULL;
	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		/*
		 * Hold parent busy while walking child list
		 */
		ndi_devi_enter(dip, &circ);
		if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) {
			ndi_devi_exit(dip, circ);
			return (ENXIO);
		}
		ndi_devi_exit(dip, circ);

		if (!sysctrl_is_real_device(dip))
			continue;

		/*
		 * Safe to call ddi_pathname() as parent is held busy
		 */
		(void) ddi_pathname(dip, device_path);
		DEBUGP(errp(" suspending device %s\n", device_path));
		if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) {
			DEBUGP(errp(" unable to suspend device %s\n",
			    device_path));

			(void) strncpy(pkt->errbuf, device_path,
			    SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND);
			ndi_hold_devi(dip);
			failed_driver = dip;
			return (ENXIO);
		}
	}

	return (DDI_SUCCESS);
}

static void
sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt)
{
	int circ;
	dev_info_t *dip, *next, *last = NULL;

	ASSERT(start == NULL || ddi_get_parent(start) == NULL ||
	    DEVI_BUSY_OWNED(ddi_get_parent(start)));

	/* attach in reverse device tree order */
	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last && dip != failed_driver) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}
		if (dip == failed_driver) {
			failed_driver = NULL;
			ndi_rele_devi(dip);
		} else if (sysctrl_is_real_device(dip) &&
		    failed_driver == NULL) {
			/*
			 * Parent dip is held busy, so ddi_pathname() can
			 * be safely called.
			 */
			(void) ddi_pathname(dip, device_path);
			DEBUGP(errp(" resuming device %s\n", device_path));
			if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) {
				/*
				 * XXX - if in the future we decide not to
				 * panic the system, we need to set the error
				 * SYSC_ERR_RESUME here and also change the
				 * cfgadm platform library.
				 */
				cmn_err(CE_PANIC, "Unable to resume device %s",
				    device_path);
			}
		}
		ndi_devi_enter(dip, &circ);
		sysctrl_resume_devices(ddi_get_child(dip), pkt);
		ndi_devi_exit(dip, circ);

		last = dip;
	}
}
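
/*
 * Implementation note: each pass of the outer loop above re-scans the
 * sibling chain for the sibling just ahead of `last', so siblings are
 * resumed in the reverse of their suspend order.  While failed_driver
 * is still set, the walk is visiting devices that were never suspended
 * (suspend stopped at the failure point), so they are skipped; once
 * the marker is reached, the extra hold taken at suspend time is
 * released and the remaining, successfully suspended siblings are
 * resumed.
 */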
/*
 * True if thread is virtually stopped.  Similar to CPR_VSTOPPED
 * but from the DR point of view.  These user threads are waiting in
 * the kernel.  Once they complete in the kernel, they will process
 * the stop signal and stop.
 */
#define	SYSCTRL_VSTOPPED(t)		\
	((t)->t_state == TS_SLEEP &&	\
	(t)->t_wchan != NULL &&		\
	(t)->t_astflag &&		\
	((t)->t_proc_flag & TP_CHKPT))

static int
sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt)
{
	int count;
	char cache_psargs[PSARGSZ];
	kthread_id_t cache_tp;
	uint_t cache_t_state;
	int bailout;
	pid_t pid;

	extern void add_one_utstop();
	extern void utstop_timedwait(clock_t);
	extern void utstop_init(void);

#define	SYSCTRL_UTSTOP_RETRY	4
#define	SYSCTRL_UTSTOP_WAIT	hz

	if (sysctrl_skip_user_threads)
		return (DDI_SUCCESS);

	utstop_init();

	/* we need to try a few times to get past fork, etc. */
	for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) {
		kthread_id_t tp;

		/* walk the entire threadlist */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			mutex_enter(&p->p_lock);
			thread_lock(tp);

			if (tp->t_state == TS_STOPPED) {
				/* add another reason to stop this thread */
				tp->t_schedflag &= ~TS_RESUME;
			} else {
				tp->t_proc_flag |= TP_CHKPT;

				thread_unlock(tp);
				mutex_exit(&p->p_lock);
				add_one_utstop();
				mutex_enter(&p->p_lock);
				thread_lock(tp);

				aston(tp);

				if (tp->t_state == TS_SLEEP &&
				    (tp->t_flag & T_WAKEABLE)) {
					setrun_locked(tp);
				}
			}

			/* grab thread if needed */
			if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
				poke_cpu(tp->t_cpu->cpu_id);

			thread_unlock(tp);
			mutex_exit(&p->p_lock);
		}
		mutex_exit(&pidlock);

		/* let everything catch up */
		utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT);

		/* now, walk the threadlist again to see if we are done */
		mutex_enter(&pidlock);
		for (tp = curthread->t_next, bailout = 0;
		    bailout == 0 && tp != curthread; tp = tp->t_next) {
			proc_t *p = ttoproc(tp);

			/* handle kernel threads separately */
			if (p->p_as == &kas || p->p_stat == SZOMB)
				continue;

			/*
			 * If this thread didn't stop, and we don't allow
			 * unstopped blocked threads, bail.
			 */
			thread_lock(tp);
			if (!CPR_ISTOPPED(tp) &&
			    !(sysctrl_allow_blocked_threads &&
			    SYSCTRL_VSTOPPED(tp))) {

				/* nope, cache the details for later */
				bcopy(p->p_user.u_psargs, cache_psargs,
				    sizeof (cache_psargs));
				cache_tp = tp;
				cache_t_state = tp->t_state;
				bailout = 1;
				pid = p->p_pidp->pid_id;
			}
			thread_unlock(tp);
		}
		mutex_exit(&pidlock);

		/* were all the threads stopped? */
		if (!bailout)
			break;
	}

	/* were we unable to stop all threads after a few tries? */
	if (bailout) {
		(void) sprintf(pkt->errbuf, "process: %s id: %d state: %x"
		    " thread descriptor: %p", cache_psargs, (int)pid,
		    cache_t_state, (void *)cache_tp);

		SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD);

		return (ESRCH);
	}

	return (DDI_SUCCESS);
}
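
/*
 * The retry loop above backs off quadratically: utstop_timedwait() is
 * given count * count * hz ticks, i.e. waits of 0, 1, 4 and 9 seconds
 * across the four passes, which gives threads in transient states
 * (fork, for example) increasingly generous chances to reach a
 * stoppable state before we give up.
 */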
static int
sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt)
{
	caddr_t name;
	kthread_id_t tp;

	if (sysctrl_skip_kernel_threads) {
		return (DDI_SUCCESS);
	}

	/*
	 * Note: we unlock the table in resume.
	 * We only need to lock the callback table if we are actually
	 * suspending kernel threads.
	 */
	callb_lock_table();
	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {

		(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
		SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
		return (EBUSY);
	}

	/*
	 * Verify that all threads are accounted for
	 */
	mutex_enter(&pidlock);
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		if (p->p_as != &kas)
			continue;

		if (tp->t_flag & T_INTR_THREAD)
			continue;

		if (!callb_is_stopped(tp, &name)) {
			mutex_exit(&pidlock);
			(void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN);
			SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD);
			return (EBUSY);
		}
	}

	mutex_exit(&pidlock);
	return (DDI_SUCCESS);
}

static void
sysctrl_start_user_threads(void)
{
	kthread_id_t tp;

	mutex_enter(&pidlock);

	/* walk all threads and release them */
	for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) {
		proc_t *p = ttoproc(tp);

		/* skip kernel threads */
		if (p->p_as == &kas)
			continue;

		mutex_enter(&p->p_lock);
		tp->t_proc_flag &= ~TP_CHKPT;
		mutex_exit(&p->p_lock);

		thread_lock(tp);
		if (CPR_ISTOPPED(tp)) {
			/* back on the runq */
			tp->t_schedflag |= TS_RESUME;
			setrun_locked(tp);
		}
		thread_unlock(tp);
	}

	mutex_exit(&pidlock);
}

static void
sysctrl_signal_user(int sig)
{
	struct proc *p;

	mutex_enter(&pidlock);

	for (p = practive; p != NULL; p = p->p_next) {
		/* only user threads */
		if (p->p_exec == NULL || p->p_stat == SZOMB ||
		    p == proc_init || p == ttoproc(curthread))
			continue;

		mutex_enter(&p->p_lock);
		sigtoproc(p, NULL, sig);
		mutex_exit(&p->p_lock);
	}

	mutex_exit(&pidlock);

	/* add a bit of delay */
	delay(hz);
}
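
/*
 * sysctrl_resume() unwinds sysctrl_suspend() in reverse.  suspend_state
 * records how far the suspend got, and the case labels below fall
 * through deliberately: entry at SYSC_STATE_FULL undoes every stage,
 * while a suspend that failed part way enters at a later label and
 * only undoes the stages that actually completed.
 */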
void
sysctrl_resume(sysc_cfga_pkt_t *pkt)
{
#ifndef Bug_4154263
	DEBUGP(errp("resume system...\n"));
#endif
	switch (suspend_state) {
	case SYSC_STATE_FULL:
		/*
		 * release all the other cpus
		 */
#ifndef Bug_4154263
		DEBUGP(errp("release cpus..."));
#endif
		sysctrl_release_cpus();
		DEBUGP(errp("cpus resumed...\n"));

		/*
		 * If we suspended the hw watchdog at suspend,
		 * re-enable it now.
		 */
		if (sysc_watchdog_suspended) {
			mutex_enter(&tod_lock);
			tod_ops.tod_set_watchdog_timer(
			    watchdog_timeout_seconds);
			mutex_exit(&tod_lock);
		}

		/*
		 * resume callout
		 */
		(void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME);
		(void) callb_execute_class(CB_CL_CPR_CALLOUT,
		    CB_CODE_CPR_RESUME);
		sysctrl_enable_intr();
		/* FALLTHROUGH */

	case SYSC_STATE_DRIVER:
		/*
		 * resume drivers
		 */
		DEBUGP(errp("resume drivers..."));
		sysctrl_resume_devices(ddi_root_node(), pkt);
		DEBUGP(errp("done\n"));

		/*
		 * resume the lock manager
		 */
		lm_cprresume();

		/* FALLTHROUGH */

	case SYSC_STATE_DAEMON:
		/*
		 * resume kernel daemons
		 */
		if (!sysctrl_skip_kernel_threads) {
			DEBUGP(errp("starting kernel daemons..."));
			(void) callb_execute_class(CB_CL_CPR_DAEMON,
			    CB_CODE_CPR_RESUME);
			callb_unlock_table();
		}
		DEBUGP(errp("done\n"));

		/* FALLTHROUGH */

	case SYSC_STATE_USER:
		/*
		 * finally, resume user threads
		 */
		if (!sysctrl_skip_user_threads) {
			DEBUGP(errp("starting user threads..."));
			sysctrl_start_user_threads();
			DEBUGP(errp("done\n"));
		}
		/* FALLTHROUGH */

	case SYSC_STATE_BEGIN:
	default:
		/*
		 * let those who care know that we've just resumed
		 */
		DEBUGP(errp("sending SIGTHAW..."));
		sysctrl_signal_user(SIGTHAW);
		DEBUGP(errp("done\n"));
		break;
	}

	/* Restore sysctrl detach/suspend to its original value */
	sysctrl_enable_detach_suspend = sysc_lastval;

	DEBUGP(errp("system state restored\n"));
}

void
sysctrl_suspend_prepare(void)
{
	/*
	 * We use a function, lm_cprsuspend(), in the suspend flow that
	 * is redirected to a module through the modstubs mechanism.
	 * If the module is currently not loaded, modstubs attempts
	 * the modload.  The context this happens in below causes the
	 * module load to block forever, so this function must be called
	 * in the normal system call context ahead of time.
	 */
	(void) modload("misc", "klmmod");
}
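
/*
 * The suspend sequence below quiesces the machine from the outside in:
 * user threads first, then kernel daemons, then a sync of user pages
 * and file systems, then drivers, then the callout/cyclic machinery,
 * and finally the other cpus.  Every step that can fail calls
 * sysctrl_resume() on its error path before returning, so a failed
 * suspend leaves the system running.
 */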
int
sysctrl_suspend(sysc_cfga_pkt_t *pkt)
{
	int rc = DDI_SUCCESS;

	/* enable the sysctrl detach/suspend function */
	sysc_lastval = sysctrl_enable_detach_suspend;
	sysctrl_enable_detach_suspend = 1;

	/*
	 * first, stop all user threads
	 */
	DEBUGP(errp("\nstopping user threads..."));
	suspend_state = SYSC_STATE_USER;
	if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) &&
	    sysctrl_check_user_stop_result) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * now stop daemon activities
	 */
	DEBUGP(errp("stopping kernel daemons..."));
	suspend_state = SYSC_STATE_DAEMON;
	if (rc = sysctrl_stop_kernel_threads(pkt)) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * This sync swaps out all user pages
	 */
	vfs_sync(SYNC_ALL);

	/*
	 * special treatment for lock manager
	 */
	lm_cprsuspend();

	/*
	 * sync the file system in case we never make it back
	 */
	sync();

	/*
	 * now suspend drivers
	 */
	DEBUGP(errp("suspending drivers..."));
	suspend_state = SYSC_STATE_DRIVER;
	if (rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) {
		sysctrl_resume(pkt);
		return (rc);
	}
	DEBUGP(errp("done\n"));

	/*
	 * handle the callout table
	 */
	sysctrl_stop_intr();

	(void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT);

	/*
	 * if the watchdog was activated, disable it
	 */
	if (watchdog_activated) {
		mutex_enter(&tod_lock);
		tod_ops.tod_clear_watchdog_timer();
		mutex_exit(&tod_lock);
		sysc_watchdog_suspended = 1;
	} else {
		sysc_watchdog_suspended = 0;
	}

	/*
	 * finally, grab all cpus
	 */
	DEBUGP(errp("freezing all cpus...\n"));
	suspend_state = SYSC_STATE_FULL;
	sysctrl_grab_cpus();
#ifndef Bug_4154263
	DEBUGP(errp("done\n"));

	DEBUGP(errp("system is quiesced\n"));
#endif

	return (rc);
}
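
/*
 * Expected usage, sketched from this file alone (the real caller is
 * the sysctrl cfgadm support code, which is not shown here):
 *
 *	sysctrl_suspend_prepare();		called in syscall context
 *	if (sysctrl_suspend(pkt) == DDI_SUCCESS) {
 *		...operate on the quiesced machine...
 *		sysctrl_resume(pkt);
 *	}
 *
 * A failed sysctrl_suspend() has already resumed the system itself, so
 * the caller pairs sysctrl_resume() only with a successful suspend.
 */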