/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This workaround inhibits prom_printf after the cpus are grabbed.
 * This can be removed when 4154263 is corrected.
30 */ 31 #define Bug_4154263 32 33 /* 34 * A CPR derivative specifically for sunfire 35 */ 36 37 #include <sys/types.h> 38 #include <sys/systm.h> 39 #include <sys/machparam.h> 40 #include <sys/machsystm.h> 41 #include <sys/ddi.h> 42 #define SUNDDI_IMPL 43 #include <sys/sunddi.h> 44 #include <sys/time.h> 45 #include <sys/kmem.h> 46 #include <nfs/lm.h> 47 #include <sys/ddi_impldefs.h> 48 #include <sys/obpdefs.h> 49 #include <sys/cmn_err.h> 50 #include <sys/debug.h> 51 #include <sys/errno.h> 52 #include <sys/callb.h> 53 #include <sys/clock.h> 54 #include <sys/x_call.h> 55 #include <sys/cpuvar.h> 56 #include <sys/epm.h> 57 #include <sys/vfs.h> 58 #include <sys/fhc.h> 59 #include <sys/sysctrl.h> 60 #include <sys/promif.h> 61 #include <sys/conf.h> 62 #include <sys/modctl.h> 63 #include <sys/cyclic.h> 64 #include <sys/sunndi.h> 65 #include <sys/machsystm.h> 66 67 static enum sysctrl_suspend_state { 68 SYSC_STATE_BEGIN = 0, 69 SYSC_STATE_USER, 70 SYSC_STATE_DAEMON, 71 SYSC_STATE_DRIVER, 72 SYSC_STATE_FULL } suspend_state; 73 74 static int pstate_save; 75 static uint_t sysctrl_gate[NCPU]; 76 int sysctrl_quiesce_debug = FALSE; 77 static int sysctrl_skip_kernel_threads = TRUE; 78 79 /* 80 * sysctrl_skip_user_threads is used to control if user threads should 81 * be suspended. If sysctrl_skip_user_threads is true, the rest of the 82 * flags are not used; if it is false, sysctrl_check_user_stop_result 83 * will be used to control whether or not we need to check suspend 84 * result, and sysctrl_allow_blocked_threads will be used to control 85 * whether or not we allow suspend to continue if there are blocked 86 * threads. We allow all combinations of sysctrl_check_user_stop_result 87 * and sysctrl_allow_block_threads, even though it might not make much 88 * sense to not allow block threads when we don't even check stop 89 * result. 
90 */ 91 static int sysctrl_skip_user_threads = 0; /* default to FALSE */ 92 static int sysctrl_check_user_stop_result = 1; /* default to TRUE */ 93 static int sysctrl_allow_blocked_threads = 1; /* default to TRUE */ 94 95 static int sysc_watchdog_suspended; 96 97 extern int sysctrl_enable_detach_suspend; 98 static int sysc_lastval; 99 100 #define DEBUGP(p) { if (sysctrl_quiesce_debug) p; } 101 #define errp prom_printf 102 103 #define SYSC_CPU_LOOP_MSEC 1000 104 105 static void 106 sysctrl_grab_cpus(void) 107 { 108 int i; 109 cpuset_t others; 110 extern cpuset_t cpu_ready_set; 111 extern void sysctrl_freeze(void); 112 uint64_t sysc_tick_limit; 113 uint64_t sysc_current_tick; 114 uint64_t sysc_tick_deadline; 115 116 extern u_longlong_t gettick(void); 117 118 for (i = 0; i < NCPU; i++) 119 sysctrl_gate[i] = 0; 120 121 /* tell other cpus to go quiet and wait for continue signal */ 122 others = cpu_ready_set; 123 CPUSET_DEL(others, CPU->cpu_id); 124 xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate, 125 (uint64_t)(&sysctrl_gate[CPU->cpu_id])); 126 127 sysc_tick_limit = ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000; 128 129 /* wait for each cpu to check in */ 130 for (i = 0; i < NCPU; i++) { 131 if (!CPU_IN_SET(others, i)) 132 continue; 133 134 /* 135 * Get current tick value and calculate the deadline tick 136 */ 137 sysc_current_tick = gettick(); 138 sysc_tick_deadline = sysc_current_tick + sysc_tick_limit; 139 140 while (sysctrl_gate[i] == 0) { 141 /* If in panic, we just return */ 142 if (panicstr) 143 break; 144 145 /* Panic the system if cpu not responsed by deadline */ 146 sysc_current_tick = gettick(); 147 if (sysc_current_tick >= sysc_tick_deadline) { 148 cmn_err(CE_PANIC, "sysctrl: cpu %d not " 149 "responding to quiesce command", i); 150 } 151 } 152 } 153 154 /* now even our interrupts are disabled -- really quiet now */ 155 pstate_save = disable_vec_intr(); 156 } 157 158 static void 159 sysctrl_release_cpus(void) 160 { 161 /* let the 
other cpus go */ 162 sysctrl_gate[CPU->cpu_id] = 1; 163 164 /* restore our interrupts too */ 165 enable_vec_intr(pstate_save); 166 } 167 168 static void 169 sysctrl_stop_intr(void) 170 { 171 mutex_enter(&cpu_lock); 172 kpreempt_disable(); 173 cyclic_suspend(); 174 } 175 176 static void 177 sysctrl_enable_intr(void) 178 { 179 cyclic_resume(); 180 (void) spl0(); 181 kpreempt_enable(); 182 mutex_exit(&cpu_lock); 183 } 184 185 static int 186 sysctrl_is_real_device(dev_info_t *dip) 187 { 188 struct regspec *regbuf; 189 int length; 190 int rc; 191 192 if (ddi_get_driver(dip) == NULL) 193 return (FALSE); 194 195 if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR)) 196 return (TRUE); 197 if (DEVI(dip)->devi_pm_flags & PMC_NO_SR) 198 return (FALSE); 199 200 /* 201 * now the general case 202 */ 203 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", 204 (caddr_t)®buf, &length); 205 ASSERT(rc != DDI_PROP_NO_MEMORY); 206 if (rc != DDI_PROP_SUCCESS) { 207 return (FALSE); 208 } else { 209 kmem_free(regbuf, length); 210 return (TRUE); 211 } 212 } 213 214 static dev_info_t *failed_driver; 215 static char device_path[MAXPATHLEN]; 216 217 static int 218 sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt) 219 { 220 int circ; 221 222 ASSERT(dip == NULL || ddi_get_parent(dip) == NULL || 223 DEVI_BUSY_OWNED(ddi_get_parent(dip))); 224 225 failed_driver = NULL; 226 for (; dip != NULL; dip = ddi_get_next_sibling(dip)) { 227 /* 228 * Hold parent busy while walking child list 229 */ 230 ndi_devi_enter(dip, &circ); 231 if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) { 232 ndi_devi_exit(dip, circ); 233 return (ENXIO); 234 } 235 ndi_devi_exit(dip, circ); 236 237 if (!sysctrl_is_real_device(dip)) 238 continue; 239 240 /* 241 * Safe to call ddi_pathname() as parent is held busy 242 */ 243 (void) ddi_pathname(dip, device_path); 244 DEBUGP(errp(" suspending device %s\n", device_path)); 245 if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) { 246 
DEBUGP(errp(" unable to suspend device %s\n", 247 device_path)); 248 249 (void) strncpy(pkt->errbuf, device_path, 250 SYSC_OUTPUT_LEN); 251 SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND); 252 ndi_hold_devi(dip); 253 failed_driver = dip; 254 return (ENXIO); 255 } 256 } 257 258 return (DDI_SUCCESS); 259 } 260 261 static void 262 sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt) 263 { 264 int circ; 265 dev_info_t *dip, *next, *last = NULL; 266 267 ASSERT(start == NULL || ddi_get_parent(start) == NULL || 268 DEVI_BUSY_OWNED(ddi_get_parent(start))); 269 270 /* attach in reverse device tree order */ 271 while (last != start) { 272 dip = start; 273 next = ddi_get_next_sibling(dip); 274 while (next != last && dip != failed_driver) { 275 dip = next; 276 next = ddi_get_next_sibling(dip); 277 } 278 if (dip == failed_driver) { 279 failed_driver = NULL; 280 ndi_rele_devi(dip); 281 } else if (sysctrl_is_real_device(dip) && 282 failed_driver == NULL) { 283 /* 284 * Parent dip is held busy, so ddi_pathname() can 285 * be safely called. 286 */ 287 (void) ddi_pathname(dip, device_path); 288 DEBUGP(errp(" resuming device %s\n", device_path)); 289 if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) { 290 /* 291 * XXX - if in the future we decide not to 292 * panic the system, we need to set the error 293 * SYSC_ERR_RESUME here and also change the 294 * cfgadm platform library. 295 */ 296 cmn_err(CE_PANIC, "Unable to resume device %s", 297 device_path); 298 } 299 } 300 ndi_devi_enter(dip, &circ); 301 sysctrl_resume_devices(ddi_get_child(dip), pkt); 302 ndi_devi_exit(dip, circ); 303 304 last = dip; 305 } 306 } 307 308 /* 309 * True if thread is virtually stopped. Similar to CPR_VSTOPPED 310 * but from DR point of view. These user threads are waiting in 311 * the kernel. Once they complete in the kernel, they will process 312 * the stop signal and stop. 
313 */ 314 #define SYSCTRL_VSTOPPED(t) \ 315 ((t)->t_state == TS_SLEEP && \ 316 (t)->t_wchan != NULL && \ 317 (t)->t_astflag && \ 318 ((t)->t_proc_flag & TP_CHKPT)) 319 320 static int 321 sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt) 322 { 323 int count; 324 char cache_psargs[PSARGSZ]; 325 kthread_id_t cache_tp; 326 uint_t cache_t_state; 327 int bailout; 328 pid_t pid; 329 330 extern void add_one_utstop(); 331 extern void utstop_timedwait(clock_t); 332 extern void utstop_init(void); 333 334 #define SYSCTRL_UTSTOP_RETRY 4 335 #define SYSCTRL_UTSTOP_WAIT hz 336 337 if (sysctrl_skip_user_threads) 338 return (DDI_SUCCESS); 339 340 utstop_init(); 341 342 /* we need to try a few times to get past fork, etc. */ 343 for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) { 344 kthread_id_t tp; 345 346 /* walk the entire threadlist */ 347 mutex_enter(&pidlock); 348 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 349 proc_t *p = ttoproc(tp); 350 351 /* handle kernel threads separately */ 352 if (p->p_as == &kas || p->p_stat == SZOMB) 353 continue; 354 355 mutex_enter(&p->p_lock); 356 thread_lock(tp); 357 358 if (tp->t_state == TS_STOPPED) { 359 /* add another reason to stop this thread */ 360 tp->t_schedflag &= ~TS_RESUME; 361 } else { 362 tp->t_proc_flag |= TP_CHKPT; 363 364 thread_unlock(tp); 365 mutex_exit(&p->p_lock); 366 add_one_utstop(); 367 mutex_enter(&p->p_lock); 368 thread_lock(tp); 369 370 aston(tp); 371 372 if (ISWAKEABLE(tp) || ISWAITING(tp)) { 373 setrun_locked(tp); 374 } 375 376 } 377 378 /* grab thread if needed */ 379 if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU) 380 poke_cpu(tp->t_cpu->cpu_id); 381 382 383 thread_unlock(tp); 384 mutex_exit(&p->p_lock); 385 } 386 mutex_exit(&pidlock); 387 388 389 /* let everything catch up */ 390 utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT); 391 392 393 /* now, walk the threadlist again to see if we are done */ 394 mutex_enter(&pidlock); 395 for (tp = curthread->t_next, bailout = 0; 396 bailout 
== 0 && tp != curthread; tp = tp->t_next) { 397 proc_t *p = ttoproc(tp); 398 399 /* handle kernel threads separately */ 400 if (p->p_as == &kas || p->p_stat == SZOMB) 401 continue; 402 403 /* 404 * If this thread didn't stop, and we don't allow 405 * unstopped blocked threads, bail. 406 */ 407 /* did this thread stop? */ 408 thread_lock(tp); 409 if (!CPR_ISTOPPED(tp) && 410 !(sysctrl_allow_blocked_threads && 411 SYSCTRL_VSTOPPED(tp))) { 412 413 /* nope, cache the details for later */ 414 bcopy(p->p_user.u_psargs, cache_psargs, 415 sizeof (cache_psargs)); 416 cache_tp = tp; 417 cache_t_state = tp->t_state; 418 bailout = 1; 419 pid = p->p_pidp->pid_id; 420 } 421 thread_unlock(tp); 422 } 423 mutex_exit(&pidlock); 424 425 /* were all the threads stopped? */ 426 if (!bailout) 427 break; 428 } 429 430 /* were we unable to stop all threads after a few tries? */ 431 if (bailout) { 432 (void) sprintf(pkt->errbuf, "process: %s id: %d state: %x" 433 " thread descriptor: %p", cache_psargs, (int)pid, 434 cache_t_state, (void *)cache_tp); 435 436 SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD); 437 438 return (ESRCH); 439 } 440 441 return (DDI_SUCCESS); 442 } 443 444 static int 445 sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt) 446 { 447 caddr_t name; 448 kthread_id_t tp; 449 450 if (sysctrl_skip_kernel_threads) { 451 return (DDI_SUCCESS); 452 } 453 454 /* 455 * Note: we unlock the table in resume. 456 * We only need to lock the callback table if we are actually 457 * suspending kernel threads. 
458 */ 459 callb_lock_table(); 460 if ((name = callb_execute_class(CB_CL_CPR_DAEMON, 461 CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) { 462 463 (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); 464 SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); 465 return (EBUSY); 466 } 467 468 /* 469 * Verify that all threads are accounted for 470 */ 471 mutex_enter(&pidlock); 472 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 473 proc_t *p = ttoproc(tp); 474 475 if (p->p_as != &kas) 476 continue; 477 478 if (tp->t_flag & T_INTR_THREAD) 479 continue; 480 481 if (!callb_is_stopped(tp, &name)) { 482 mutex_exit(&pidlock); 483 (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); 484 SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); 485 return (EBUSY); 486 } 487 } 488 489 mutex_exit(&pidlock); 490 return (DDI_SUCCESS); 491 } 492 493 static void 494 sysctrl_start_user_threads(void) 495 { 496 kthread_id_t tp; 497 498 mutex_enter(&pidlock); 499 500 /* walk all threads and release them */ 501 for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { 502 proc_t *p = ttoproc(tp); 503 504 /* skip kernel threads */ 505 if (ttoproc(tp)->p_as == &kas) 506 continue; 507 508 mutex_enter(&p->p_lock); 509 tp->t_proc_flag &= ~TP_CHKPT; 510 mutex_exit(&p->p_lock); 511 512 thread_lock(tp); 513 if (CPR_ISTOPPED(tp)) { 514 /* back on the runq */ 515 tp->t_schedflag |= TS_RESUME; 516 setrun_locked(tp); 517 } 518 thread_unlock(tp); 519 } 520 521 mutex_exit(&pidlock); 522 } 523 524 static void 525 sysctrl_signal_user(int sig) 526 { 527 struct proc *p; 528 529 mutex_enter(&pidlock); 530 531 for (p = practive; p != NULL; p = p->p_next) { 532 /* only user threads */ 533 if (p->p_exec == NULL || p->p_stat == SZOMB || 534 p == proc_init || p == ttoproc(curthread)) 535 continue; 536 537 mutex_enter(&p->p_lock); 538 sigtoproc(p, NULL, sig); 539 mutex_exit(&p->p_lock); 540 } 541 542 mutex_exit(&pidlock); 543 544 /* add a bit of delay */ 545 delay(hz); 546 } 547 548 void 549 sysctrl_resume(sysc_cfga_pkt_t *pkt) 550 { 
551 #ifndef Bug_4154263 552 DEBUGP(errp("resume system...\n")); 553 #endif 554 switch (suspend_state) { 555 case SYSC_STATE_FULL: 556 /* 557 * release all the other cpus 558 */ 559 #ifndef Bug_4154263 560 DEBUGP(errp("release cpus...")); 561 #endif 562 /* 563 * Prevent false alarm in tod_validate() due to tod 564 * value change between suspend and resume 565 */ 566 mutex_enter(&tod_lock); 567 tod_status_set(TOD_DR_RESUME_DONE); 568 mutex_exit(&tod_lock); 569 570 sysctrl_release_cpus(); 571 DEBUGP(errp("cpus resumed...\n")); 572 573 /* 574 * If we suspended hw watchdog at suspend, 575 * re-enable it now. 576 */ 577 if (sysc_watchdog_suspended) { 578 mutex_enter(&tod_lock); 579 tod_ops.tod_set_watchdog_timer( 580 watchdog_timeout_seconds); 581 mutex_exit(&tod_lock); 582 } 583 584 /* 585 * resume callout 586 */ 587 (void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME); 588 (void) callb_execute_class(CB_CL_CPR_CALLOUT, 589 CB_CODE_CPR_RESUME); 590 sysctrl_enable_intr(); 591 /* FALLTHROUGH */ 592 593 case SYSC_STATE_DRIVER: 594 /* 595 * resume drivers 596 */ 597 DEBUGP(errp("resume drivers...")); 598 sysctrl_resume_devices(ddi_root_node(), pkt); 599 DEBUGP(errp("done\n")); 600 601 /* 602 * resume the lock manager 603 */ 604 lm_cprresume(); 605 606 /* FALLTHROUGH */ 607 608 case SYSC_STATE_DAEMON: 609 /* 610 * resume kernel daemons 611 */ 612 if (!sysctrl_skip_kernel_threads) { 613 DEBUGP(errp("starting kernel daemons...")); 614 (void) callb_execute_class(CB_CL_CPR_DAEMON, 615 CB_CODE_CPR_RESUME); 616 callb_unlock_table(); 617 } 618 DEBUGP(errp("done\n")); 619 620 /* FALLTHROUGH */ 621 622 case SYSC_STATE_USER: 623 /* 624 * finally, resume user threads 625 */ 626 if (!sysctrl_skip_user_threads) { 627 DEBUGP(errp("starting user threads...")); 628 sysctrl_start_user_threads(); 629 DEBUGP(errp("done\n")); 630 } 631 /* FALLTHROUGH */ 632 633 case SYSC_STATE_BEGIN: 634 default: 635 /* 636 * let those who care know that we've just resumed 637 */ 638 
DEBUGP(errp("sending SIGTHAW...")); 639 sysctrl_signal_user(SIGTHAW); 640 DEBUGP(errp("done\n")); 641 break; 642 } 643 644 /* Restore sysctrl detach/suspend to its original value */ 645 sysctrl_enable_detach_suspend = sysc_lastval; 646 647 DEBUGP(errp("system state restored\n")); 648 } 649 650 void 651 sysctrl_suspend_prepare(void) 652 { 653 /* 654 * We use a function, lm_cprsuspend(), in the suspend flow that 655 * is redirected to a module through the modstubs mechanism. 656 * If the module is currently not loaded, modstubs attempts 657 * the modload. The context this happens in below causes the 658 * module load to block forever, so this function must be called 659 * in the normal system call context ahead of time. 660 */ 661 (void) modload("misc", "klmmod"); 662 } 663 664 int 665 sysctrl_suspend(sysc_cfga_pkt_t *pkt) 666 { 667 int rc = DDI_SUCCESS; 668 669 /* enable sysctrl detach/suspend function */ 670 sysc_lastval = sysctrl_enable_detach_suspend; 671 sysctrl_enable_detach_suspend = 1; 672 673 /* 674 * first, stop all user threads 675 */ 676 DEBUGP(errp("\nstopping user threads...")); 677 suspend_state = SYSC_STATE_USER; 678 if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) && 679 sysctrl_check_user_stop_result) { 680 sysctrl_resume(pkt); 681 return (rc); 682 } 683 DEBUGP(errp("done\n")); 684 685 /* 686 * now stop daemon activities 687 */ 688 DEBUGP(errp("stopping kernel daemons...")); 689 suspend_state = SYSC_STATE_DAEMON; 690 if (rc = sysctrl_stop_kernel_threads(pkt)) { 691 sysctrl_resume(pkt); 692 return (rc); 693 } 694 DEBUGP(errp("done\n")); 695 696 /* 697 * This sync swap out all user pages 698 */ 699 vfs_sync(SYNC_ALL); 700 701 /* 702 * special treatment for lock manager 703 */ 704 lm_cprsuspend(); 705 706 /* 707 * sync the file system in case we never make it back 708 */ 709 sync(); 710 711 /* 712 * now suspend drivers 713 */ 714 DEBUGP(errp("suspending drivers...")); 715 suspend_state = SYSC_STATE_DRIVER; 716 if (rc = 
sysctrl_suspend_devices(ddi_root_node(), pkt)) { 717 sysctrl_resume(pkt); 718 return (rc); 719 } 720 DEBUGP(errp("done\n")); 721 722 /* 723 * handle the callout table 724 */ 725 sysctrl_stop_intr(); 726 727 (void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT); 728 729 /* 730 * if watchdog was activated, disable it 731 */ 732 if (watchdog_activated) { 733 mutex_enter(&tod_lock); 734 tod_ops.tod_clear_watchdog_timer(); 735 mutex_exit(&tod_lock); 736 sysc_watchdog_suspended = 1; 737 } else { 738 sysc_watchdog_suspended = 0; 739 } 740 741 /* 742 * finally, grab all cpus 743 */ 744 DEBUGP(errp("freezing all cpus...\n")); 745 suspend_state = SYSC_STATE_FULL; 746 sysctrl_grab_cpus(); 747 #ifndef Bug_4154263 748 DEBUGP(errp("done\n")); 749 750 DEBUGP(errp("system is quiesced\n")); 751 #endif 752 753 return (rc); 754 } 755