/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * This workaround inhibits prom_printf after the cpus are grabbed. * This can be removed when 4154263 is corrected. */ #define Bug_4154263 /* * A CPR derivative specifically for sunfire */ #include #include #include #include #include #define SUNDDI_IMPL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static enum sysctrl_suspend_state { SYSC_STATE_BEGIN = 0, SYSC_STATE_USER, SYSC_STATE_DAEMON, SYSC_STATE_DRIVER, SYSC_STATE_FULL } suspend_state; static int pstate_save; static uint_t sysctrl_gate[NCPU]; int sysctrl_quiesce_debug = FALSE; static int sysctrl_skip_kernel_threads = TRUE; /* * sysctrl_skip_user_threads is used to control if user threads should * be suspended. If sysctrl_skip_user_threads is true, the rest of the * flags are not used; if it is false, sysctrl_check_user_stop_result * will be used to control whether or not we need to check suspend * result, and sysctrl_allow_blocked_threads will be used to control * whether or not we allow suspend to continue if there are blocked * threads. We allow all combinations of sysctrl_check_user_stop_result * and sysctrl_allow_block_threads, even though it might not make much * sense to not allow block threads when we don't even check stop * result. */ static int sysctrl_skip_user_threads = 0; /* default to FALSE */ static int sysctrl_check_user_stop_result = 1; /* default to TRUE */ static int sysctrl_allow_blocked_threads = 1; /* default to TRUE */ static int sysc_watchdog_suspended; extern int sysctrl_enable_detach_suspend; static int sysc_lastval; #define DEBUGP(p) { if (sysctrl_quiesce_debug) p; } #define errp prom_printf #define SYSC_CPU_LOOP_MSEC 1000 static void sysctrl_grab_cpus(void) { int i; cpuset_t others; extern cpuset_t cpu_ready_set; extern void sysctrl_freeze(void); uint64_t sysc_tick_limit; uint64_t sysc_current_tick; uint64_t sysc_tick_deadline; extern u_longlong_t gettick(void); for (i = 0; i < NCPU; i++) sysctrl_gate[i] = 0; /* tell other cpus to go quiet and wait for continue signal */ others = cpu_ready_set; CPUSET_DEL(others, CPU->cpu_id); xt_some(others, (xcfunc_t *)sysctrl_freeze, (uint64_t)sysctrl_gate, (uint64_t)(&sysctrl_gate[CPU->cpu_id])); sysc_tick_limit = ((uint64_t)sys_tick_freq * SYSC_CPU_LOOP_MSEC) / 1000; /* wait for each cpu to check in */ for (i = 0; i < NCPU; i++) { if (!CPU_IN_SET(others, i)) continue; /* * Get current tick value and calculate the deadline tick */ sysc_current_tick = gettick(); sysc_tick_deadline = sysc_current_tick + sysc_tick_limit; while (sysctrl_gate[i] == 0) { /* If in panic, we just return */ if (panicstr) break; /* Panic the system if cpu not responsed by deadline */ sysc_current_tick = gettick(); if (sysc_current_tick >= sysc_tick_deadline) { cmn_err(CE_PANIC, "sysctrl: cpu %d not " "responding to quiesce command", i); } } } /* now even our interrupts are disabled -- really quiet now */ pstate_save = disable_vec_intr(); } static void sysctrl_release_cpus(void) { /* let the other cpus go */ sysctrl_gate[CPU->cpu_id] = 1; /* restore our interrupts too */ enable_vec_intr(pstate_save); } static void sysctrl_stop_intr(void) { mutex_enter(&cpu_lock); kpreempt_disable(); cyclic_suspend(); } static void sysctrl_enable_intr(void) { cyclic_resume(); (void) spl0(); kpreempt_enable(); mutex_exit(&cpu_lock); } static int sysctrl_is_real_device(dev_info_t *dip) { struct regspec *regbuf; int length; int rc; if (ddi_get_driver(dip) == NULL) return (FALSE); if (DEVI(dip)->devi_pm_flags & (PMC_NEEDS_SR|PMC_PARENTAL_SR)) return (TRUE); if (DEVI(dip)->devi_pm_flags & PMC_NO_SR) return (FALSE); /* * now the general case */ rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", (caddr_t)®buf, &length); ASSERT(rc != DDI_PROP_NO_MEMORY); if (rc != DDI_PROP_SUCCESS) { return (FALSE); } else { kmem_free(regbuf, length); return (TRUE); } } static dev_info_t *failed_driver; static char device_path[MAXPATHLEN]; static int sysctrl_suspend_devices(dev_info_t *dip, sysc_cfga_pkt_t *pkt) { int circ; ASSERT(dip == NULL || ddi_get_parent(dip) == NULL || DEVI_BUSY_OWNED(ddi_get_parent(dip))); failed_driver = NULL; for (; dip != NULL; dip = ddi_get_next_sibling(dip)) { /* * Hold parent busy while walking child list */ ndi_devi_enter(dip, &circ); if (sysctrl_suspend_devices(ddi_get_child(dip), pkt)) { ndi_devi_exit(dip, circ); return (ENXIO); } ndi_devi_exit(dip, circ); if (!sysctrl_is_real_device(dip)) continue; /* * Safe to call ddi_pathname() as parent is held busy */ (void) ddi_pathname(dip, device_path); DEBUGP(errp(" suspending device %s\n", device_path)); if (devi_detach(dip, DDI_SUSPEND) != DDI_SUCCESS) { DEBUGP(errp(" unable to suspend device %s\n", device_path)); (void) strncpy(pkt->errbuf, device_path, SYSC_OUTPUT_LEN); SYSC_ERR_SET(pkt, SYSC_ERR_SUSPEND); ndi_hold_devi(dip); failed_driver = dip; return (ENXIO); } } return (DDI_SUCCESS); } static void sysctrl_resume_devices(dev_info_t *start, sysc_cfga_pkt_t *pkt) { int circ; dev_info_t *dip, *next, *last = NULL; ASSERT(start == NULL || ddi_get_parent(start) == NULL || DEVI_BUSY_OWNED(ddi_get_parent(start))); /* attach in reverse device tree order */ while (last != start) { dip = start; next = ddi_get_next_sibling(dip); while (next != last && dip != failed_driver) { dip = next; next = ddi_get_next_sibling(dip); } if (dip == failed_driver) { failed_driver = NULL; ndi_rele_devi(dip); } else if (sysctrl_is_real_device(dip) && failed_driver == NULL) { /* * Parent dip is held busy, so ddi_pathname() can * be safely called. */ (void) ddi_pathname(dip, device_path); DEBUGP(errp(" resuming device %s\n", device_path)); if (devi_attach(dip, DDI_RESUME) != DDI_SUCCESS) { /* * XXX - if in the future we decide not to * panic the system, we need to set the error * SYSC_ERR_RESUME here and also change the * cfgadm platform library. */ cmn_err(CE_PANIC, "Unable to resume device %s", device_path); } } ndi_devi_enter(dip, &circ); sysctrl_resume_devices(ddi_get_child(dip), pkt); ndi_devi_exit(dip, circ); last = dip; } } /* * True if thread is virtually stopped. Similar to CPR_VSTOPPED * but from DR point of view. These user threads are waiting in * the kernel. Once they complete in the kernel, they will process * the stop signal and stop. */ #define SYSCTRL_VSTOPPED(t) \ ((t)->t_state == TS_SLEEP && \ (t)->t_wchan != NULL && \ (t)->t_astflag && \ ((t)->t_proc_flag & TP_CHKPT)) static int sysctrl_stop_user_threads(sysc_cfga_pkt_t *pkt) { int count; char cache_psargs[PSARGSZ]; kthread_id_t cache_tp; uint_t cache_t_state; int bailout; pid_t pid; extern void add_one_utstop(); extern void utstop_timedwait(clock_t); extern void utstop_init(void); #define SYSCTRL_UTSTOP_RETRY 4 #define SYSCTRL_UTSTOP_WAIT hz if (sysctrl_skip_user_threads) return (DDI_SUCCESS); utstop_init(); /* we need to try a few times to get past fork, etc. */ for (count = 0; count < SYSCTRL_UTSTOP_RETRY; count++) { kthread_id_t tp; /* walk the entire threadlist */ mutex_enter(&pidlock); for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { proc_t *p = ttoproc(tp); /* handle kernel threads separately */ if (p->p_as == &kas || p->p_stat == SZOMB) continue; mutex_enter(&p->p_lock); thread_lock(tp); if (tp->t_state == TS_STOPPED) { /* add another reason to stop this thread */ tp->t_schedflag &= ~TS_RESUME; } else { tp->t_proc_flag |= TP_CHKPT; thread_unlock(tp); mutex_exit(&p->p_lock); add_one_utstop(); mutex_enter(&p->p_lock); thread_lock(tp); aston(tp); if (tp->t_state == TS_SLEEP && (tp->t_flag & T_WAKEABLE)) { setrun_locked(tp); } } /* grab thread if needed */ if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU) poke_cpu(tp->t_cpu->cpu_id); thread_unlock(tp); mutex_exit(&p->p_lock); } mutex_exit(&pidlock); /* let everything catch up */ utstop_timedwait(count * count * SYSCTRL_UTSTOP_WAIT); /* now, walk the threadlist again to see if we are done */ mutex_enter(&pidlock); for (tp = curthread->t_next, bailout = 0; bailout == 0 && tp != curthread; tp = tp->t_next) { proc_t *p = ttoproc(tp); /* handle kernel threads separately */ if (p->p_as == &kas || p->p_stat == SZOMB) continue; /* * If this thread didn't stop, and we don't allow * unstopped blocked threads, bail. */ /* did this thread stop? */ thread_lock(tp); if (!CPR_ISTOPPED(tp) && !(sysctrl_allow_blocked_threads && SYSCTRL_VSTOPPED(tp))) { /* nope, cache the details for later */ bcopy(p->p_user.u_psargs, cache_psargs, sizeof (cache_psargs)); cache_tp = tp; cache_t_state = tp->t_state; bailout = 1; pid = p->p_pidp->pid_id; } thread_unlock(tp); } mutex_exit(&pidlock); /* were all the threads stopped? */ if (!bailout) break; } /* were we unable to stop all threads after a few tries? */ if (bailout) { (void) sprintf(pkt->errbuf, "process: %s id: %d state: %x" " thread descriptor: %p", cache_psargs, (int)pid, cache_t_state, (void *)cache_tp); SYSC_ERR_SET(pkt, SYSC_ERR_UTHREAD); return (ESRCH); } return (DDI_SUCCESS); } static int sysctrl_stop_kernel_threads(sysc_cfga_pkt_t *pkt) { caddr_t name; kthread_id_t tp; if (sysctrl_skip_kernel_threads) { return (DDI_SUCCESS); } /* * Note: we unlock the table in resume. * We only need to lock the callback table if we are actually * suspending kernel threads. */ callb_lock_table(); if ((name = callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) { (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); return (EBUSY); } /* * Verify that all threads are accounted for */ mutex_enter(&pidlock); for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { proc_t *p = ttoproc(tp); if (p->p_as != &kas) continue; if (tp->t_flag & T_INTR_THREAD) continue; if (!callb_is_stopped(tp, &name)) { mutex_exit(&pidlock); (void) strncpy(pkt->errbuf, name, SYSC_OUTPUT_LEN); SYSC_ERR_SET(pkt, SYSC_ERR_KTHREAD); return (EBUSY); } } mutex_exit(&pidlock); return (DDI_SUCCESS); } static void sysctrl_start_user_threads(void) { kthread_id_t tp; mutex_enter(&pidlock); /* walk all threads and release them */ for (tp = curthread->t_next; tp != curthread; tp = tp->t_next) { proc_t *p = ttoproc(tp); /* skip kernel threads */ if (ttoproc(tp)->p_as == &kas) continue; mutex_enter(&p->p_lock); tp->t_proc_flag &= ~TP_CHKPT; mutex_exit(&p->p_lock); thread_lock(tp); if (CPR_ISTOPPED(tp)) { /* back on the runq */ tp->t_schedflag |= TS_RESUME; setrun_locked(tp); } thread_unlock(tp); } mutex_exit(&pidlock); } static void sysctrl_signal_user(int sig) { struct proc *p; mutex_enter(&pidlock); for (p = practive; p != NULL; p = p->p_next) { /* only user threads */ if (p->p_exec == NULL || p->p_stat == SZOMB || p == proc_init || p == ttoproc(curthread)) continue; mutex_enter(&p->p_lock); sigtoproc(p, NULL, sig); mutex_exit(&p->p_lock); } mutex_exit(&pidlock); /* add a bit of delay */ delay(hz); } void sysctrl_resume(sysc_cfga_pkt_t *pkt) { #ifndef Bug_4154263 DEBUGP(errp("resume system...\n")); #endif switch (suspend_state) { case SYSC_STATE_FULL: /* * release all the other cpus */ #ifndef Bug_4154263 DEBUGP(errp("release cpus...")); #endif /* * Prevent false alarm in tod_validate() due to tod * value change between suspend and resume */ mutex_enter(&tod_lock); tod_fault_reset(); mutex_exit(&tod_lock); sysctrl_release_cpus(); DEBUGP(errp("cpus resumed...\n")); /* * If we suspended hw watchdog at suspend, * re-enable it now. */ if (sysc_watchdog_suspended) { mutex_enter(&tod_lock); tod_ops.tod_set_watchdog_timer( watchdog_timeout_seconds); mutex_exit(&tod_lock); } /* * resume callout */ (void) callb_execute_class(CB_CL_CPR_RPC, CB_CODE_CPR_RESUME); (void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_RESUME); sysctrl_enable_intr(); /* FALLTHROUGH */ case SYSC_STATE_DRIVER: /* * resume drivers */ DEBUGP(errp("resume drivers...")); sysctrl_resume_devices(ddi_root_node(), pkt); DEBUGP(errp("done\n")); /* * resume the lock manager */ lm_cprresume(); /* FALLTHROUGH */ case SYSC_STATE_DAEMON: /* * resume kernel daemons */ if (!sysctrl_skip_kernel_threads) { DEBUGP(errp("starting kernel daemons...")); (void) callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_RESUME); callb_unlock_table(); } DEBUGP(errp("done\n")); /* FALLTHROUGH */ case SYSC_STATE_USER: /* * finally, resume user threads */ if (!sysctrl_skip_user_threads) { DEBUGP(errp("starting user threads...")); sysctrl_start_user_threads(); DEBUGP(errp("done\n")); } /* FALLTHROUGH */ case SYSC_STATE_BEGIN: default: /* * let those who care know that we've just resumed */ DEBUGP(errp("sending SIGTHAW...")); sysctrl_signal_user(SIGTHAW); DEBUGP(errp("done\n")); break; } /* Restore sysctrl detach/suspend to its original value */ sysctrl_enable_detach_suspend = sysc_lastval; DEBUGP(errp("system state restored\n")); } void sysctrl_suspend_prepare(void) { /* * We use a function, lm_cprsuspend(), in the suspend flow that * is redirected to a module through the modstubs mechanism. * If the module is currently not loaded, modstubs attempts * the modload. The context this happens in below causes the * module load to block forever, so this function must be called * in the normal system call context ahead of time. */ (void) modload("misc", "klmmod"); } int sysctrl_suspend(sysc_cfga_pkt_t *pkt) { int rc = DDI_SUCCESS; /* enable sysctrl detach/suspend function */ sysc_lastval = sysctrl_enable_detach_suspend; sysctrl_enable_detach_suspend = 1; /* * first, stop all user threads */ DEBUGP(errp("\nstopping user threads...")); suspend_state = SYSC_STATE_USER; if (((rc = sysctrl_stop_user_threads(pkt)) != DDI_SUCCESS) && sysctrl_check_user_stop_result) { sysctrl_resume(pkt); return (rc); } DEBUGP(errp("done\n")); /* * now stop daemon activities */ DEBUGP(errp("stopping kernel daemons...")); suspend_state = SYSC_STATE_DAEMON; if (rc = sysctrl_stop_kernel_threads(pkt)) { sysctrl_resume(pkt); return (rc); } DEBUGP(errp("done\n")); /* * This sync swap out all user pages */ vfs_sync(SYNC_ALL); /* * special treatment for lock manager */ lm_cprsuspend(); /* * sync the file system in case we never make it back */ sync(); /* * now suspend drivers */ DEBUGP(errp("suspending drivers...")); suspend_state = SYSC_STATE_DRIVER; if (rc = sysctrl_suspend_devices(ddi_root_node(), pkt)) { sysctrl_resume(pkt); return (rc); } DEBUGP(errp("done\n")); /* * handle the callout table */ sysctrl_stop_intr(); (void) callb_execute_class(CB_CL_CPR_CALLOUT, CB_CODE_CPR_CHKPT); /* * if watchdog was activated, disable it */ if (watchdog_activated) { mutex_enter(&tod_lock); tod_ops.tod_clear_watchdog_timer(); mutex_exit(&tod_lock); sysc_watchdog_suspended = 1; } else { sysc_watchdog_suspended = 0; } /* * finally, grab all cpus */ DEBUGP(errp("freezing all cpus...\n")); suspend_state = SYSC_STATE_FULL; sysctrl_grab_cpus(); #ifndef Bug_4154263 DEBUGP(errp("done\n")); DEBUGP(errp("system is quiesced\n")); #endif return (rc); }