xref: /illumos-gate/usr/src/uts/common/cpr/cpr_uthread.c (revision 2aeafac3612e19716bf8164f89c3c9196342979c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/thread.h>
30 #include <sys/conf.h>
31 #include <sys/cpuvar.h>
32 #include <sys/cpr.h>
33 #include <sys/user.h>
34 #include <sys/cmn_err.h>
35 #include <sys/callb.h>
36 
37 extern void utstop_init(void);
38 extern void add_one_utstop(void);
39 extern void utstop_timedwait(long ticks);
40 
41 static void cpr_stop_user(int);
42 static int cpr_check_user_threads(void);
43 
44 /*
45  * CPR user thread related support routines
46  */
47 void
48 cpr_signal_user(int sig)
49 {
50 /*
51  * The signal SIGTHAW and SIGFREEZE cannot be sent to every thread yet
52  * since openwin is catching every signal and default action is to exit.
53  * We also need to implement the true SIGFREEZE and SIGTHAW to stop threads.
54  */
55 	struct proc *p;
56 
57 	mutex_enter(&pidlock);
58 
59 	for (p = practive; p; p = p->p_next) {
60 		/* only user threads */
61 		if (p->p_exec == NULL || p->p_stat == SZOMB ||
62 		    p == proc_init || p == ttoproc(curthread))
63 			continue;
64 
65 		mutex_enter(&p->p_lock);
66 		sigtoproc(p, NULL, sig);
67 		mutex_exit(&p->p_lock);
68 	}
69 	mutex_exit(&pidlock);
70 
71 	DELAY(MICROSEC);
72 }
73 
74 /* max wait time for user thread stop */
75 #define	CPR_UTSTOP_WAIT		hz
76 #define	CPR_UTSTOP_RETRY	4
77 static int count;
78 
79 int
80 cpr_stop_user_threads()
81 {
82 	utstop_init();
83 
84 	count = 0;
85 	do {
86 		if (++count > CPR_UTSTOP_RETRY)
87 			return (ESRCH);
88 		cpr_stop_user(count * count * CPR_UTSTOP_WAIT);
89 	} while (cpr_check_user_threads() &&
90 	    (count < CPR_UTSTOP_RETRY || CPR->c_fcn != AD_CPR_FORCE));
91 
92 	return (0);
93 }
94 
95 /*
96  * This routine tries to stop all user threads before we get rid of all
97  * its pages.It goes through allthreads list and set the TP_CHKPT flag
98  * for all user threads and make them runnable. If all of the threads
99  * can be stopped within the max wait time, CPR will proceed. Otherwise
100  * CPR is aborted after a few of similiar retries.
101  */
102 static void
103 cpr_stop_user(int wait)
104 {
105 	kthread_id_t tp;
106 	proc_t *p;
107 
108 	/* The whole loop below needs to be atomic */
109 	mutex_enter(&pidlock);
110 
111 	/* faster this way */
112 	tp = curthread->t_next;
113 	do {
114 		/* kernel threads will be handled later */
115 		p = ttoproc(tp);
116 		if (p->p_as == &kas || p->p_stat == SZOMB)
117 			continue;
118 
119 		/*
120 		 * If the thread is stopped (by CPR) already, do nothing;
121 		 * if running, mark TP_CHKPT;
122 		 * if sleeping normally, mark TP_CHKPT and setrun;
123 		 * if sleeping non-interruptable, mark TP_CHKPT only for now;
124 		 * if sleeping with t_wchan0 != 0 etc, virtually stopped,
125 		 * do nothing.
126 		 */
127 
128 		/* p_lock is needed for modifying t_proc_flag */
129 		mutex_enter(&p->p_lock);
130 		thread_lock(tp); /* needed to check CPR_ISTOPPED */
131 
132 		if (tp->t_state == TS_STOPPED) {
133 			/*
134 			 * if already stopped by other reasons, add this new
135 			 * reason to it.
136 			 */
137 			if (tp->t_schedflag & TS_RESUME)
138 				tp->t_schedflag &= ~TS_RESUME;
139 		} else {
140 
141 			tp->t_proc_flag |= TP_CHKPT;
142 
143 			thread_unlock(tp);
144 			mutex_exit(&p->p_lock);
145 			add_one_utstop();
146 			mutex_enter(&p->p_lock);
147 			thread_lock(tp);
148 
149 			aston(tp);
150 
151 			if (ISWAKEABLE(tp) || ISWAITING(tp)) {
152 				setrun_locked(tp);
153 			}
154 		}
155 		/*
156 		 * force the thread into the kernel if it is not already there.
157 		 */
158 		if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
159 			poke_cpu(tp->t_cpu->cpu_id);
160 		thread_unlock(tp);
161 		mutex_exit(&p->p_lock);
162 
163 	} while ((tp = tp->t_next) != curthread);
164 	mutex_exit(&pidlock);
165 
166 	utstop_timedwait(wait);
167 }
168 
169 /*
170  * Checks and makes sure all user threads are stopped
171  */
172 static int
173 cpr_check_user_threads()
174 {
175 	kthread_id_t tp;
176 	int rc = 0;
177 
178 	mutex_enter(&pidlock);
179 	tp = curthread->t_next;
180 	do {
181 		if (ttoproc(tp)->p_as == &kas || ttoproc(tp)->p_stat == SZOMB)
182 			continue;
183 
184 		thread_lock(tp);
185 		/*
186 		 * make sure that we are off all the queues and in a stopped
187 		 * state.
188 		 */
189 		if (!CPR_ISTOPPED(tp)) {
190 			thread_unlock(tp);
191 			mutex_exit(&pidlock);
192 
193 			if (count == CPR_UTSTOP_RETRY) {
194 			CPR_DEBUG(CPR_DEBUG1, "Suspend failed: "
195 			    "cannot stop uthread\n");
196 			cpr_err(CE_WARN, "Suspend cannot stop "
197 			    "process %s (%p:%x).",
198 			    ttoproc(tp)->p_user.u_psargs, (void *)tp,
199 			    tp->t_state);
200 			cpr_err(CE_WARN, "Process may be waiting for"
201 			    " network request, please try again.");
202 			}
203 
204 			CPR_DEBUG(CPR_DEBUG2, "cant stop t=%p state=%x pfg=%x "
205 			    "sched=%x\n", (void *)tp, tp->t_state,
206 			    tp->t_proc_flag, tp->t_schedflag);
207 			CPR_DEBUG(CPR_DEBUG2, "proc %p state=%x pid=%d\n",
208 			    (void *)ttoproc(tp), ttoproc(tp)->p_stat,
209 			    ttoproc(tp)->p_pidp->pid_id);
210 			return (1);
211 		}
212 		thread_unlock(tp);
213 
214 	} while ((tp = tp->t_next) != curthread && rc == 0);
215 
216 	mutex_exit(&pidlock);
217 	return (0);
218 }
219 
220 
221 /*
222  * start all threads that were stopped for checkpoint.
223  */
224 void
225 cpr_start_user_threads()
226 {
227 	kthread_id_t tp;
228 	proc_t *p;
229 
230 	mutex_enter(&pidlock);
231 	tp = curthread->t_next;
232 	do {
233 		p = ttoproc(tp);
234 		/*
235 		 * kernel threads are callback'ed rather than setrun.
236 		 */
237 		if (ttoproc(tp)->p_as == &kas) continue;
238 		/*
239 		 * t_proc_flag should have been cleared. Just to make sure here
240 		 */
241 		mutex_enter(&p->p_lock);
242 		tp->t_proc_flag &= ~TP_CHKPT;
243 		mutex_exit(&p->p_lock);
244 
245 		thread_lock(tp);
246 		if (CPR_ISTOPPED(tp)) {
247 
248 			/*
249 			 * put it back on the runq
250 			 */
251 			tp->t_schedflag |= TS_RESUME;
252 			setrun_locked(tp);
253 		}
254 		thread_unlock(tp);
255 		/*
256 		 * DEBUG - Keep track of current and next thread pointer.
257 		 */
258 	} while ((tp = tp->t_next) != curthread);
259 
260 	mutex_exit(&pidlock);
261 }
262 
263 
264 /*
265  * re/start kernel threads
266  */
267 void
268 cpr_start_kernel_threads(void)
269 {
270 	CPR_DEBUG(CPR_DEBUG1, "starting kernel daemons...");
271 	(void) callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_RESUME);
272 	CPR_DEBUG(CPR_DEBUG1, "done\n");
273 
274 	/* see table lock below */
275 	callb_unlock_table();
276 }
277 
278 
279 /*
280  * Stop kernel threads by using the callback mechanism.  If any thread
281  * cannot be stopped, return failure.
282  */
283 int
284 cpr_stop_kernel_threads(void)
285 {
286 	caddr_t	name;
287 
288 	callb_lock_table();	/* Note: we unlock the table in resume. */
289 
290 	CPR_DEBUG(CPR_DEBUG1, "stopping kernel daemons...");
291 	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
292 	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {
293 		cpr_err(CE_WARN,
294 		    "Could not stop \"%s\" kernel thread.  "
295 		    "Please try again later.", name);
296 		return (EBUSY);
297 	}
298 
299 	CPR_DEBUG(CPR_DEBUG1, ("done\n"));
300 	return (0);
301 }
302 
303 /*
304  * Check to see that kernel threads are stopped.
305  * This should be called while CPU's are paused, and the caller is
306  * effectively running single user, or else we are virtually guaranteed
307  * to fail.  The routine should not ASSERT on the paused state or spl
308  * level, as there may be a use for this to verify that things are running
309  * again.
310  */
311 int
312 cpr_threads_are_stopped(void)
313 {
314 	caddr_t	name;
315 	kthread_id_t tp;
316 	proc_t *p;
317 
318 	/*
319 	 * We think we stopped all the kernel threads.  Just in case
320 	 * someone is not playing by the rules, take a spin through
321 	 * the threadlist and see if we can account for everybody.
322 	 */
323 	mutex_enter(&pidlock);
324 	tp = curthread->t_next;
325 	do {
326 		p = ttoproc(tp);
327 		if (p->p_as != &kas)
328 			continue;
329 
330 		if (tp->t_flag & T_INTR_THREAD)
331 			continue;
332 
333 		if (! callb_is_stopped(tp, &name)) {
334 			mutex_exit(&pidlock);
335 			cpr_err(CE_WARN,
336 			    "\"%s\" kernel thread not stopped.", name);
337 			return (EBUSY);
338 		}
339 	} while ((tp = tp->t_next) != curthread);
340 
341 	mutex_exit(&pidlock);
342 	return (0);
343 }
344