xref: /illumos-gate/usr/src/uts/common/cpr/cpr_uthread.c (revision 16b76d3cb933ff92018a2a75594449010192eacb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/thread.h>
28 #include <sys/conf.h>
29 #include <sys/cpuvar.h>
30 #include <sys/cpr.h>
31 #include <sys/user.h>
32 #include <sys/cmn_err.h>
33 #include <sys/callb.h>
34 
/*
 * Helpers defined outside this file.  This file calls utstop_init() once
 * before the stop attempts begin, add_one_utstop() once for each thread it
 * marks with TP_CHKPT, and utstop_timedwait() after each pass over the
 * thread list.
 */
extern void utstop_init(void);
extern void add_one_utstop(void);
extern void utstop_timedwait(long ticks);

/* Forward declarations of the file-local helpers defined below. */
static void cpr_stop_user(int);
static int cpr_check_user_threads(void);
41 
42 /*
43  * CPR user thread related support routines
44  */
45 void
46 cpr_signal_user(int sig)
47 {
48 /*
49  * The signal SIGTHAW and SIGFREEZE cannot be sent to every thread yet
50  * since openwin is catching every signal and default action is to exit.
51  * We also need to implement the true SIGFREEZE and SIGTHAW to stop threads.
52  */
53 	struct proc *p;
54 
55 	mutex_enter(&pidlock);
56 
57 	for (p = practive; p; p = p->p_next) {
58 		/* only user threads */
59 		if (p->p_exec == NULL || p->p_stat == SZOMB ||
60 		    p == proc_init || p == ttoproc(curthread))
61 			continue;
62 
63 		mutex_enter(&p->p_lock);
64 		sigtoproc(p, NULL, sig);
65 		mutex_exit(&p->p_lock);
66 	}
67 	mutex_exit(&pidlock);
68 
69 	DELAY(MICROSEC);
70 }
71 
/*
 * Base wait time for user thread stop.  cpr_stop_user_threads() multiplies
 * this by the square of the attempt number and hands the result to
 * cpr_stop_user(), which passes it to utstop_timedwait() as its tick count.
 */
#define	CPR_UTSTOP_WAIT		hz
/* maximum number of stop attempts made by cpr_stop_user_threads() */
#define	CPR_UTSTOP_RETRY	4
/*
 * Number of the current stop attempt (1-based).  Written by
 * cpr_stop_user_threads() and read by cpr_check_user_threads(), which only
 * issues its warnings when this equals CPR_UTSTOP_RETRY.
 */
static int count;
76 
77 int
78 cpr_stop_user_threads()
79 {
80 	utstop_init();
81 
82 	count = 0;
83 	do {
84 		if (++count > CPR_UTSTOP_RETRY)
85 			return (ESRCH);
86 		cpr_stop_user(count * count * CPR_UTSTOP_WAIT);
87 	} while (cpr_check_user_threads() &&
88 	    (count < CPR_UTSTOP_RETRY || CPR->c_fcn != AD_CPR_FORCE));
89 
90 	return (0);
91 }
92 
/*
 * This routine tries to stop all user threads before we get rid of all
 * their pages. It goes through the allthreads list, sets the TP_CHKPT flag
 * for all user threads and makes them runnable. If all of the threads
 * can be stopped within the max wait time, CPR will proceed. Otherwise
 * CPR is aborted after a few similar retries.
 *
 * `wait' is handed unchanged to utstop_timedwait() once the pass over the
 * thread list is complete.
 */
static void
cpr_stop_user(int wait)
{
	kthread_id_t tp;
	proc_t *p;

	/* The whole loop below needs to be atomic */
	mutex_enter(&pidlock);

	/*
	 * faster this way
	 *
	 * The walk starts at the thread after curthread and ends when it
	 * wraps back around to curthread.  Note that `continue' inside a
	 * do/while jumps to the loop condition, which is what advances tp.
	 */
	tp = curthread->t_next;
	do {
		/* kernel threads will be handled later */
		p = ttoproc(tp);
		if (p->p_as == &kas || p->p_stat == SZOMB)
			continue;

		/*
		 * If the thread is stopped (by CPR) already, do nothing;
		 * if running, mark TP_CHKPT;
		 * if sleeping normally, mark TP_CHKPT and setrun;
		 * if sleeping non-interruptable, mark TP_CHKPT only for now;
		 * if sleeping with t_wchan0 != 0 etc, virtually stopped,
		 * do nothing.
		 */

		/* p_lock is needed for modifying t_proc_flag */
		mutex_enter(&p->p_lock);
		thread_lock(tp); /* needed to check CPR_ISTOPPED */

		if (tp->t_state == TS_STOPPED) {
			/*
			 * if already stopped by other reasons, add this new
			 * reason to it.
			 *
			 * This is done by clearing TS_RESUME in t_schedflag.
			 */
			if (tp->t_schedflag & TS_RESUME)
				tp->t_schedflag &= ~TS_RESUME;
		} else {

			tp->t_proc_flag |= TP_CHKPT;

			/*
			 * Both locks are released around add_one_utstop()
			 * and then retaken in the same order.
			 * NOTE(review): presumably add_one_utstop() must not
			 * be called with these locks held - confirm.
			 */
			thread_unlock(tp);
			mutex_exit(&p->p_lock);
			add_one_utstop();
			mutex_enter(&p->p_lock);
			thread_lock(tp);

			aston(tp);

			/* get a sleeping/waiting thread running again */
			if (ISWAKEABLE(tp) || ISWAITING(tp)) {
				setrun_locked(tp);
			}
		}
		/*
		 * force the thread into the kernel if it is not already there.
		 * Only needed when it is on a CPU other than this one.
		 */
		if (tp->t_state == TS_ONPROC && tp->t_cpu != CPU)
			poke_cpu(tp->t_cpu->cpu_id);
		thread_unlock(tp);
		mutex_exit(&p->p_lock);

	} while ((tp = tp->t_next) != curthread);
	mutex_exit(&pidlock);

	/* wait for the marked threads, for at most `wait' */
	utstop_timedwait(wait);
}
166 
167 /*
168  * Checks and makes sure all user threads are stopped
169  */
170 static int
171 cpr_check_user_threads()
172 {
173 	kthread_id_t tp;
174 	int rc = 0;
175 
176 	mutex_enter(&pidlock);
177 	tp = curthread->t_next;
178 	do {
179 		if (ttoproc(tp)->p_as == &kas || ttoproc(tp)->p_stat == SZOMB)
180 			continue;
181 
182 		thread_lock(tp);
183 		/*
184 		 * make sure that we are off all the queues and in a stopped
185 		 * state.
186 		 */
187 		if (!CPR_ISTOPPED(tp)) {
188 			thread_unlock(tp);
189 			mutex_exit(&pidlock);
190 
191 			if (count == CPR_UTSTOP_RETRY) {
192 			CPR_DEBUG(CPR_DEBUG1, "Suspend failed: "
193 			    "cannot stop uthread\n");
194 			cpr_err(CE_WARN, "Suspend cannot stop "
195 			    "process %s (%p:%x).",
196 			    ttoproc(tp)->p_user.u_psargs, (void *)tp,
197 			    tp->t_state);
198 			cpr_err(CE_WARN, "Process may be waiting for"
199 			    " network request, please try again.");
200 			}
201 
202 			CPR_DEBUG(CPR_DEBUG2, "cant stop t=%p state=%x pfg=%x "
203 			    "sched=%x\n", (void *)tp, tp->t_state,
204 			    tp->t_proc_flag, tp->t_schedflag);
205 			CPR_DEBUG(CPR_DEBUG2, "proc %p state=%x pid=%d\n",
206 			    (void *)ttoproc(tp), ttoproc(tp)->p_stat,
207 			    ttoproc(tp)->p_pidp->pid_id);
208 			return (1);
209 		}
210 		thread_unlock(tp);
211 
212 	} while ((tp = tp->t_next) != curthread && rc == 0);
213 
214 	mutex_exit(&pidlock);
215 	return (0);
216 }
217 
218 
219 /*
220  * start all threads that were stopped for checkpoint.
221  */
222 void
223 cpr_start_user_threads()
224 {
225 	kthread_id_t tp;
226 	proc_t *p;
227 
228 	mutex_enter(&pidlock);
229 	tp = curthread->t_next;
230 	do {
231 		p = ttoproc(tp);
232 		/*
233 		 * kernel threads are callback'ed rather than setrun.
234 		 */
235 		if (ttoproc(tp)->p_as == &kas) continue;
236 		/*
237 		 * t_proc_flag should have been cleared. Just to make sure here
238 		 */
239 		mutex_enter(&p->p_lock);
240 		tp->t_proc_flag &= ~TP_CHKPT;
241 		mutex_exit(&p->p_lock);
242 
243 		thread_lock(tp);
244 		if (CPR_ISTOPPED(tp)) {
245 
246 			/*
247 			 * put it back on the runq
248 			 */
249 			tp->t_schedflag |= TS_RESUME;
250 			setrun_locked(tp);
251 		}
252 		thread_unlock(tp);
253 		/*
254 		 * DEBUG - Keep track of current and next thread pointer.
255 		 */
256 	} while ((tp = tp->t_next) != curthread);
257 
258 	mutex_exit(&pidlock);
259 }
260 
261 
262 /*
263  * re/start kernel threads
264  */
265 void
266 cpr_start_kernel_threads(void)
267 {
268 	CPR_DEBUG(CPR_DEBUG1, "starting kernel daemons...");
269 	(void) callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_RESUME);
270 	CPR_DEBUG(CPR_DEBUG1, "done\n");
271 
272 	/* see table lock below */
273 	callb_unlock_table();
274 }
275 
276 
277 /*
278  * Stop kernel threads by using the callback mechanism.  If any thread
279  * cannot be stopped, return failure.
280  */
281 int
282 cpr_stop_kernel_threads(void)
283 {
284 	caddr_t	name;
285 
286 	callb_lock_table();	/* Note: we unlock the table in resume. */
287 
288 	CPR_DEBUG(CPR_DEBUG1, "stopping kernel daemons...");
289 	if ((name = callb_execute_class(CB_CL_CPR_DAEMON,
290 	    CB_CODE_CPR_CHKPT)) != (caddr_t)NULL) {
291 		cpr_err(CE_WARN,
292 		    "Could not stop \"%s\" kernel thread.  "
293 		    "Please try again later.", name);
294 		return (EBUSY);
295 	}
296 
297 	CPR_DEBUG(CPR_DEBUG1, ("done\n"));
298 	return (0);
299 }
300 
301 /*
302  * Check to see that kernel threads are stopped.
303  * This should be called while CPU's are paused, and the caller is
304  * effectively running single user, or else we are virtually guaranteed
305  * to fail.  The routine should not ASSERT on the paused state or spl
306  * level, as there may be a use for this to verify that things are running
307  * again.
308  */
309 int
310 cpr_threads_are_stopped(void)
311 {
312 	caddr_t	name;
313 	kthread_id_t tp;
314 	proc_t *p;
315 
316 	/*
317 	 * We think we stopped all the kernel threads.  Just in case
318 	 * someone is not playing by the rules, take a spin through
319 	 * the threadlist and see if we can account for everybody.
320 	 */
321 	mutex_enter(&pidlock);
322 	tp = curthread->t_next;
323 	do {
324 		p = ttoproc(tp);
325 		if (p->p_as != &kas)
326 			continue;
327 
328 		if (tp->t_flag & T_INTR_THREAD)
329 			continue;
330 
331 		if (! callb_is_stopped(tp, &name)) {
332 			mutex_exit(&pidlock);
333 			cpr_err(CE_WARN,
334 			    "\"%s\" kernel thread not stopped.", name);
335 			return (EBUSY);
336 		}
337 	} while ((tp = tp->t_next) != curthread);
338 
339 	mutex_exit(&pidlock);
340 	return (0);
341 }
342