xref: /illumos-gate/usr/src/cmd/mdb/common/kmdb/kaif_start.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * The main CPU-control loops, used to control masters and slaves.
30  */
31 
32 #include <sys/types.h>
33 
34 #include <kmdb/kaif.h>
35 #include <kmdb/kaif_start.h>
36 #include <kmdb/kmdb_asmutil.h>
37 #include <kmdb/kmdb_dpi_impl.h>
38 #include <kmdb/kmdb_kdi.h>
39 
/*
 * Commands stored in kaif_slave_cmd by the master and polled by every slave
 * in kaif_slave_loop().
 */
#define	KAIF_SLAVE_CMD_SPIN	0	/* nothing to do; keep waiting */
#define	KAIF_SLAVE_CMD_SWITCH	1	/* kaif_slave_tgt becomes the master */
#define	KAIF_SLAVE_CMD_RESUME	2	/* leave the slave loop and resume */
#define	KAIF_SLAVE_CMD_FLUSH	3	/* flush caches, set krs_cpu_flushed */
#define	KAIF_SLAVE_CMD_REBOOT	4	/* CPU 0 (only) initiates the reboot */
#if defined(__sparc)
#define	KAIF_SLAVE_CMD_ACK	5	/* set krs_cpu_acked (slave barrier) */
#endif
48 
49 
/*
 * Used to synchronize attempts to set kaif_master_cpuid.  kaif_master_cpuid may
 * be read without kaif_master_lock, and may be written by the current master
 * CPU.  It holds KAIF_MASTER_CPUID_UNSET while no master has been selected.
 */
int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
static uintptr_t kaif_master_lock = 0;

/*
 * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must
 * be held to write kaif_looping, but need not be held to read it.
 * kaif_main_loop() increments kaif_looping before a CPU enters its master or
 * slave loop and decrements it once the CPU has left that loop.
 */
static volatile uint_t kaif_looping;
static uintptr_t kaif_loop_lock;

/*
 * Command mailbox from the master to the slaves.  kaif_slave_cmd holds one of
 * the KAIF_SLAVE_CMD_* values and is polled by kaif_slave_loop().
 */
static volatile int kaif_slave_cmd;
static volatile int kaif_slave_tgt;	/* target cpuid for CMD_SWITCH */
67 
/*
 * Acquire a simple spin lock.  Busy-waits, retrying cas(lock, 0, 1) until it
 * returns 0, and then issues membar_producer() before returning.
 *
 * NOTE(review): cas() presumably returns the previous contents of *lock, so a
 * return of 0 would mean the lock was free and now belongs to the caller --
 * confirm against its declaration.
 */
static void
kaif_lock_enter(uintptr_t *lock)
{
	for (;;) {
		if (cas(lock, 0, 1) == 0)
			break;
	}

	membar_producer();
}
75 
/*
 * Release a spin lock previously taken with kaif_lock_enter().
 *
 * A store barrier is issued before the lock word is cleared so that stores
 * made while the lock was held are ordered ahead of the store that releases
 * it; otherwise another CPU could take the lock and still observe stale
 * protected data.  The barrier that follows the release store is kept from
 * the earlier form of this routine.
 */
static void
kaif_lock_exit(uintptr_t *lock)
{
	membar_producer();
	*lock = 0;
	membar_producer();
}
82 
/*
 * Post `cmd' (one of the KAIF_SLAVE_CMD_* values) for the slaves and then
 * call kmdb_kdi_start_slaves().  The command is stored first so that it is
 * already in place when the slaves are started.
 */
static void
kaif_start_slaves(int cmd)
{
	kaif_slave_cmd = cmd;
	kmdb_kdi_start_slaves();
}
89 
/*
 * Command loop run by the CPU that is currently the master.
 *
 * After installing the debugger's trap table and (re-)stopping the slaves,
 * the master repeatedly calls kmdb_dpi_reenter() and acts on the command it
 * returns.  The loop only exits via one of the return statements below:
 *
 *   KAIF_CPU_CMD_SWITCH		mastership was handed to another CPU
 *   KAIF_CPU_CMD_RESUME		all CPUs are being released
 *   KAIF_CPU_CMD_RESUME_MASTER	single-CPU resume of the master
 *
 * A cache-flush request, a switch to the CPU that is already master, and any
 * command not listed in the switch all fall through to `goto master_loop'.
 * On x86, a reboot request never returns.
 */
static int
kaif_master_loop(kaif_cpusave_t *cpusave)
{
	int notflushed, i;

#if defined(__sparc)
	kaif_prom_rearm();
#endif
	kaif_trap_set_debugger();

	/*
	 * If we re-entered due to a ::switch, we need to tell the slave CPUs
	 * to sleep again.
	 */
	kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);

master_loop:
	switch (kmdb_dpi_reenter()) {
	case KMDB_DPI_CMD_SWITCH_CPU:
		/*
		 * We assume that the target CPU is a valid slave.  There's no
		 * easy way to complain here, so we'll assume that the caller
		 * has done the proper checking.
		 */
		if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
			break;

		/*
		 * Publish the new master and demote ourselves before the
		 * slaves are told about the switch.
		 */
		kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
		membar_producer();

		/*
		 * Switch back to the saved trap table before we switch CPUs --
		 * we need to make sure that only one CPU is on the debugger's
		 * table at a time.
		 */
		kaif_trap_set_saved(cpusave);

		kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);

		/* The new master is now awake */
		return (KAIF_CPU_CMD_SWITCH);

	case KMDB_DPI_CMD_RESUME_ALL:
	case KMDB_DPI_CMD_RESUME_UNLOAD:
		/*
		 * Resume everyone, clean up for next entry.
		 */
		kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
		membar_producer();
		kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);

		if (kmdb_dpi_work_required())
			kmdb_dpi_wrintr_fire();

		kaif_trap_set_saved(cpusave);

		return (KAIF_CPU_CMD_RESUME);

	case KMDB_DPI_CMD_RESUME_MASTER:
		/*
		 * Single-CPU resume, which is performed on the debugger's
		 * trap table (so no need to switch back).
		 */
		return (KAIF_CPU_CMD_RESUME_MASTER);

	case KMDB_DPI_CMD_FLUSH_CACHES:
		kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);

		/*
		 * Wait for the other cpus to finish flushing their caches.
		 * Each pass stops at the first slave that has not yet set
		 * krs_cpu_flushed, so notflushed is only ever 0 or 1.
		 *
		 * NOTE(review): in the code visible here krs_cpu_flushed is
		 * cleared only on entry to kaif_main_loop(), so a second
		 * flush request during the same debugger entry would appear
		 * to find the flags already set -- confirm whether that is
		 * intended.
		 */
		do {
			notflushed = 0;
			for (i = 0; i < kaif_ncpusave; i++) {
				kaif_cpusave_t *save = &kaif_cpusave[i];

				if (save->krs_cpu_state ==
				    KAIF_CPU_STATE_SLAVE &&
				    !save->krs_cpu_flushed) {
					notflushed++;
					break;
				}
			}
		} while (notflushed > 0);

		/* Flushing is done; put the slaves back into their idle spin */
		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
		break;

#if defined(__i386) || defined(__amd64)
	case KMDB_DPI_CMD_REBOOT:
		/*
		 * Reboot must be initiated by CPU 0.  I could ask why, but I'm
		 * afraid that I don't want to know the answer.
		 */
		if (cpusave->krs_cpu_id == 0)
			kmdb_kdi_reboot();

		kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);

		/*
		 * Spin forever, waiting for CPU 0 (apparently a slave) to
		 * reboot the system.
		 */
		for (;;)
			continue;

		/*NOTREACHED*/
		break;
#endif
	}

	goto master_loop;
}
204 
/*
 * Loop run by every CPU that is not the master.  The slave polls
 * kaif_slave_cmd, calling kmdb_kdi_slave_wait() between polls, until a
 * command tells it to leave.
 *
 * Returns KAIF_CPU_CMD_SWITCH when this CPU has been selected as the new
 * master, KAIF_CPU_CMD_RESUME when the slaves are being released, or (x86
 * only) 0 if CPU 0 comes back from kmdb_kdi_reboot().
 */
static int
kaif_slave_loop(kaif_cpusave_t *cpusave)
{
	int slavecmd, rv;

#if defined(__sparc)
	/*
	 * If the user elects to drop to OBP from the debugger, some OBP
	 * implementations will cross-call the slaves.  We have to turn
	 * IE back on so we can receive the cross-calls.  If we don't,
	 * some OBP implementations will wait forever.
	 */
	interrupts_on();
#endif

	/* Wait for duty to call */
	for (;;) {
		/* Sample the command once so every test below sees one value */
		slavecmd = kaif_slave_cmd;

		if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
		    kaif_slave_tgt == cpusave->krs_cpu_id) {
			/*
			 * We are the switch target: consume the command by
			 * resetting it to SPIN and promote ourselves.
			 */
			kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
			cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
			rv = KAIF_CPU_CMD_SWITCH;
			break;

		} else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
			/*
			 * The `continue' skips kmdb_kdi_slave_wait(), so the
			 * slave goes straight back to polling and will flush
			 * again for as long as the command remains FLUSH.
			 */
			kmdb_kdi_flush_caches();
			cpusave->krs_cpu_flushed = 1;
			continue;

#if defined(__i386) || defined(__amd64)
		} else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
		    cpusave->krs_cpu_id == 0) {
			/* Only CPU 0 acts on a reboot; other slaves keep waiting */
			rv = 0;
			kmdb_kdi_reboot();
			break;
#endif

		} else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
			rv = KAIF_CPU_CMD_RESUME;
			break;
#if defined(__sparc)
		} else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
			/* Tell kaif_slave_loop_barrier() we are in the loop */
			cpusave->krs_cpu_acked = 1;
		} else if (cpusave->krs_cpu_acked &&
			slavecmd == KAIF_SLAVE_CMD_SPIN) {
			/* Command is back to SPIN; clear the acknowledgement */
			cpusave->krs_cpu_acked = 0;
#endif
		}

		kmdb_kdi_slave_wait();
	}

#if defined(__sparc)
	interrupts_off();
#endif

	return (rv);
}
265 
266 static void
267 kaif_select_master(kaif_cpusave_t *cpusave)
268 {
269 	kaif_lock_enter(&kaif_master_lock);
270 
271 	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
272 		/* This is the master. */
273 		kaif_master_cpuid = cpusave->krs_cpu_id;
274 		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
275 		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
276 
277 		membar_producer();
278 
279 		kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
280 	} else {
281 		/* The master was already chosen - go be a slave */
282 		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
283 		membar_producer();
284 	}
285 
286 	kaif_lock_exit(&kaif_master_lock);
287 }
288 
/*
 * Per-CPU entry point into the debugger's CPU-control loops.
 *
 * Determines whether the calling CPU is the master or a slave (selecting a
 * master if none is recorded), then runs kaif_master_loop() or
 * kaif_slave_loop() until one of them returns something other than
 * KAIF_CPU_CMD_SWITCH.  That final command is returned to the caller.  For
 * KAIF_CPU_CMD_RESUME, the CPU first waits until every CPU has left its
 * loop.
 */
int
kaif_main_loop(kaif_cpusave_t *cpusave)
{
	int cmd;

	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
		if (!kmdb_dpi_resume_requested &&
		    kmdb_kdi_get_unload_request()) {
			/*
			 * Special case: Unload requested before first debugger
			 * entry.  Don't stop the world, as there's nothing to
			 * clean up that can't be handled by the running kernel.
			 */
			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
			return (KAIF_CPU_CMD_RESUME);
		}

		kaif_select_master(cpusave);

#ifdef __sparc
		if (kaif_master_cpuid == cpusave->krs_cpu_id) {
			/*
			 * Everyone has arrived, so we can disarm the post-PROM
			 * entry point.
			 */
			*kaif_promexitarmp = 0;
			membar_producer();
		}
#endif
	} else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
		/* A master is already recorded, and it is this CPU */
		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
	} else {
		/* A master is already recorded, and it is another CPU */
		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
	}

	/* Start each entry with the cache-flush acknowledgement cleared */
	cpusave->krs_cpu_flushed = 0;

	/* Count this CPU as being inside the master/slave loops */
	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping++;
	kaif_lock_exit(&kaif_loop_lock);

	/*
	 * We know who the master and slaves are, so now they can go off
	 * to their respective loops.  A KAIF_CPU_CMD_SWITCH return means the
	 * roles changed, so the role is re-evaluated and a loop re-entered.
	 */
	do {
		if (kaif_master_cpuid == cpusave->krs_cpu_id)
			cmd = kaif_master_loop(cpusave);
		else
			cmd = kaif_slave_loop(cpusave);
	} while (cmd == KAIF_CPU_CMD_SWITCH);

	kaif_lock_enter(&kaif_loop_lock);
	kaif_looping--;
	kaif_lock_exit(&kaif_loop_lock);

	cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;

	if (cmd == KAIF_CPU_CMD_RESUME) {
		/*
		 * By this point, the master has directed the slaves to resume,
		 * and everyone is making their way to this point.  We're going
		 * to block here until all CPUs leave the master and slave
		 * loops.  When all have arrived, we'll turn them all loose.
		 * This barrier is required for two reasons:
		 *
		 * 1. There exists a race condition whereby a CPU could reenter
		 *    the debugger while another CPU is still in the slave loop
		 *    from this debugger entry.  This usually happens when the
		 *    current master releases the slaves, and makes it back to
		 *    the world before the slaves notice the release.  The
		 *    former master then triggers a debugger entry, and attempts
		 *    to stop the slaves for this entry before they've even
		 *    resumed from the last one.  When the slaves arrive here,
		 *    they'll have re-disabled interrupts, and will thus ignore
		 *    cross-calls until they finish resuming.
		 *
		 * 2. At the time of this writing, there exists a SPARC bug that
		 *    causes an apparently unsolicited interrupt vector trap
		 *    from OBP to one of the slaves.  This wouldn't normally be
		 *    a problem but for the fact that the cross-called CPU
		 *    encounters some sort of failure while in OBP.  OBP
		 *    recovers by executing the debugger-hook word, which sends
		 *    the slave back into the debugger, triggering a debugger
		 *    fault.  This problem seems to only happen during resume,
		 *    the result being that all CPUs save for the cross-called
		 *    one make it back into the world, while the cross-called
		 *    one is stuck at the debugger fault prompt.  Leave the
		 *    world in that state too long, and you'll get a mondo
		 *    timeout panic.  If we hold everyone here, we can give
		 *    the user a chance to trigger a panic for further analysis.
		 *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
		 *    psrset -p ; done".
		 *
		 * When the second item is fixed, the barrier can move into
		 * kaif_select_master(), immediately prior to the setting of
		 * kaif_master_cpuid.
		 */
		while (kaif_looping != 0)
			continue;
	}

	return (cmd);
}
393 
394 
395 #if defined(__sparc)
396 
/*
 * Number of times kaif_slave_loop_barrier() gave up waiting before every
 * slave had acknowledged.
 */
static int slave_loop_barrier_failures = 0;	/* for debug */
398 
/*
 * There exists a race condition, observed on some
 * platforms, where the kmdb master cpu exits to OBP via
 * prom_enter_mon (e.g. "$q" command) and then later re-enters
 * kmdb (typing "go") while the slaves are still proceeding
 * from the OBP idle-loop back to the kmdb slave loop. The
 * problem arises when the master cpu, now back in kmdb, proceeds
 * to re-enter OBP (e.g. doing a prom_read() from the kmdb main
 * loop) while the slaves are still trying to get out of (the
 * previous trip in) OBP into the safety of the kmdb slave loop.
 * This routine forces the slaves to explicitly acknowledge
 * that they are back in the slave loop. The master cpu can
 * call this routine to ensure that all slave cpus are back
 * in the slave loop before proceeding.
 */
414 void
415 kaif_slave_loop_barrier(void)
416 {
417 	extern void kdi_usecwait(clock_t);
418 	int i;
419 	int not_acked;
420 	int timeout_count = 0;
421 
422 	kaif_start_slaves(KAIF_SLAVE_CMD_ACK);
423 
424 	/*
425 	 * Wait for slave cpus to explicitly acknowledge
426 	 * that they are spinning in the slave loop.
427 	 */
428 	do {
429 		not_acked = 0;
430 		for (i = 0; i < kaif_ncpusave; i++) {
431 			kaif_cpusave_t *save = &kaif_cpusave[i];
432 
433 			if (save->krs_cpu_state ==
434 			    KAIF_CPU_STATE_SLAVE &&
435 			    !save->krs_cpu_acked) {
436 				not_acked++;
437 				break;
438 			}
439 		}
440 
441 		if (not_acked == 0)
442 			break;
443 
444 		/*
445 		 * Play it safe and do a timeout delay.
446 		 * We will do at most kaif_ncpusave delays before
447 		 * bailing out of this barrier.
448 		 */
449 		kdi_usecwait(200);
450 
451 	} while (++timeout_count < kaif_ncpusave);
452 
453 	if (not_acked > 0)
454 		/*
455 		 * we cannot establish a barrier with all
456 		 * the slave cpus coming back from OBP
457 		 * Record this fact for future debugging
458 		 */
459 		slave_loop_barrier_failures++;
460 
461 	kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
462 }
463 #endif
464