xref: /illumos-gate/usr/src/cmd/mdb/common/kmdb/kaif_start.c (revision 74e12c43fe52f2c30f36e65a4d0fb0e8dfd7068a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2018 Joyent, Inc.
25  */
26 
27 /*
28  * The main CPU-control loops, used to control masters and slaves.
29  */
30 
31 #include <sys/types.h>
32 
33 #include <kmdb/kaif.h>
34 #include <kmdb/kaif_start.h>
35 #include <kmdb/kmdb_asmutil.h>
36 #include <kmdb/kmdb_dpi_impl.h>
37 #include <kmdb/kmdb_kdi.h>
38 
/*
 * Commands that the master CPU posts in kaif_slave_cmd, and that the slave
 * CPUs poll for in kaif_slave_loop().
 */
#define	KAIF_SLAVE_CMD_SPIN	0	/* nothing to do; keep waiting */
#define	KAIF_SLAVE_CMD_SWITCH	1	/* kaif_slave_tgt becomes the master */
#define	KAIF_SLAVE_CMD_RESUME	2	/* leave the slave loop and resume */
#define	KAIF_SLAVE_CMD_FLUSH	3	/* flush caches, set krs_cpu_flushed */
#define	KAIF_SLAVE_CMD_REBOOT	4	/* (x86) CPU 0 reboots the system */
#if defined(__sparc)
#define	KAIF_SLAVE_CMD_ACK	5	/* slaves set krs_cpu_acked */
#endif
47 
48 
49 /*
50  * Used to synchronize attempts to set kaif_master_cpuid.  kaif_master_cpuid may
51  * be read without kaif_master_lock, and may be written by the current master
52  * CPU.
53  */
54 int kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
55 static uintptr_t kaif_master_lock = 0;
56 
57 /*
58  * Used to ensure that all CPUs leave the debugger together. kaif_loop_lock must
59  * be held to write kaif_looping, but need not be held to read it.
60  */
61 static volatile uint_t kaif_looping;
62 static uintptr_t kaif_loop_lock;
63 
64 static volatile int kaif_slave_cmd;
65 static volatile int kaif_slave_tgt;	/* target cpuid for CMD_SWITCH */
66 
/*
 * Acquire a simple spin lock.  We keep retrying cas() on the lock word until
 * it reports that the previous value was 0 (unlocked), and then issue
 * membar_producer() before returning to the caller.
 */
static void
kaif_lock_enter(uintptr_t *lock)
{
	for (;;) {
		if (cas(lock, 0, 1) == 0)
			break;
	}

	membar_producer();
}
74 
/*
 * Release a lock taken with kaif_lock_enter(): store 0 to the lock word and
 * then issue membar_producer().
 */
static void
kaif_lock_exit(uintptr_t *lock)
{
	*lock = 0;
	membar_producer();
}
81 
82 static void
83 kaif_start_slaves(int cmd)
84 {
85 	kaif_slave_cmd = cmd;
86 	kmdb_kdi_start_slaves();
87 }
88 
89 static int
90 kaif_master_loop(kaif_cpusave_t *cpusave)
91 {
92 	int notflushed, i;
93 
94 #if defined(__sparc)
95 	kaif_prom_rearm();
96 #endif
97 	kaif_trap_set_debugger();
98 
99 	/*
100 	 * If we re-entered due to a ::switch, we need to tell the slave CPUs
101 	 * to sleep again.
102 	 */
103 	kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 0);
104 
105 master_loop:
106 	switch (kmdb_dpi_reenter()) {
107 	case KMDB_DPI_CMD_SWITCH_CPU:
108 		/*
109 		 * We assume that the target CPU is a valid slave.  There's no
110 		 * easy way to complain here, so we'll assume that the caller
111 		 * has done the proper checking.
112 		 */
113 		if (kmdb_dpi_switch_target == cpusave->krs_cpu_id)
114 			break;
115 
116 		kaif_slave_tgt = kaif_master_cpuid = kmdb_dpi_switch_target;
117 		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
118 		membar_producer();
119 
120 		/*
121 		 * Switch back to the saved trap table before we switch CPUs --
122 		 * we need to make sure that only one CPU is on the debugger's
123 		 * table at a time.
124 		 */
125 		kaif_trap_set_saved(cpusave);
126 
127 		kaif_start_slaves(KAIF_SLAVE_CMD_SWITCH);
128 
129 		/* The new master is now awake */
130 		return (KAIF_CPU_CMD_SWITCH);
131 
132 	case KMDB_DPI_CMD_RESUME_ALL:
133 	case KMDB_DPI_CMD_RESUME_UNLOAD:
134 		/*
135 		 * Resume everyone, clean up for next entry.
136 		 */
137 		kaif_master_cpuid = KAIF_MASTER_CPUID_UNSET;
138 		membar_producer();
139 		kaif_start_slaves(KAIF_SLAVE_CMD_RESUME);
140 
141 		if (kmdb_dpi_work_required())
142 			kmdb_dpi_wrintr_fire();
143 
144 		kaif_trap_set_saved(cpusave);
145 
146 		return (KAIF_CPU_CMD_RESUME);
147 
148 	case KMDB_DPI_CMD_RESUME_MASTER:
149 		/*
150 		 * Single-CPU resume, which is performed on the debugger's
151 		 * trap table (so no need to switch back).
152 		 */
153 		return (KAIF_CPU_CMD_RESUME_MASTER);
154 
155 	case KMDB_DPI_CMD_FLUSH_CACHES:
156 		kaif_start_slaves(KAIF_SLAVE_CMD_FLUSH);
157 
158 		/*
159 		 * Wait for the other cpus to finish flushing their caches.
160 		 */
161 		do {
162 			notflushed = 0;
163 			for (i = 0; i < kaif_ncpusave; i++) {
164 				kaif_cpusave_t *save = &kaif_cpusave[i];
165 
166 				if (save->krs_cpu_state ==
167 				    KAIF_CPU_STATE_SLAVE &&
168 				    !save->krs_cpu_flushed) {
169 					notflushed++;
170 					break;
171 				}
172 			}
173 		} while (notflushed > 0);
174 
175 		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
176 		break;
177 
178 #if defined(__i386) || defined(__amd64)
179 	case KMDB_DPI_CMD_REBOOT:
180 		/*
181 		 * Reboot must be initiated by CPU 0.  I could ask why, but I'm
182 		 * afraid that I don't want to know the answer.
183 		 */
184 		if (cpusave->krs_cpu_id == 0)
185 			kmdb_kdi_reboot();
186 
187 		kaif_start_slaves(KAIF_SLAVE_CMD_REBOOT);
188 
189 		/*
190 		 * Spin forever, waiting for CPU 0 (apparently a slave) to
191 		 * reboot the system.
192 		 */
193 		for (;;)
194 			continue;
195 
196 		/*NOTREACHED*/
197 		break;
198 #endif
199 	}
200 
201 	goto master_loop;
202 }
203 
204 static int
205 kaif_slave_loop(kaif_cpusave_t *cpusave)
206 {
207 	int slavecmd, rv;
208 
209 #if defined(__sparc)
210 	/*
211 	 * If the user elects to drop to OBP from the debugger, some OBP
212 	 * implementations will cross-call the slaves.  We have to turn
213 	 * IE back on so we can receive the cross-calls.  If we don't,
214 	 * some OBP implementations will wait forever.
215 	 */
216 	interrupts_on();
217 #endif
218 
219 	/* Wait for duty to call */
220 	for (;;) {
221 		slavecmd = kaif_slave_cmd;
222 
223 		if (slavecmd == KAIF_SLAVE_CMD_SWITCH &&
224 		    kaif_slave_tgt == cpusave->krs_cpu_id) {
225 			kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
226 			cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
227 			rv = KAIF_CPU_CMD_SWITCH;
228 			break;
229 
230 		} else if (slavecmd == KAIF_SLAVE_CMD_FLUSH) {
231 			kmdb_kdi_flush_caches();
232 			cpusave->krs_cpu_flushed = 1;
233 			continue;
234 
235 #if defined(__i386) || defined(__amd64)
236 		} else if (slavecmd == KAIF_SLAVE_CMD_REBOOT &&
237 		    cpusave->krs_cpu_id == 0) {
238 			rv = 0;
239 			kmdb_kdi_reboot();
240 			break;
241 #endif
242 
243 		} else if (slavecmd == KAIF_SLAVE_CMD_RESUME) {
244 			rv = KAIF_CPU_CMD_RESUME;
245 			break;
246 #if defined(__sparc)
247 		} else if (slavecmd == KAIF_SLAVE_CMD_ACK) {
248 			cpusave->krs_cpu_acked = 1;
249 		} else if (cpusave->krs_cpu_acked &&
250 		    slavecmd == KAIF_SLAVE_CMD_SPIN) {
251 			cpusave->krs_cpu_acked = 0;
252 #endif
253 		}
254 
255 		kmdb_kdi_slave_wait();
256 	}
257 
258 #if defined(__sparc)
259 	interrupts_off();
260 #endif
261 
262 	return (rv);
263 }
264 
265 static void
266 kaif_select_master(kaif_cpusave_t *cpusave)
267 {
268 	kaif_lock_enter(&kaif_master_lock);
269 
270 	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
271 		/* This is the master. */
272 		kaif_master_cpuid = cpusave->krs_cpu_id;
273 		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
274 		kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
275 
276 		membar_producer();
277 
278 		kmdb_kdi_stop_slaves(cpusave->krs_cpu_id, 1);
279 	} else {
280 		/* The master was already chosen - go be a slave */
281 		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
282 		membar_producer();
283 	}
284 
285 	kaif_lock_exit(&kaif_master_lock);
286 }
287 
288 int
289 kaif_main_loop(kaif_cpusave_t *cpusave)
290 {
291 	int cmd;
292 
293 	if (kaif_master_cpuid == KAIF_MASTER_CPUID_UNSET) {
294 
295 		/*
296 		 * Special case: Unload requested before first debugger entry.
297 		 * Don't stop the world, as there's nothing to clean up that
298 		 * can't be handled by the running kernel.
299 		 */
300 		if (!kmdb_dpi_resume_requested &&
301 		    kmdb_kdi_get_unload_request()) {
302 			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
303 			return (KAIF_CPU_CMD_RESUME);
304 		}
305 
306 		/*
307 		 * We're a slave with no master, so just resume.  This can
308 		 * happen if, prior to this, two CPUs both raced through
309 		 * kdi_cmnint() - for example, a breakpoint on a frequently
310 		 * called function.  The loser will be redirected to the slave
311 		 * loop; note that the event itself is lost at this point.
312 		 *
313 		 * The winner will then cross-call that slave, but it won't
314 		 * actually be received until the slave returns to the kernel
315 		 * and enables interrupts.  We'll then come back in via
316 		 * kdi_slave_entry() and hit this path.
317 		 */
318 		if (cpusave->krs_cpu_state == KAIF_CPU_STATE_SLAVE) {
319 			cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
320 			return (KAIF_CPU_CMD_RESUME);
321 		}
322 
323 		kaif_select_master(cpusave);
324 
325 #ifdef __sparc
326 		if (kaif_master_cpuid == cpusave->krs_cpu_id) {
327 			/*
328 			 * Everyone has arrived, so we can disarm the post-PROM
329 			 * entry point.
330 			 */
331 			*kaif_promexitarmp = 0;
332 			membar_producer();
333 		}
334 #endif
335 	} else if (kaif_master_cpuid == cpusave->krs_cpu_id) {
336 		cpusave->krs_cpu_state = KAIF_CPU_STATE_MASTER;
337 	} else {
338 		cpusave->krs_cpu_state = KAIF_CPU_STATE_SLAVE;
339 	}
340 
341 	cpusave->krs_cpu_flushed = 0;
342 
343 	kaif_lock_enter(&kaif_loop_lock);
344 	kaif_looping++;
345 	kaif_lock_exit(&kaif_loop_lock);
346 
347 	/*
348 	 * We know who the master and slaves are, so now they can go off
349 	 * to their respective loops.
350 	 */
351 	do {
352 		if (kaif_master_cpuid == cpusave->krs_cpu_id)
353 			cmd = kaif_master_loop(cpusave);
354 		else
355 			cmd = kaif_slave_loop(cpusave);
356 	} while (cmd == KAIF_CPU_CMD_SWITCH);
357 
358 	kaif_lock_enter(&kaif_loop_lock);
359 	kaif_looping--;
360 	kaif_lock_exit(&kaif_loop_lock);
361 
362 	cpusave->krs_cpu_state = KAIF_CPU_STATE_NONE;
363 
364 	if (cmd == KAIF_CPU_CMD_RESUME) {
365 		/*
366 		 * By this point, the master has directed the slaves to resume,
367 		 * and everyone is making their way to this point.  We're going
368 		 * to block here until all CPUs leave the master and slave
369 		 * loops.  When all have arrived, we'll turn them all loose.
370 		 * This barrier is required for two reasons:
371 		 *
372 		 * 1. There exists a race condition whereby a CPU could reenter
373 		 *    the debugger while another CPU is still in the slave loop
374 		 *    from this debugger entry.  This usually happens when the
375 		 *    current master releases the slaves, and makes it back to
376 		 *    the world before the slaves notice the release.  The
377 		 *    former master then triggers a debugger entry, and attempts
378 		 *    to stop the slaves for this entry before they've even
379 		 *    resumed from the last one.  When the slaves arrive here,
380 		 *    they'll have re-disabled interrupts, and will thus ignore
381 		 *    cross-calls until they finish resuming.
382 		 *
383 		 * 2. At the time of this writing, there exists a SPARC bug that
384 		 *    causes an apparently unsolicited interrupt vector trap
385 		 *    from OBP to one of the slaves.  This wouldn't normally be
386 		 *    a problem but for the fact that the cross-called CPU
387 		 *    encounters some sort of failure while in OBP.  OBP
388 		 *    recovers by executing the debugger-hook word, which sends
389 		 *    the slave back into the debugger, triggering a debugger
390 		 *    fault.  This problem seems to only happen during resume,
391 		 *    the result being that all CPUs save for the cross-called
392 		 *    one make it back into the world, while the cross-called
393 		 *    one is stuck at the debugger fault prompt.  Leave the
394 		 *    world in that state too long, and you'll get a mondo
395 		 *    timeout panic.  If we hold everyone here, we can give the
396 		 *    the user a chance to trigger a panic for further analysis.
397 		 *    To trigger the bug, "pool_unlock:b :c" and "while : ; do
398 		 *    psrset -p ; done".
399 		 *
400 		 * When the second item is fixed, the barrier can move into
401 		 * kaif_select_master(), immediately prior to the setting of
402 		 * kaif_master_cpuid.
403 		 */
404 		while (kaif_looping != 0)
405 			continue;
406 	}
407 
408 	return (cmd);
409 }
410 
411 
412 #if defined(__sparc)
413 
414 static int slave_loop_barrier_failures = 0;	/* for debug */
415 
416 /*
417  * There exist a race condition observed by some
418  * platforms where the kmdb master cpu exits to OBP via
419  * prom_enter_mon (e.g. "$q" command) and then later re-enter
420  * kmdb (typing "go") while the slaves are still proceeding
421  * from the OBP idle-loop back to the kmdb slave loop. The
422  * problem arises when the master cpu now back in kmdb proceed
423  * to re-enter OBP (e.g. doing a prom_read() from the kmdb main
424  * loop) while the slaves are still trying to get out of (the
425  * previous trip in) OBP into the safety of the kmdb slave loop.
426  * This routine forces the slaves to explicitly acknowledge
427  * that they are back in the slave loop. The master cpu can
428  * call this routine to ensure that all slave cpus are back
429  * in the slave loop before proceeding.
430  */
431 void
432 kaif_slave_loop_barrier(void)
433 {
434 	extern void kdi_usecwait(clock_t);
435 	int i;
436 	int not_acked;
437 	int timeout_count = 0;
438 
439 	kaif_start_slaves(KAIF_SLAVE_CMD_ACK);
440 
441 	/*
442 	 * Wait for slave cpus to explicitly acknowledge
443 	 * that they are spinning in the slave loop.
444 	 */
445 	do {
446 		not_acked = 0;
447 		for (i = 0; i < kaif_ncpusave; i++) {
448 			kaif_cpusave_t *save = &kaif_cpusave[i];
449 
450 			if (save->krs_cpu_state ==
451 			    KAIF_CPU_STATE_SLAVE &&
452 			    !save->krs_cpu_acked) {
453 				not_acked++;
454 				break;
455 			}
456 		}
457 
458 		if (not_acked == 0)
459 			break;
460 
461 		/*
462 		 * Play it safe and do a timeout delay.
463 		 * We will do at most kaif_ncpusave delays before
464 		 * bailing out of this barrier.
465 		 */
466 		kdi_usecwait(200);
467 
468 	} while (++timeout_count < kaif_ncpusave);
469 
470 	if (not_acked > 0)
471 		/*
472 		 * we cannot establish a barrier with all
473 		 * the slave cpus coming back from OBP
474 		 * Record this fact for future debugging
475 		 */
476 		slave_loop_barrier_failures++;
477 
478 	kaif_slave_cmd = KAIF_SLAVE_CMD_SPIN;
479 }
480 #endif
481