xref: /illumos-gate/usr/src/uts/i86pc/os/x_call.c (revision 48f21d36693650e32c51fc8474dca1acc9b7376c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/t_lock.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/x_call.h>
32 #include <sys/xc_levels.h>
33 #include <sys/cpu.h>
34 #include <sys/psw.h>
35 #include <sys/sunddi.h>
36 #include <sys/debug.h>
37 #include <sys/systm.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/mutex_impl.h>
41 #include <sys/stack.h>
42 #include <sys/promif.h>
43 #include <sys/x86_archext.h>
44 
45 /*
46  * Implementation for cross-processor calls via interprocessor interrupts
47  *
48  * This implementation uses a message passing architecture to allow multiple
49  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
50  * instruction, aka casptr(), to implement simple efficient work queues for
51  * message passing between CPUs with almost no need for regular locking.
52  * See xc_extract() and xc_insert() below.
53  *
54  * The general idea is that initiating a cross call means putting a message
55  * on a target(s) CPU's work queue. Any synchronization is handled by passing
56  * the message back and forth between initiator and target(s).
57  *
58  * Every CPU has xc_work_cnt, which indicates it has messages to process.
59  * This value is incremented as message traffic is initiated and decremented
60  * with every message that finishes all processing.
61  *
62  * The code needs no mfence or other membar_*() calls. The uses of
63  * casptr(), cas32() and atomic_dec_32() for the message passing are
64  * implemented with LOCK prefix instructions which are equivalent to mfence.
65  *
66  * One interesting aspect of this implementation is that it allows 2 or more
67  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
68  * The cross call processing by the CPUs will happen in any order with only
69  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
70  * from cross calls before all slaves have invoked the function.
71  *
72  * The reason for this asynchronous approach is to allow for fast global
73  * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
74  * on a different Virtual Address at the same time, the old code required
75  * N squared IPIs. With this method, depending on timing, it could happen
76  * with just N IPIs.
77  */
78 
/*
 * Optional statistics about cross call IPI traffic.
 *
 * The default is to not enable collecting counts of IPI information, since
 * the updating of shared cachelines could cause excess bus traffic.
 *
 * The two counters are only touched by xc_common(), only while
 * xc_collect_enable is non-zero, and are bumped with plain (non-atomic)
 * increments.
 */
uint_t xc_collect_enable = 0;
uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
uint64_t xc_multi_cnt = 0;	/* # times we piggy backed on another IPI */
86 
/*
 * Values for message states. Here are the normal transitions. A transition
 * of "->" happens in the slave cpu and "=>" happens in the master cpu as
 * the messages are passed back and forth.
 *
 * FREE => ASYNC ->                       DONE => FREE
 * FREE => CALL ->                        DONE => FREE
 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
 *
 * The interesting one above is ASYNC. You might ask, why not go directly
 * to FREE, instead of DONE. If it did that, it might be possible to exhaust
 * the master's xc_free list if a master can generate ASYNC messages faster
 * than the slave can process them. That could be handled with more complicated
 * handling. However since nothing important uses ASYNC, I've not bothered.
 *
 * The value is stored in a message's xc_command field and is what xc_serv()
 * switches on when it pulls a message out of a msgbox.
 */
#define	XC_MSG_FREE	(0)	/* msg in xc_free queue */
#define	XC_MSG_ASYNC	(1)	/* msg in slave xc_msgbox */
#define	XC_MSG_CALL	(2)	/* msg in slave xc_msgbox */
#define	XC_MSG_SYNC	(3)	/* msg in slave xc_msgbox */
#define	XC_MSG_WAITING	(4)	/* msg in master xc_msgbox or xc_waiters */
#define	XC_MSG_RELEASED	(5)	/* msg in slave xc_msgbox */
#define	XC_MSG_DONE	(6)	/* msg in master xc_msgbox */
109 
/*
 * We allow for one high priority message at a time to happen in the system.
 * This is used for panic, kmdb, etc., so no locking is done.
 *
 * xc_priority_set holds one bit per CPU that has been asked to run the
 * pending priority request; xc_serv() tests the bit of the CPU it runs on
 * and clears it when it picks the request up. xc_priority_data holds the
 * function and the three arguments of that request. Both are filled in by
 * xc_priority_common().
 */
static cpuset_t xc_priority_set;
static xc_data_t xc_priority_data;
116 
/*
 * Decrement a CPU's work count.
 *
 * xc_work_cnt is what keeps the loop in xc_serv() running on that CPU, so
 * this is called once for every unit of work (message or priority request)
 * that the CPU has completely finished with.
 */
static void
xc_decrement(struct machcpu *mcpu)
{
	atomic_dec_32(&mcpu->xc_work_cnt);
}
125 
126 /*
127  * Increment a CPU's work count and return the old value
128  */
129 static int
130 xc_increment(struct machcpu *mcpu)
131 {
132 	int old;
133 	do {
134 		old = mcpu->xc_work_cnt;
135 	} while (cas32((uint32_t *)&mcpu->xc_work_cnt, old, old + 1) != old);
136 	return (old);
137 }
138 
139 /*
140  * Put a message into a queue. The insertion is atomic no matter
141  * how many different inserts/extracts to the same queue happen.
142  */
143 static void
144 xc_insert(void *queue, xc_msg_t *msg)
145 {
146 	xc_msg_t *old_head;
147 	do {
148 		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
149 		msg->xc_next = old_head;
150 	} while (casptr(queue, old_head, msg) != old_head);
151 }
152 
153 /*
154  * Extract a message from a queue. The extraction is atomic only
155  * when just one thread does extractions from the queue.
156  * If the queue is empty, NULL is returned.
157  */
158 static xc_msg_t *
159 xc_extract(xc_msg_t **queue)
160 {
161 	xc_msg_t *old_head;
162 
163 	do {
164 		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
165 		if (old_head == NULL)
166 			return (old_head);
167 	} while (casptr(queue, old_head, old_head->xc_next) != old_head);
168 	old_head->xc_next = NULL;
169 	return (old_head);
170 }
171 
172 
173 /*
174  * Initialize the machcpu fields used for cross calls
175  */
176 static uint_t xc_initialized = 0;
177 void
178 xc_init_cpu(struct cpu *cpup)
179 {
180 	xc_msg_t *msg;
181 	int c;
182 
183 	/*
184 	 * add a new msg to each existing CPU's free list, as well as one for
185 	 * my list for each of them
186 	 */
187 	for (c = 0; c < ncpus; ++c) {
188 		if (cpu[c] == NULL)
189 			continue;
190 		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
191 		msg->xc_command = XC_MSG_FREE;
192 		xc_insert(&cpu[c]->cpu_m.xc_free, msg);
193 
194 		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
195 		msg->xc_command = XC_MSG_FREE;
196 		xc_insert(&cpup->cpu_m.xc_free, msg);
197 	}
198 
199 	/*
200 	 * Add one for self messages
201 	 */
202 	msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
203 	msg->xc_command = XC_MSG_FREE;
204 	xc_insert(&cpup->cpu_m.xc_free, msg);
205 
206 	if (!xc_initialized)
207 		xc_initialized = 1;
208 }
209 
210 /*
211  * X-call message processing routine. Note that this is used by both
212  * senders and recipients of messages.
213  *
214  * We're protected against changing CPUs by either being in a high-priority
215  * interrupt, having preemption disabled or by having a raised SPL.
216  */
217 /*ARGSUSED*/
218 uint_t
219 xc_serv(caddr_t arg1, caddr_t arg2)
220 {
221 	struct machcpu *mcpup = &(CPU->cpu_m);
222 	xc_msg_t *msg;
223 	xc_data_t *data;
224 	xc_msg_t *xc_waiters = NULL;
225 	uint32_t num_waiting = 0;
226 	xc_func_t func;
227 	xc_arg_t a1;
228 	xc_arg_t a2;
229 	xc_arg_t a3;
230 	uint_t rc = DDI_INTR_UNCLAIMED;
231 
232 	while (mcpup->xc_work_cnt != 0) {
233 		rc = DDI_INTR_CLAIMED;
234 
235 		/*
236 		 * We may have to wait for a message to arrive.
237 		 */
238 		for (;;) {
239 			/*
240 			 * alway check for and handle a priority message
241 			 */
242 			if (BT_TEST(CPUSET2BV(xc_priority_set), CPU->cpu_id)) {
243 				func = xc_priority_data.xc_func;
244 				a1 = xc_priority_data.xc_a1;
245 				a2 = xc_priority_data.xc_a2;
246 				a3 = xc_priority_data.xc_a3;
247 				BT_CLEAR(CPUSET2BV(xc_priority_set),
248 				    CPU->cpu_id);
249 				xc_decrement(mcpup);
250 				func(a1, a2, a3);
251 				if (mcpup->xc_work_cnt == 0)
252 					return (rc);
253 			}
254 
255 			/*
256 			 * extract and handle regular message
257 			 */
258 			msg = xc_extract(&mcpup->xc_msgbox);
259 			if (msg != NULL)
260 				break;
261 
262 			/*
263 			 * wait for a message to arrive
264 			 */
265 			if (x86_feature & X86_MWAIT) {
266 				i86_monitor(
267 				    (volatile uint32_t *)&mcpup->xc_msgbox,
268 				    0, 0);
269 				if (mcpup->xc_msgbox == NULL)
270 					i86_mwait(0, 0);
271 			} else {
272 				SMT_PAUSE();
273 			}
274 		}
275 
276 
277 		/*
278 		 * process the message
279 		 */
280 		switch (msg->xc_command) {
281 
282 		/*
283 		 * ASYNC gives back the message immediately, then we do the
284 		 * function and return with no more waiting.
285 		 */
286 		case XC_MSG_ASYNC:
287 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
288 			func = data->xc_func;
289 			a1 = data->xc_a1;
290 			a2 = data->xc_a2;
291 			a3 = data->xc_a3;
292 			msg->xc_command = XC_MSG_DONE;
293 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
294 			if (func != NULL)
295 				(void) (*func)(a1, a2, a3);
296 			xc_decrement(mcpup);
297 			break;
298 
299 		/*
300 		 * SYNC messages do the call, then send it back to the master
301 		 * in WAITING mode
302 		 */
303 		case XC_MSG_SYNC:
304 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
305 			if (data->xc_func != NULL)
306 				(void) (*data->xc_func)(data->xc_a1,
307 				    data->xc_a2, data->xc_a3);
308 			msg->xc_command = XC_MSG_WAITING;
309 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
310 			break;
311 
312 		/*
313 		 * WAITING messsages are collected by the master until all
314 		 * have arrived. Once all arrive, we release them back to
315 		 * the slaves
316 		 */
317 		case XC_MSG_WAITING:
318 			xc_insert(&xc_waiters, msg);
319 			if (++num_waiting < mcpup->xc_wait_cnt)
320 				break;
321 			while ((msg = xc_extract(&xc_waiters)) != NULL) {
322 				msg->xc_command = XC_MSG_RELEASED;
323 				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
324 				    msg);
325 				--num_waiting;
326 			}
327 			if (num_waiting != 0)
328 				panic("wrong number waiting");
329 			mcpup->xc_wait_cnt = 0;
330 			break;
331 
332 		/*
333 		 * CALL messages do the function and then, like RELEASE,
334 		 * send the message is back to master as DONE.
335 		 */
336 		case XC_MSG_CALL:
337 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
338 			if (data->xc_func != NULL)
339 				(void) (*data->xc_func)(data->xc_a1,
340 				    data->xc_a2, data->xc_a3);
341 			/*FALLTHROUGH*/
342 		case XC_MSG_RELEASED:
343 			msg->xc_command = XC_MSG_DONE;
344 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
345 			xc_decrement(mcpup);
346 			break;
347 
348 		/*
349 		 * DONE means a slave has completely finished up.
350 		 * Once we collect all the DONE messages, we'll exit
351 		 * processing too.
352 		 */
353 		case XC_MSG_DONE:
354 			msg->xc_command = XC_MSG_FREE;
355 			xc_insert(&mcpup->xc_free, msg);
356 			xc_decrement(mcpup);
357 			break;
358 
359 		case XC_MSG_FREE:
360 			panic("free message in msgbox");
361 			break;
362 
363 		default:
364 			panic("bad message in msgbox");
365 			break;
366 		}
367 	}
368 	return (rc);
369 }
370 
371 /*
372  * Initiate cross call processing.
373  */
374 static void
375 xc_common(
376 	xc_func_t func,
377 	xc_arg_t arg1,
378 	xc_arg_t arg2,
379 	xc_arg_t arg3,
380 	ulong_t *set,
381 	uint_t command)
382 {
383 	int c;
384 	struct cpu *cpup;
385 	xc_msg_t *msg;
386 	xc_data_t *data;
387 	int cnt;
388 	int save_spl;
389 
390 	if (!xc_initialized) {
391 		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
392 		    func != NULL)
393 			(void) (*func)(arg1, arg2, arg3);
394 		return;
395 	}
396 
397 	save_spl = splr(ipltospl(XC_HI_PIL));
398 
399 	/*
400 	 * fill in cross call data
401 	 */
402 	data = &CPU->cpu_m.xc_data;
403 	data->xc_func = func;
404 	data->xc_a1 = arg1;
405 	data->xc_a2 = arg2;
406 	data->xc_a3 = arg3;
407 
408 	/*
409 	 * Post messages to all CPUs involved that are CPU_READY
410 	 */
411 	CPU->cpu_m.xc_wait_cnt = 0;
412 	for (c = 0; c < ncpus; ++c) {
413 		if (!BT_TEST(set, c))
414 			continue;
415 		cpup = cpu[c];
416 		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
417 			continue;
418 
419 		/*
420 		 * Fill out a new message.
421 		 */
422 		msg = xc_extract(&CPU->cpu_m.xc_free);
423 		if (msg == NULL)
424 			panic("Ran out of free xc_msg_t's");
425 		msg->xc_command = command;
426 		msg->xc_master = CPU->cpu_id;
427 		msg->xc_slave = c;
428 
429 		/*
430 		 * Increment my work count for all messages that I'll
431 		 * transition from DONE to FREE.
432 		 * Also remember how many XC_MSG_WAITINGs to look for
433 		 */
434 		(void) xc_increment(&CPU->cpu_m);
435 		if (command == XC_MSG_SYNC)
436 			++CPU->cpu_m.xc_wait_cnt;
437 
438 		/*
439 		 * Increment the target CPU work count then insert the message
440 		 * in the target msgbox. If I post the first bit of work
441 		 * for the target to do, send an IPI to the target CPU.
442 		 */
443 		cnt = xc_increment(&cpup->cpu_m);
444 		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
445 		if (cpup != CPU) {
446 			if (cnt == 0) {
447 				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
448 				send_dirint(c, XC_HI_PIL);
449 				if (xc_collect_enable)
450 					++xc_total_cnt;
451 			} else if (xc_collect_enable) {
452 				++xc_multi_cnt;
453 			}
454 		}
455 	}
456 
457 	/*
458 	 * Now drop into the message handler until all work is done
459 	 */
460 	(void) xc_serv(NULL, NULL);
461 	splx(save_spl);
462 }
463 
464 /*
465  * Push out a priority cross call.
466  */
467 static void
468 xc_priority_common(
469 	xc_func_t func,
470 	xc_arg_t arg1,
471 	xc_arg_t arg2,
472 	xc_arg_t arg3,
473 	ulong_t *set)
474 {
475 	int i;
476 	int c;
477 	struct cpu *cpup;
478 
479 	/*
480 	 * Wait briefly for a previous xc_priority to have finished, but
481 	 * continue no matter what.
482 	 */
483 	for (i = 0; i < 40000; ++i) {
484 		if (CPUSET_ISNULL(xc_priority_set))
485 			break;
486 		SMT_PAUSE();
487 	}
488 
489 	/*
490 	 * fill in cross call data
491 	 */
492 	xc_priority_data.xc_func = func;
493 	xc_priority_data.xc_a1 = arg1;
494 	xc_priority_data.xc_a2 = arg2;
495 	xc_priority_data.xc_a3 = arg3;
496 	xc_priority_set = *(cpuset_t *)set;
497 
498 	/*
499 	 * Post messages to all CPUs involved that are CPU_READY
500 	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
501 	 */
502 	for (c = 0; c < ncpus; ++c) {
503 		if (!BT_TEST(set, c))
504 			continue;
505 		cpup = cpu[c];
506 		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
507 		    cpup == CPU)
508 			continue;
509 		(void) xc_increment(&cpup->cpu_m);
510 		send_dirint(c, XC_HI_PIL);
511 		for (i = 0; i < 10; ++i) {
512 			(void) casptr(&cpup->cpu_m.xc_msgbox,
513 			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
514 		}
515 	}
516 }
517 
518 /*
519  * Do cross call to all other CPUs with absolutely no waiting or handshaking.
520  * This should only be used for extraordinary operations, like panic(), which
521  * need to work, in some fashion, in a not completely functional system.
522  * All other uses that want minimal waiting should use xc_call_nowait().
523  */
524 void
525 xc_priority(
526 	xc_arg_t arg1,
527 	xc_arg_t arg2,
528 	xc_arg_t arg3,
529 	ulong_t *set,
530 	xc_func_t func)
531 {
532 	extern int IGNORE_KERNEL_PREEMPTION;
533 	int save_spl = splr(ipltospl(XC_HI_PIL));
534 	int save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
535 
536 	IGNORE_KERNEL_PREEMPTION = 1;
537 	xc_priority_common((xc_func_t)func, arg1, arg2, arg3, set);
538 	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
539 	splx(save_spl);
540 }
541 
542 /*
543  * Wrapper for kmdb to capture other CPUs, causing them to enter the debugger.
544  */
545 void
546 kdi_xc_others(int this_cpu, void (*func)(void))
547 {
548 	extern int IGNORE_KERNEL_PREEMPTION;
549 	int save_kernel_preemption;
550 	cpuset_t set;
551 
552 	if (!xc_initialized)
553 		return;
554 
555 	save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
556 	IGNORE_KERNEL_PREEMPTION = 1;
557 	CPUSET_ALL_BUT(set, this_cpu);
558 	xc_priority_common((xc_func_t)func, 0, 0, 0, CPUSET2BV(set));
559 	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
560 }
561 
562 
563 
/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call_nowait() may return immediately too.
 *
 * The messages are posted as XC_MSG_ASYNC: a target hands its message back
 * before it runs func, so the caller only waits for the targets to have
 * picked the request up, not for func to have completed on them.
 *
 * set is the bit vector of target CPU ids; note that func is the last
 * argument, after set.
 */
void
xc_call_nowait(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_ASYNC);
}
578 
/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call() returns only after remotes have finished.
 *
 * The messages are posted as XC_MSG_CALL: a target runs func and only then
 * hands its message back, and the caller does not return until every
 * message has come back.
 *
 * set is the bit vector of target CPU ids; note that func is the last
 * argument, after set.
 */
void
xc_call(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_CALL);
}
593 
/*
 * Invoke function on specified processors. Remotes wait until all have
 * finished. xc_sync() also waits until all remotes have finished.
 *
 * The messages are posted as XC_MSG_SYNC: a target runs func and sends its
 * message back as WAITING; once the caller has collected all of them it
 * releases the targets, which then hand their messages back for good. So no
 * target leaves cross call processing before all of them have run func.
 *
 * set is the bit vector of target CPU ids; note that func is the last
 * argument, after set.
 */
void
xc_sync(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_SYNC);
}
608