xref: /linux/kernel/smp.c (revision 8b8eed05a1c650c27e78bc47d07f7d6c9ba779e8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Generic helpers for smp ipi calls
4  *
5  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/irq_work.h>
11 #include <linux/rcupdate.h>
12 #include <linux/rculist.h>
13 #include <linux/kernel.h>
14 #include <linux/export.h>
15 #include <linux/percpu.h>
16 #include <linux/init.h>
17 #include <linux/interrupt.h>
18 #include <linux/gfp.h>
19 #include <linux/smp.h>
20 #include <linux/cpu.h>
21 #include <linux/sched.h>
22 #include <linux/sched/idle.h>
23 #include <linux/hypervisor.h>
24 #include <linux/sched/clock.h>
25 #include <linux/nmi.h>
26 #include <linux/sched/debug.h>
27 #include <linux/jump_label.h>
28 
29 #include <trace/events/ipi.h>
30 #define CREATE_TRACE_POINTS
31 #include <trace/events/csd.h>
32 #undef CREATE_TRACE_POINTS
33 
34 #include "smpboot.h"
35 #include "sched/smp.h"
36 
/* Extract the type bits (CSD_FLAG_TYPE_MASK) from a csd's ->node.u_flags. */
#define CSD_TYPE(_csd)	((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
38 
/*
 * Sender-side bookkeeping for multi-CPU function calls.
 *
 * @csd:         per-CPU allocation of call_single_data_t; the slot used for
 *               a request is the one belonging to the destination CPU
 * @cpumask:     destination CPUs of the call currently being issued
 * @cpumask_ipi: those destinations whose queue was empty before the request
 *               was added, i.e. the ones that need an IPI
 */
struct call_function_data {
	call_single_data_t	__percpu *csd;
	cpumask_var_t		cpumask;
	cpumask_var_t		cpumask_ipi;
};
44 
/* One call_function_data per CPU, used by that CPU when it sends requests. */
static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

/* Per-CPU lock-less list of CSD nodes queued for execution on that CPU. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

/*
 * Per-CPU flag, initially 1.  csd_lock_wait_toolong() flips it 1 -> 0 before
 * dumping the stuck CPU's task; __flush_smp_call_function_queue() sets it
 * back to 1 each time the CPU processes its queue.
 */
static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

/* Defined further down; declared here because smpcfd_dying_cpu() calls it. */
static void __flush_smp_call_function_queue(bool warn_cpu_offline);
52 
53 int smpcfd_prepare_cpu(unsigned int cpu)
54 {
55 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
56 
57 	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
58 				     cpu_to_node(cpu)))
59 		return -ENOMEM;
60 	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
61 				     cpu_to_node(cpu))) {
62 		free_cpumask_var(cfd->cpumask);
63 		return -ENOMEM;
64 	}
65 	cfd->csd = alloc_percpu(call_single_data_t);
66 	if (!cfd->csd) {
67 		free_cpumask_var(cfd->cpumask);
68 		free_cpumask_var(cfd->cpumask_ipi);
69 		return -ENOMEM;
70 	}
71 
72 	return 0;
73 }
74 
75 int smpcfd_dead_cpu(unsigned int cpu)
76 {
77 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
78 
79 	free_cpumask_var(cfd->cpumask);
80 	free_cpumask_var(cfd->cpumask_ipi);
81 	free_percpu(cfd->csd);
82 	return 0;
83 }
84 
/*
 * Drain the work that is still pending for a CPU that is about to go
 * offline.  @cpu is not used by the body; always returns 0.
 */
int smpcfd_dying_cpu(unsigned int cpu)
{
	/*
	 * The IPIs for the smp-call-function callbacks queued by other
	 * CPUs might arrive late, either due to hardware latencies or
	 * because this CPU disabled interrupts (inside stop-machine)
	 * before the IPIs were sent. So flush out any pending callbacks
	 * explicitly (without waiting for the IPIs to arrive), to
	 * ensure that the outgoing CPU doesn't go offline with work
	 * still pending.
	 */
	/* false: skip the "callbacks queued on an offline CPU" warning. */
	__flush_smp_call_function_queue(false);
	irq_work_run();
	return 0;
}
100 
101 void __init call_function_init(void)
102 {
103 	int i;
104 
105 	for_each_possible_cpu(i)
106 		init_llist_head(&per_cpu(call_single_queue, i));
107 
108 	smpcfd_prepare_cpu(smp_processor_id());
109 }
110 
111 static __always_inline void
112 send_call_function_single_ipi(int cpu)
113 {
114 	if (call_function_single_prep_ipi(cpu)) {
115 		trace_ipi_send_cpu(cpu, _RET_IP_,
116 				   generic_smp_call_function_single_interrupt);
117 		arch_send_call_function_single_ipi(cpu);
118 	}
119 }
120 
/*
 * Trace, then send, the call-function IPI to every CPU in @mask via the
 * architecture's mask-based send routine.
 */
static __always_inline void
send_call_function_ipi_mask(struct cpumask *mask)
{
	trace_ipi_send_cpumask(mask, _RET_IP_,
			       generic_smp_call_function_single_interrupt);
	arch_send_call_function_ipi_mask(mask);
}
128 
/*
 * Invoke @func(@info), bracketed by the csd function entry/exit
 * tracepoints.  @csd is only handed to the tracepoints; callers in this
 * file pass NULL when the function is executed locally.
 */
static __always_inline void
csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
	trace_csd_function_entry(func, csd);
	func(info);
	trace_csd_function_exit(func, csd);
}
136 
137 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
138 
139 static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
140 
141 /*
142  * Parse the csdlock_debug= kernel boot parameter.
143  *
144  * If you need to restore the old "ext" value that once provided
145  * additional debugging information, reapply the following commits:
146  *
147  * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
148  * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
149  */
150 static int __init csdlock_debug(char *str)
151 {
152 	int ret;
153 	unsigned int val = 0;
154 
155 	ret = get_option(&str, &val);
156 	if (ret) {
157 		if (val)
158 			static_branch_enable(&csdlock_debug_enabled);
159 		else
160 			static_branch_disable(&csdlock_debug_enabled);
161 	}
162 
163 	return 1;
164 }
165 __setup("csdlock_debug=", csdlock_debug);
166 
/*
 * Per-CPU record of the CSD request a CPU is currently handling, written by
 * __csd_lock_record() and read remotely by csd_lock_wait_toolong().
 */
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);

static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
module_param(panic_on_ipistall, int, 0444);

/* Source of the "#N" identifiers printed in CSD-lock complaints. */
static atomic_t csd_bug_count = ATOMIC_INIT(0);
177 
/*
 * Record current CSD work for current CPU, NULL to erase.
 *
 * The function and info pointers are published before the csd pointer
 * itself, so a remote reader that observes cur_csd can rely on the other
 * two fields describing the same request.
 */
static void __csd_lock_record(call_single_data_t *csd)
{
	if (!csd) {
		smp_mb(); /* NULL cur_csd after unlock. */
		__this_cpu_write(cur_csd, NULL);
		return;
	}
	__this_cpu_write(cur_csd_func, csd->func);
	__this_cpu_write(cur_csd_info, csd->info);
	smp_wmb(); /* func and info before csd. */
	__this_cpu_write(cur_csd, csd);
	smp_mb(); /* Update cur_csd before function call. */
		  /* Or before unlock, as the case may be. */
}
193 
/*
 * Record (or, with NULL, erase) the CSD this CPU is handling, but only
 * when CSD-lock debugging has been enabled through the static key.
 */
static __always_inline void csd_lock_record(call_single_data_t *csd)
{
	if (static_branch_unlikely(&csdlock_debug_enabled))
		__csd_lock_record(csd);
}
199 
200 static int csd_lock_wait_getcpu(call_single_data_t *csd)
201 {
202 	unsigned int csd_type;
203 
204 	csd_type = CSD_TYPE(csd);
205 	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
206 		return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
207 	return -1;
208 }
209 
210 /*
211  * Complain if too much time spent waiting.  Note that only
212  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
213  * so waiting on other types gets much less information.
214  */
215 static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
216 {
217 	int cpu = -1;
218 	int cpux;
219 	bool firsttime;
220 	u64 ts2, ts_delta;
221 	call_single_data_t *cpu_cur_csd;
222 	unsigned int flags = READ_ONCE(csd->node.u_flags);
223 	unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
224 
225 	if (!(flags & CSD_FLAG_LOCK)) {
226 		if (!unlikely(*bug_id))
227 			return true;
228 		cpu = csd_lock_wait_getcpu(csd);
229 		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
230 			 *bug_id, raw_smp_processor_id(), cpu);
231 		return true;
232 	}
233 
234 	ts2 = sched_clock();
235 	/* How long since we last checked for a stuck CSD lock.*/
236 	ts_delta = ts2 - *ts1;
237 	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
238 		return false;
239 
240 	firsttime = !*bug_id;
241 	if (firsttime)
242 		*bug_id = atomic_inc_return(&csd_bug_count);
243 	cpu = csd_lock_wait_getcpu(csd);
244 	if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
245 		cpux = 0;
246 	else
247 		cpux = cpu;
248 	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
249 	/* How long since this CSD lock was stuck. */
250 	ts_delta = ts2 - ts0;
251 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
252 		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
253 		 cpu, csd->func, csd->info);
254 	/*
255 	 * If the CSD lock is still stuck after 5 minutes, it is unlikely
256 	 * to become unstuck. Use a signed comparison to avoid triggering
257 	 * on underflows when the TSC is out of sync between sockets.
258 	 */
259 	BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
260 	if (cpu_cur_csd && csd != cpu_cur_csd) {
261 		pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
262 			 *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
263 			 READ_ONCE(per_cpu(cur_csd_info, cpux)));
264 	} else {
265 		pr_alert("\tcsd: CSD lock (#%d) %s.\n",
266 			 *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
267 	}
268 	if (cpu >= 0) {
269 		if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
270 			dump_cpu_task(cpu);
271 		if (!cpu_cur_csd) {
272 			pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
273 			arch_send_call_function_single_ipi(cpu);
274 		}
275 	}
276 	if (firsttime)
277 		dump_stack();
278 	*ts1 = ts2;
279 
280 	return false;
281 }
282 
283 /*
284  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
285  *
286  * For non-synchronous ipi calls the csd can still be in use by the
287  * previous function call. For multi-cpu calls its even more interesting
288  * as we'll have to ensure no other cpu is observing our csd.
289  */
290 static void __csd_lock_wait(call_single_data_t *csd)
291 {
292 	int bug_id = 0;
293 	u64 ts0, ts1;
294 
295 	ts1 = ts0 = sched_clock();
296 	for (;;) {
297 		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
298 			break;
299 		cpu_relax();
300 	}
301 	smp_acquire__after_ctrl_dep();
302 }
303 
304 static __always_inline void csd_lock_wait(call_single_data_t *csd)
305 {
306 	if (static_branch_unlikely(&csdlock_debug_enabled)) {
307 		__csd_lock_wait(csd);
308 		return;
309 	}
310 
311 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
312 }
313 #else
/* CSD-lock debugging is compiled out: recording is a no-op. */
static void csd_lock_record(call_single_data_t *csd)
{
}
317 
/*
 * Wait, with acquire ordering, until CSD_FLAG_LOCK is clear in @csd.
 * Variant used when CSD-lock debugging is compiled out.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
322 #endif
323 
/*
 * Take ownership of @csd: wait for any previous user to release it, then
 * set CSD_FLAG_LOCK.
 */
static __always_inline void csd_lock(call_single_data_t *csd)
{
	csd_lock_wait(csd);
	csd->node.u_flags |= CSD_FLAG_LOCK;

	/*
	 * prevent CPU from reordering the above assignment
	 * to ->flags with any subsequent assignments to other
	 * fields of the specified call_single_data_t structure:
	 */
	smp_wmb();
}
336 
/*
 * Release @csd.  Warns if it was not locked.  The whole u_flags word is
 * stored as 0 with release ordering, so the lock bit and the type bits are
 * cleared together.
 */
static __always_inline void csd_unlock(call_single_data_t *csd)
{
	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));

	/*
	 * ensure we're all done before releasing data:
	 */
	smp_store_release(&csd->node.u_flags, 0);
}
346 
/* Per-CPU csd used by smp_call_function_single() for calls that do not wait. */
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
348 
349 void __smp_call_single_queue(int cpu, struct llist_node *node)
350 {
351 	/*
352 	 * We have to check the type of the CSD before queueing it, because
353 	 * once queued it can have its flags cleared by
354 	 *   flush_smp_call_function_queue()
355 	 * even if we haven't sent the smp_call IPI yet (e.g. the stopper
356 	 * executes migration_cpu_stop() on the remote CPU).
357 	 */
358 	if (trace_csd_queue_cpu_enabled()) {
359 		call_single_data_t *csd;
360 		smp_call_func_t func;
361 
362 		csd = container_of(node, call_single_data_t, node.llist);
363 		func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
364 			sched_ttwu_pending : csd->func;
365 
366 		trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
367 	}
368 
369 	/*
370 	 * The list addition should be visible to the target CPU when it pops
371 	 * the head of the list to pull the entry off it in the IPI handler
372 	 * because of normal cache coherency rules implied by the underlying
373 	 * llist ops.
374 	 *
375 	 * If IPIs can go out of order to the cache coherency protocol
376 	 * in an architecture, sufficient synchronisation should be added
377 	 * to arch code to make it appear to obey cache coherency WRT
378 	 * locking and barrier primitives. Generic code isn't really
379 	 * equipped to do the right thing...
380 	 */
381 	if (llist_add(node, &per_cpu(call_single_queue, cpu)))
382 		send_call_function_single_ipi(cpu);
383 }
384 
385 /*
386  * Insert a previously allocated call_single_data_t element
387  * for execution on the given CPU. data must already have
388  * ->func, ->info, and ->flags set.
389  */
390 static int generic_exec_single(int cpu, call_single_data_t *csd)
391 {
392 	if (cpu == smp_processor_id()) {
393 		smp_call_func_t func = csd->func;
394 		void *info = csd->info;
395 		unsigned long flags;
396 
397 		/*
398 		 * We can unlock early even for the synchronous on-stack case,
399 		 * since we're doing this from the same CPU..
400 		 */
401 		csd_lock_record(csd);
402 		csd_unlock(csd);
403 		local_irq_save(flags);
404 		csd_do_func(func, info, NULL);
405 		csd_lock_record(NULL);
406 		local_irq_restore(flags);
407 		return 0;
408 	}
409 
410 	if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
411 		csd_unlock(csd);
412 		return -ENXIO;
413 	}
414 
415 	__smp_call_single_queue(cpu, &csd->node.llist);
416 
417 	return 0;
418 }
419 
/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 *
 * Flushes this CPU's call-function queue with the offline-CPU warning
 * enabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
	__flush_smp_call_function_queue(true);
}
430 
/**
 * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *		      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 *
 * The detached list is processed in three passes: SYNC requests first,
 * then ASYNC and IRQ_WORK requests, and finally whatever is left (TTWU
 * entries) is handed to sched_ttwu_pending() in one go.
 */
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
	call_single_data_t *csd, *csd_next;
	struct llist_node *entry, *prev;
	struct llist_head *head;
	static bool warned;	/* the offline-CPU warning is printed only once */
	atomic_t *tbt;

	lockdep_assert_irqs_disabled();

	/* Allow waiters to send backtrace NMI from here onwards */
	tbt = this_cpu_ptr(&trigger_backtrace);
	atomic_set_release(tbt, 1);

	/* Detach everything queued so far in one operation, then reverse it. */
	head = this_cpu_ptr(&call_single_queue);
	entry = llist_del_all(head);
	entry = llist_reverse_order(entry);

	/* There shouldn't be any pending callbacks on an offline CPU. */
	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
		     !warned && entry != NULL)) {
		warned = true;
		WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

		/*
		 * We don't have to use the _safe() variant here
		 * because we are not invoking the IPI handlers yet.
		 */
		llist_for_each_entry(csd, entry, node.llist) {
			switch (CSD_TYPE(csd)) {
			case CSD_TYPE_ASYNC:
			case CSD_TYPE_SYNC:
			case CSD_TYPE_IRQ_WORK:
				pr_warn("IPI callback %pS sent to offline CPU\n",
					csd->func);
				break;

			case CSD_TYPE_TTWU:
				pr_warn("IPI task-wakeup sent to offline CPU\n");
				break;

			default:
				pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
					CSD_TYPE(csd));
				break;
			}
		}
	}

	/*
	 * First; run all SYNC callbacks, people are waiting for us.
	 */
	prev = NULL;	/* last entry that was kept on the list */
	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
		/* Do we wait until *after* callback? */
		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
			smp_call_func_t func = csd->func;
			void *info = csd->info;

			/* Unlink this entry from the list that remains. */
			if (prev) {
				prev->next = &csd_next->node.llist;
			} else {
				entry = &csd_next->node.llist;
			}

			/* The csd is released only after the function has run. */
			csd_lock_record(csd);
			csd_do_func(func, info, csd);
			csd_unlock(csd);
			csd_lock_record(NULL);
		} else {
			prev = &csd->node.llist;
		}
	}

	/* Nothing but SYNC requests were queued. */
	if (!entry)
		return;

	/*
	 * Second; run all !SYNC callbacks.
	 */
	prev = NULL;
	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
		int type = CSD_TYPE(csd);

		if (type != CSD_TYPE_TTWU) {
			/* Unlink this entry; only TTWU entries stay listed. */
			if (prev) {
				prev->next = &csd_next->node.llist;
			} else {
				entry = &csd_next->node.llist;
			}

			if (type == CSD_TYPE_ASYNC) {
				smp_call_func_t func = csd->func;
				void *info = csd->info;

				/*
				 * Here the csd is released before the call;
				 * func and info were copied out above.
				 */
				csd_lock_record(csd);
				csd_unlock(csd);
				csd_do_func(func, info, csd);
				csd_lock_record(NULL);
			} else if (type == CSD_TYPE_IRQ_WORK) {
				irq_work_single(csd);
			}

		} else {
			prev = &csd->node.llist;
		}
	}

	/*
	 * Third; only CSD_TYPE_TTWU is left, issue those.
	 */
	if (entry) {
		/* The remaining list head is passed as the callback argument. */
		csd = llist_entry(entry, typeof(*csd), node.llist);
		csd_do_func(sched_ttwu_pending, entry, csd);
	}
}
561 
562 
563 /**
564  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
565  *				   from task context (idle, migration thread)
566  *
567  * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
568  * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
569  * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
570  * handle queued SMP function calls before scheduling.
571  *
572  * The migration thread has to ensure that an eventually pending wakeup has
573  * been handled before it migrates a task.
574  */
575 void flush_smp_call_function_queue(void)
576 {
577 	unsigned int was_pending;
578 	unsigned long flags;
579 
580 	if (llist_empty(this_cpu_ptr(&call_single_queue)))
581 		return;
582 
583 	local_irq_save(flags);
584 	/* Get the already pending soft interrupts for RT enabled kernels */
585 	was_pending = local_softirq_pending();
586 	__flush_smp_call_function_queue(true);
587 	if (local_softirq_pending())
588 		do_softirq_post_smp_call_flush(was_pending);
589 
590 	local_irq_restore(flags);
591 }
592 
593 /*
594  * smp_call_function_single - Run a function on a specific CPU
595  * @func: The function to run. This must be fast and non-blocking.
596  * @info: An arbitrary pointer to pass to the function.
597  * @wait: If true, wait until function has completed on other CPUs.
598  *
599  * Returns 0 on success, else a negative status code.
600  */
601 int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
602 			     int wait)
603 {
604 	call_single_data_t *csd;
605 	call_single_data_t csd_stack = {
606 		.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
607 	};
608 	int this_cpu;
609 	int err;
610 
611 	/*
612 	 * prevent preemption and reschedule on another processor,
613 	 * as well as CPU removal
614 	 */
615 	this_cpu = get_cpu();
616 
617 	/*
618 	 * Can deadlock when called with interrupts disabled.
619 	 * We allow cpu's that are not yet online though, as no one else can
620 	 * send smp call function interrupt to this cpu and as such deadlocks
621 	 * can't happen.
622 	 */
623 	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
624 		     && !oops_in_progress);
625 
626 	/*
627 	 * When @wait we can deadlock when we interrupt between llist_add() and
628 	 * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
629 	 * csd_lock() on because the interrupt context uses the same csd
630 	 * storage.
631 	 */
632 	WARN_ON_ONCE(!in_task());
633 
634 	csd = &csd_stack;
635 	if (!wait) {
636 		csd = this_cpu_ptr(&csd_data);
637 		csd_lock(csd);
638 	}
639 
640 	csd->func = func;
641 	csd->info = info;
642 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
643 	csd->node.src = smp_processor_id();
644 	csd->node.dst = cpu;
645 #endif
646 
647 	err = generic_exec_single(cpu, csd);
648 
649 	if (wait)
650 		csd_lock_wait(csd);
651 
652 	put_cpu();
653 
654 	return err;
655 }
656 EXPORT_SYMBOL(smp_call_function_single);
657 
658 /**
659  * smp_call_function_single_async() - Run an asynchronous function on a
660  * 			         specific CPU.
661  * @cpu: The CPU to run on.
662  * @csd: Pre-allocated and setup data structure
663  *
664  * Like smp_call_function_single(), but the call is asynchonous and
665  * can thus be done from contexts with disabled interrupts.
666  *
667  * The caller passes his own pre-allocated data structure
668  * (ie: embedded in an object) and is responsible for synchronizing it
669  * such that the IPIs performed on the @csd are strictly serialized.
670  *
671  * If the function is called with one csd which has not yet been
672  * processed by previous call to smp_call_function_single_async(), the
673  * function will return immediately with -EBUSY showing that the csd
674  * object is still in progress.
675  *
676  * NOTE: Be careful, there is unfortunately no current debugging facility to
677  * validate the correctness of this serialization.
678  *
679  * Return: %0 on success or negative errno value on error
680  */
681 int smp_call_function_single_async(int cpu, call_single_data_t *csd)
682 {
683 	int err = 0;
684 
685 	preempt_disable();
686 
687 	if (csd->node.u_flags & CSD_FLAG_LOCK) {
688 		err = -EBUSY;
689 		goto out;
690 	}
691 
692 	csd->node.u_flags = CSD_FLAG_LOCK;
693 	smp_wmb();
694 
695 	err = generic_exec_single(cpu, csd);
696 
697 out:
698 	preempt_enable();
699 
700 	return err;
701 }
702 EXPORT_SYMBOL_GPL(smp_call_function_single_async);
703 
704 /*
705  * smp_call_function_any - Run a function on any of the given cpus
706  * @mask: The mask of cpus it can run on.
707  * @func: The function to run. This must be fast and non-blocking.
708  * @info: An arbitrary pointer to pass to the function.
709  * @wait: If true, wait until function has completed.
710  *
711  * Returns 0 on success, else a negative status code (if no cpus were online).
712  *
713  * Selection preference:
714  *	1) current cpu if in @mask
715  *	2) any cpu of current node if in @mask
716  *	3) any other online cpu in @mask
717  */
718 int smp_call_function_any(const struct cpumask *mask,
719 			  smp_call_func_t func, void *info, int wait)
720 {
721 	unsigned int cpu;
722 	const struct cpumask *nodemask;
723 	int ret;
724 
725 	/* Try for same CPU (cheapest) */
726 	cpu = get_cpu();
727 	if (cpumask_test_cpu(cpu, mask))
728 		goto call;
729 
730 	/* Try for same node. */
731 	nodemask = cpumask_of_node(cpu_to_node(cpu));
732 	for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
733 	     cpu = cpumask_next_and(cpu, nodemask, mask)) {
734 		if (cpu_online(cpu))
735 			goto call;
736 	}
737 
738 	/* Any online will do: smp_call_function_single handles nr_cpu_ids. */
739 	cpu = cpumask_any_and(mask, cpu_online_mask);
740 call:
741 	ret = smp_call_function_single(cpu, func, info, wait);
742 	put_cpu();
743 	return ret;
744 }
745 EXPORT_SYMBOL_GPL(smp_call_function_any);
746 
/*
 * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 * The two bits are independent and may be OR-ed together.
 *
 * %SCF_WAIT:		Wait until function execution is completed
 * %SCF_RUN_LOCAL:	Run also locally if local cpu is set in cpumask
 */
#define SCF_WAIT	(1U << 0)
#define SCF_RUN_LOCAL	(1U << 1)
755 
/*
 * smp_call_function_many_cond - common engine of the multi-CPU call helpers
 * @mask:      candidate CPUs; only those that are also online are targeted
 * @func:      function to run
 * @info:      argument passed to @func (and to @cond_func)
 * @scf_flags: combination of SCF_WAIT and SCF_RUN_LOCAL
 * @cond_func: optional filter; when non-NULL a CPU only runs @func if
 *             cond_func(cpu, info) returns true
 *
 * Remote CPUs each get a locked per-destination csd queued on their
 * call_single_queue; IPIs go only to CPUs whose queue was empty before.
 * The local CPU, if requested, runs @func directly with interrupts
 * disabled.  Must be called with preemption disabled.
 */
static void smp_call_function_many_cond(const struct cpumask *mask,
					smp_call_func_t func, void *info,
					unsigned int scf_flags,
					smp_cond_func_t cond_func)
{
	int cpu, last_cpu, this_cpu = smp_processor_id();
	struct call_function_data *cfd;	/* only set and used when run_remote */
	bool wait = scf_flags & SCF_WAIT;
	int nr_cpus = 0;		/* number of CPUs that need an IPI */
	bool run_remote = false;
	bool run_local = false;

	lockdep_assert_preemption_disabled();

	/*
	 * Can deadlock when called with interrupts disabled.
	 * We allow cpu's that are not yet online though, as no one else can
	 * send smp call function interrupt to this cpu and as such deadlocks
	 * can't happen.
	 */
	if (cpu_online(this_cpu) && !oops_in_progress &&
	    !early_boot_irqs_disabled)
		lockdep_assert_irqs_enabled();

	/*
	 * When @wait we can deadlock when we interrupt between llist_add() and
	 * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
	 * csd_lock() on because the interrupt context uses the same csd
	 * storage.
	 */
	WARN_ON_ONCE(!in_task());

	/* Check if we need local execution. */
	if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
		run_local = true;

	/* Check if we need remote execution, i.e., any CPU excluding this one. */
	cpu = cpumask_first_and(mask, cpu_online_mask);
	if (cpu == this_cpu)
		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
	if (cpu < nr_cpu_ids)
		run_remote = true;

	if (run_remote) {
		/* Targets: online CPUs in @mask, minus the local CPU. */
		cfd = this_cpu_ptr(&cfd_data);
		cpumask_and(cfd->cpumask, mask, cpu_online_mask);
		__cpumask_clear_cpu(this_cpu, cfd->cpumask);

		cpumask_clear(cfd->cpumask_ipi);
		for_each_cpu(cpu, cfd->cpumask) {
			call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

			/* Filtered out: drop it so it is not waited for below. */
			if (cond_func && !cond_func(cpu, info)) {
				__cpumask_clear_cpu(cpu, cfd->cpumask);
				continue;
			}

			csd_lock(csd);
			if (wait)
				csd->node.u_flags |= CSD_TYPE_SYNC;
			csd->func = func;
			csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
			csd->node.src = smp_processor_id();
			csd->node.dst = cpu;
#endif
			trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

			/* An IPI is needed only if the queue was empty before. */
			if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
				nr_cpus++;
				last_cpu = cpu;
			}
		}

		/*
		 * Choose the most efficient way to send an IPI. Note that the
		 * number of CPUs might be zero due to concurrent changes to the
		 * provided mask.
		 */
		if (nr_cpus == 1)
			send_call_function_single_ipi(last_cpu);
		else if (likely(nr_cpus > 1))
			send_call_function_ipi_mask(cfd->cpumask_ipi);
	}

	/* Local execution happens after the remote requests have been sent. */
	if (run_local && (!cond_func || cond_func(this_cpu, info))) {
		unsigned long flags;

		local_irq_save(flags);
		csd_do_func(func, info, NULL);
		local_irq_restore(flags);
	}

	/* Wait for every remote CPU that was handed a request to release it. */
	if (run_remote && wait) {
		for_each_cpu(cpu, cfd->cpumask) {
			call_single_data_t *csd;

			csd = per_cpu_ptr(cfd->csd, cpu);
			csd_lock_wait(csd);
		}
	}
}
859 
860 /**
861  * smp_call_function_many(): Run a function on a set of CPUs.
862  * @mask: The set of cpus to run on (only runs on online subset).
863  * @func: The function to run. This must be fast and non-blocking.
864  * @info: An arbitrary pointer to pass to the function.
865  * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
866  *        (atomically) until function has completed on other CPUs. If
867  *        %SCF_RUN_LOCAL is set, the function will also be run locally
868  *        if the local CPU is set in the @cpumask.
869  *
870  * If @wait is true, then returns once @func has returned.
871  *
872  * You must not call this function with disabled interrupts or from a
873  * hardware interrupt handler or from a bottom half handler. Preemption
874  * must be disabled when calling this function.
875  */
876 void smp_call_function_many(const struct cpumask *mask,
877 			    smp_call_func_t func, void *info, bool wait)
878 {
879 	smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
880 }
881 EXPORT_SYMBOL(smp_call_function_many);
882 
/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 *
 * Preemption is disabled around the call to smp_call_function_many().
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
	preempt_disable();
	smp_call_function_many(cpu_online_mask, func, info, wait);
	preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);
905 
/*
 * Setup configured maximum number of CPUs to activate.
 * Defaults to NR_CPUS; overridden by the "nosmp" and "maxcpus=" handlers.
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
909 
910 
911 /*
912  * Setup routine for controlling SMP activation
913  *
914  * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
915  * activation entirely (the MPS table probe still happens, though).
916  *
917  * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
918  * greater than 0, limits the maximum number of CPUs activated in
919  * SMP mode to <NUM>.
920  */
921 
/* Weak default that does nothing; an architecture may provide its own. */
void __weak __init arch_disable_smp_support(void) { }
923 
/*
 * "nosmp" early parameter: set setup_max_cpus to 0 and let the
 * architecture disable its SMP support.  @str is ignored; returns 0.
 */
static int __init nosmp(char *str)
{
	setup_max_cpus = 0;
	arch_disable_smp_support();

	return 0;
}

early_param("nosmp", nosmp);
933 
934 /* this is hard limit */
935 static int __init nrcpus(char *str)
936 {
937 	int nr_cpus;
938 
939 	if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
940 		set_nr_cpu_ids(nr_cpus);
941 
942 	return 0;
943 }
944 
945 early_param("nr_cpus", nrcpus);
946 
/*
 * "maxcpus=" early parameter: parse the value straight into
 * setup_max_cpus; a resulting 0 also disables the architecture's SMP
 * support.  Always returns 0.
 *
 * NOTE(review): the result of get_option() is not checked, so an
 * unparsable argument presumably leaves setup_max_cpus unchanged - confirm.
 */
static int __init maxcpus(char *str)
{
	get_option(&str, &setup_max_cpus);
	if (setup_max_cpus == 0)
		arch_disable_smp_support();

	return 0;
}

early_param("maxcpus", maxcpus);
957 
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/*
 * Setup number of possible processor ids.
 * Starts out as NR_CPUS; only defined here as a variable when more than
 * one CPU is configured and CONFIG_FORCE_NR_CPUS is not set.
 */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif
963 
964 /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
965 void __init setup_nr_cpu_ids(void)
966 {
967 	set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
968 }
969 
970 /* Called by boot processor to activate the rest. */
971 void __init smp_init(void)
972 {
973 	int num_nodes, num_cpus;
974 
975 	idle_threads_init();
976 	cpuhp_threads_init();
977 
978 	pr_info("Bringing up secondary CPUs ...\n");
979 
980 	bringup_nonboot_cpus(setup_max_cpus);
981 
982 	num_nodes = num_online_nodes();
983 	num_cpus  = num_online_cpus();
984 	pr_info("Brought up %d node%s, %d CPU%s\n",
985 		num_nodes, (num_nodes > 1 ? "s" : ""),
986 		num_cpus,  (num_cpus  > 1 ? "s" : ""));
987 
988 	/* Any cleanup work */
989 	smp_cpus_done(setup_max_cpus);
990 }
991 
992 /*
993  * on_each_cpu_cond(): Call a function on each processor for which
994  * the supplied function cond_func returns true, optionally waiting
995  * for all the required CPUs to finish. This may include the local
996  * processor.
997  * @cond_func:	A callback function that is passed a cpu id and
998  *		the info parameter. The function is called
999  *		with preemption disabled. The function should
1000  *		return a blooean value indicating whether to IPI
1001  *		the specified CPU.
1002  * @func:	The function to run on all applicable CPUs.
1003  *		This must be fast and non-blocking.
1004  * @info:	An arbitrary pointer to pass to both functions.
1005  * @wait:	If true, wait (atomically) until function has
1006  *		completed on other CPUs.
1007  *
1008  * Preemption is disabled to protect against CPUs going offline but not online.
1009  * CPUs going online during the call will not be seen or sent an IPI.
1010  *
1011  * You must not call this function with disabled interrupts or
1012  * from a hardware interrupt handler or from a bottom half handler.
1013  */
1014 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
1015 			   void *info, bool wait, const struct cpumask *mask)
1016 {
1017 	unsigned int scf_flags = SCF_RUN_LOCAL;
1018 
1019 	if (wait)
1020 		scf_flags |= SCF_WAIT;
1021 
1022 	preempt_disable();
1023 	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
1024 	preempt_enable();
1025 }
1026 EXPORT_SYMBOL(on_each_cpu_cond_mask);
1027 
/* Empty callback used by kick_all_cpus_sync(); @unused is ignored. */
static void do_nothing(void *unused)
{
}
1031 
/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
	/* Make sure the change is visible before we kick the cpus */
	smp_mb();
	/* wait == 1: do not return before every other CPU ran the callback. */
	smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
1050 
/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Tries to break every online CPU other than the calling one out of the
 * idle state, including CPUs that are polling in idle. Nothing is done
 * for CPUs that are not idle.
 */
void wake_up_all_idle_cpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		/* Preemption is off only per iteration, not across the loop. */
		preempt_disable();
		if (cpu != smp_processor_id() && cpu_online(cpu))
			wake_up_if_idle(cpu);
		preempt_enable();
	}
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
1069 
/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct
 * @done: &completion to signal
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: CPU to pin the vCPU to while @func runs, or %-1 for no pinning
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
	struct work_struct	work;
	struct completion	done;
	int			(*func)(void *);
	void			*data;
	int			ret;
	int			cpu;
};
1091 
1092 static void smp_call_on_cpu_callback(struct work_struct *work)
1093 {
1094 	struct smp_call_on_cpu_struct *sscs;
1095 
1096 	sscs = container_of(work, struct smp_call_on_cpu_struct, work);
1097 	if (sscs->cpu >= 0)
1098 		hypervisor_pin_vcpu(sscs->cpu);
1099 	sscs->ret = sscs->func(sscs->data);
1100 	if (sscs->cpu >= 0)
1101 		hypervisor_pin_vcpu(-1);
1102 
1103 	complete(&sscs->done);
1104 }
1105 
1106 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
1107 {
1108 	struct smp_call_on_cpu_struct sscs = {
1109 		.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
1110 		.func = func,
1111 		.data = par,
1112 		.cpu  = phys ? cpu : -1,
1113 	};
1114 
1115 	INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
1116 
1117 	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
1118 		return -ENXIO;
1119 
1120 	queue_work_on(cpu, system_wq, &sscs.work);
1121 	wait_for_completion(&sscs.done);
1122 
1123 	return sscs.ret;
1124 }
1125 EXPORT_SYMBOL_GPL(smp_call_on_cpu);
1126