/* xref: /linux/kernel/stop_machine.c (revision 0b26351b910fb8fe6a056f8a1bbccabe50c0e19f) */
/*
 * kernel/stop_machine.c
 *
 * Copyright (C) 2008, 2005	IBM Corporation.
 * Copyright (C) 2008, 2005	Rusty Russell rusty@rustcorp.com.au
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2 and any later version.
 */
#include <linux/completion.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/stop_machine.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <linux/sched/wake_q.h>

/*
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
struct cpu_stop_done {
	atomic_t		nr_todo;	/* nr left to execute */
	int			ret;		/* collected return value */
	struct completion	completion;	/* fired if nr_todo reaches 0 */
};

/* the actual stopper, one per possible cpu, enabled on online cpus */
struct cpu_stopper {
	struct task_struct	*thread;

	spinlock_t		lock;
	bool			enabled;	/* is this stopper enabled? */
	struct list_head	works;		/* list of pending works */

	struct cpu_stop_work	stop_work;	/* for stop_cpus */
};

static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;

/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;

static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
	memset(done, 0, sizeof(*done));
	atomic_set(&done->nr_todo, nr_todo);
	init_completion(&done->completion);
}

/* signal completion unless @done is NULL */
static void cpu_stop_signal_done(struct cpu_stop_done *done)
{
	if (atomic_dec_and_test(&done->nr_todo))
		complete(&done->completion);
}

static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
					struct cpu_stop_work *work,
					struct wake_q_head *wakeq)
{
	list_add_tail(&work->list, &stopper->works);
	wake_q_add(wakeq, stopper->thread);
}

/* queue @work to @stopper.  If offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	DEFINE_WAKE_Q(wakeq);
	unsigned long flags;
	bool enabled;

	spin_lock_irqsave(&stopper->lock, flags);
	enabled = stopper->enabled;
	if (enabled)
		__cpu_stop_queue_work(stopper, work, &wakeq);
	else if (work->done)
		cpu_stop_signal_done(work->done);
	spin_unlock_irqrestore(&stopper->lock, flags);

	wake_up_q(&wakeq);

	return enabled;
}

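/*
 * Note on the wake_q usage above: the wakeup of the stopper thread is only
 * collected on a wake_q while stopper->lock is held; the actual
 * wake_up_q() call happens after the lock has been dropped, which keeps
 * the wakeup (and the scheduler locks it takes) outside the stopper->lock
 * critical section.
 */
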
/**
 * stop_one_cpu - stop a cpu
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
 * the highest priority preempting any task on the cpu and
 * monopolizing it.  This function returns after the execution is
 * complete.
 *
 * This function doesn't guarantee @cpu stays online till @fn
 * completes.  If @cpu goes down in the middle, execution may happen
 * partially or fully on different cpus.  @fn should either be ready
 * for that or the caller should ensure that @cpu stays online until
 * this function completes.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
 * otherwise, the return value of @fn.
 */
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };

	cpu_stop_init_done(&done, 1);
	if (!cpu_stop_queue_work(cpu, &work))
		return -ENOENT;
	/*
	 * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
	 * cycle by doing a preemption:
	 */
	cond_resched();
	wait_for_completion(&done.completion);
	return done.ret;
}

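/*
 * Example (illustrative sketch, not part of this file): running a short,
 * non-sleeping callback on one particular CPU and collecting its return
 * value.  The names below are made up; the callback runs in stopper
 * context with every other task on that CPU preempted.
 *
 *	static int set_feature_flag(void *arg)
 *	{
 *		bool enable = *(bool *)arg;
 *
 *		// must not sleep; runs with the target CPU monopolized
 *		some_percpu_feature_enable(enable);	// hypothetical helper
 *		return 0;
 *	}
 *
 *	bool on = true;
 *	int err = stop_one_cpu(2, set_feature_flag, &on);
 *	// err is -ENOENT if CPU 2 was offline, otherwise the callback's
 *	// return value
 */
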
/* This controls the threads on each CPU. */
enum multi_stop_state {
	/* Dummy starting state for thread. */
	MULTI_STOP_NONE,
	/* Awaiting everyone to be scheduled. */
	MULTI_STOP_PREPARE,
	/* Disable interrupts. */
	MULTI_STOP_DISABLE_IRQ,
	/* Run the function */
	MULTI_STOP_RUN,
	/* Exit */
	MULTI_STOP_EXIT,
};

struct multi_stop_data {
	cpu_stop_fn_t		fn;
	void			*data;
	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
	unsigned int		num_threads;
	const struct cpumask	*active_cpus;

	enum multi_stop_state	state;
	atomic_t		thread_ack;
};

static void set_state(struct multi_stop_data *msdata,
		      enum multi_stop_state newstate)
{
	/* Reset ack counter. */
	atomic_set(&msdata->thread_ack, msdata->num_threads);
	smp_wmb();
	msdata->state = newstate;
}

/* Last one to ack a state moves to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
	if (atomic_dec_and_test(&msdata->thread_ack))
		set_state(msdata, msdata->state + 1);
}

/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
	struct multi_stop_data *msdata = data;
	enum multi_stop_state curstate = MULTI_STOP_NONE;
	int cpu = smp_processor_id(), err = 0;
	unsigned long flags;
	bool is_active;

	/*
	 * When called from stop_machine_from_inactive_cpu(), irq might
	 * already be disabled.  Save the state and restore it on exit.
	 */
	local_save_flags(flags);

	if (!msdata->active_cpus)
		is_active = cpu == cpumask_first(cpu_online_mask);
	else
		is_active = cpumask_test_cpu(cpu, msdata->active_cpus);

	/* Simple state machine */
	do {
		/* Chill out and ensure we re-read multi_stop_state. */
		cpu_relax_yield();
		if (msdata->state != curstate) {
			curstate = msdata->state;
			switch (curstate) {
			case MULTI_STOP_DISABLE_IRQ:
				local_irq_disable();
				hard_irq_disable();
				break;
			case MULTI_STOP_RUN:
				if (is_active)
					err = msdata->fn(msdata->data);
				break;
			default:
				break;
			}
			ack_state(msdata);
		} else if (curstate > MULTI_STOP_PREPARE) {
			/*
			 * At this stage all other CPUs we depend on must spin
			 * in the same loop. Any reason for hard-lockup should
			 * be detected and reported on their side.
			 */
			touch_nmi_watchdog();
		}
	} while (curstate != MULTI_STOP_EXIT);

	local_irq_restore(flags);
	return err;
}

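/*
 * The handshake above is a simple rendezvous: every participating stopper
 * thread spins on msdata->state, and the last thread to call ack_state()
 * (the atomic_dec_and_test() that hits zero) advances the state for
 * everyone.  For two CPUs the progression looks roughly like this
 * (illustrative only):
 *
 *	state			CPU0 (active)		CPU1
 *	MULTI_STOP_PREPARE	ack			ack
 *	MULTI_STOP_DISABLE_IRQ	irqs off, ack		irqs off, ack
 *	MULTI_STOP_RUN		err = fn(data), ack	ack
 *	MULTI_STOP_EXIT		restore flags, return	restore flags, return
 */
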
static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
				    int cpu2, struct cpu_stop_work *work2)
{
	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
	DEFINE_WAKE_Q(wakeq);
	int err;
retry:
	spin_lock_irq(&stopper1->lock);
	spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);

	err = -ENOENT;
	if (!stopper1->enabled || !stopper2->enabled)
		goto unlock;
	/*
	 * Ensure that if we race with __stop_cpus() the stoppers won't get
	 * queued up in reverse order leading to system deadlock.
	 *
	 * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
	 * queued a work on cpu1 but not on cpu2, because we hold both locks.
	 *
	 * It can be falsely true but it is safe to spin until it is cleared;
	 * queue_stop_cpus_work() does everything under preempt_disable().
	 */
	err = -EDEADLK;
	if (unlikely(stop_cpus_in_progress))
		goto unlock;

	err = 0;
	__cpu_stop_queue_work(stopper1, work1, &wakeq);
	__cpu_stop_queue_work(stopper2, work2, &wakeq);
unlock:
	spin_unlock(&stopper2->lock);
	spin_unlock_irq(&stopper1->lock);

	if (unlikely(err == -EDEADLK)) {
		while (stop_cpus_in_progress)
			cpu_relax();
		goto retry;
	}

	wake_up_q(&wakeq);

	return err;
}

/**
 * stop_two_cpus - stops two cpus
 * @cpu1: the cpu to stop
 * @cpu2: the other cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Stops both specified CPUs and runs @fn on one of them.
 *
 * Returns when both are completed.
 */
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work1, work2;
	struct multi_stop_data msdata;

	msdata = (struct multi_stop_data){
		.fn = fn,
		.data = arg,
		.num_threads = 2,
		.active_cpus = cpumask_of(cpu1),
	};

	work1 = work2 = (struct cpu_stop_work){
		.fn = multi_cpu_stop,
		.arg = &msdata,
		.done = &done
	};

	cpu_stop_init_done(&done, 2);
	set_state(&msdata, MULTI_STOP_PREPARE);

	if (cpu1 > cpu2)
		swap(cpu1, cpu2);
	if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
		return -ENOENT;

	wait_for_completion(&done.completion);
	return done.ret;
}

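/*
 * Example (illustrative sketch, not part of this file): stop_two_cpus()
 * is what the scheduler's migrate_swap() uses to swap two tasks across
 * CPUs.  A hypothetical caller looks roughly like this; the callback and
 * argument type are made up:
 *
 *	static int swap_state(void *arg)
 *	{
 *		struct my_swap_args *args = arg;
 *
 *		// runs on one of the two CPUs; the other is spinning in
 *		// multi_cpu_stop() with interrupts disabled, so neither
 *		// CPU can race with the update below
 *		return do_the_swap(args);	// hypothetical helper
 *	}
 *
 *	err = stop_two_cpus(src_cpu, dst_cpu, swap_state, &args);
 */
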
/**
 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
 * @cpu: cpu to stop
 * @fn: function to execute
 * @arg: argument to @fn
 * @work_buf: pointer to cpu_stop_work structure
 *
 * Similar to stop_one_cpu() but doesn't wait for completion.  The
 * caller is responsible for ensuring @work_buf is currently unused
 * and will remain untouched until stopper starts executing @fn.
 *
 * CONTEXT:
 * Don't care.
 *
 * RETURNS:
 * true if cpu_stop_work was queued successfully and @fn will be called,
 * false otherwise.
 */
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			struct cpu_stop_work *work_buf)
{
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
	return cpu_stop_queue_work(cpu, work_buf);
}

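/*
 * Example (illustrative, with made-up names): fire-and-forget use of
 * stop_one_cpu_nowait().  The work buffer must stay valid until the
 * stopper has started executing the callback, so a per-cpu buffer is a
 * common choice:
 *
 *	static DEFINE_PER_CPU(struct cpu_stop_work, my_stop_work);
 *
 *	static int push_task_away(void *arg)
 *	{
 *		// runs later, in stopper context on the target CPU
 *		return 0;
 *	}
 *
 *	stop_one_cpu_nowait(cpu, push_task_away, NULL,
 *			    &per_cpu(my_stop_work, cpu));
 */
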
static bool queue_stop_cpus_work(const struct cpumask *cpumask,
				 cpu_stop_fn_t fn, void *arg,
				 struct cpu_stop_done *done)
{
	struct cpu_stop_work *work;
	unsigned int cpu;
	bool queued = false;

	/*
	 * Disable preemption while queueing to avoid getting
	 * preempted by a stopper which might wait for other stoppers
	 * to enter @fn, which can lead to deadlock.
	 */
	preempt_disable();
	stop_cpus_in_progress = true;
	for_each_cpu(cpu, cpumask) {
		work = &per_cpu(cpu_stopper.stop_work, cpu);
		work->fn = fn;
		work->arg = arg;
		work->done = done;
		if (cpu_stop_queue_work(cpu, work))
			queued = true;
	}
	stop_cpus_in_progress = false;
	preempt_enable();

	return queued;
}

static int __stop_cpus(const struct cpumask *cpumask,
		       cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;

	cpu_stop_init_done(&done, cpumask_weight(cpumask));
	if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
		return -ENOENT;
	wait_for_completion(&done.completion);
	return done.ret;
}

/**
 * stop_cpus - stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
 * @fn is run in a process context with the highest priority
 * preempting any task on the cpu and monopolizing it.  This function
 * returns after all executions are complete.
 *
 * This function doesn't guarantee the cpus in @cpumask stay online
 * till @fn completes.  If some cpus go down in the middle, execution
 * may happen partially or fully on different cpus.  @fn
 * should either be ready for that or the caller should ensure that
 * the cpus stay online until this function completes.
 *
 * All stop_cpus() calls are serialized making it safe for @fn to wait
 * for all cpus to start executing it.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
 * @cpumask were offline; otherwise, 0 if all executions of @fn
 * returned 0, or a non-zero return value if any execution returned
 * non-zero.
 */
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
	int ret;

	/* static works are used, process one request at a time */
	mutex_lock(&stop_cpus_mutex);
	ret = __stop_cpus(cpumask, fn, arg);
	mutex_unlock(&stop_cpus_mutex);
	return ret;
}

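/*
 * Example (illustrative sketch with made-up names): running a
 * non-sleeping callback on every online CPU at once.  Because all
 * stop_cpus() calls are serialized, @fn may safely wait for the other
 * CPUs to reach it.
 *
 *	static int drain_percpu_caches(void *unused)
 *	{
 *		// runs in stopper context on each online CPU
 *		drain_this_cpus_cache();	// hypothetical helper
 *		return 0;
 *	}
 *
 *	err = stop_cpus(cpu_online_mask, drain_percpu_caches, NULL);
 */
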
/**
 * try_stop_cpus - try to stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Identical to stop_cpus() except that it fails with -EAGAIN if
 * someone else is already using the facility.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
 * @fn(@arg) was not executed at all because all cpus in @cpumask were
 * offline; otherwise, 0 if all executions of @fn returned 0, or a
 * non-zero return value if any execution returned non-zero.
 */
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
	int ret;

	/* static works are used, process one request at a time */
	if (!mutex_trylock(&stop_cpus_mutex))
		return -EAGAIN;
	ret = __stop_cpus(cpumask, fn, arg);
	mutex_unlock(&stop_cpus_mutex);
	return ret;
}

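/*
 * Example (illustrative): a caller that must not block on
 * stop_cpus_mutex can use try_stop_cpus() and back off on -EAGAIN:
 *
 *	err = try_stop_cpus(cpu_online_mask, my_fn, NULL);
 *	if (err == -EAGAIN)
 *		return restart_later();		// hypothetical: retry later
 */
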
static int cpu_stop_should_run(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	unsigned long flags;
	int run;

	spin_lock_irqsave(&stopper->lock, flags);
	run = !list_empty(&stopper->works);
	spin_unlock_irqrestore(&stopper->lock, flags);
	return run;
}

static void cpu_stopper_thread(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct cpu_stop_work *work;

repeat:
	work = NULL;
	spin_lock_irq(&stopper->lock);
	if (!list_empty(&stopper->works)) {
		work = list_first_entry(&stopper->works,
					struct cpu_stop_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_irq(&stopper->lock);

	if (work) {
		cpu_stop_fn_t fn = work->fn;
		void *arg = work->arg;
		struct cpu_stop_done *done = work->done;
		int ret;

		/* cpu stop callbacks must not sleep, make in_atomic() == T */
		preempt_count_inc();
		ret = fn(arg);
		if (done) {
			if (ret)
				done->ret = ret;
			cpu_stop_signal_done(done);
		}
		preempt_count_dec();
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
		goto repeat;
	}
}

void stop_machine_park(int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	/*
	 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
	 * the pending works before it parks; until then it is fine to queue
	 * new works.
	 */
	stopper->enabled = false;
	kthread_park(stopper->thread);
}

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

static void cpu_stop_create(unsigned int cpu)
{
	sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
}

static void cpu_stop_park(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	WARN_ON(!list_empty(&stopper->works));
}

void stop_machine_unpark(int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

	stopper->enabled = true;
	kthread_unpark(stopper->thread);
}

static struct smp_hotplug_thread cpu_stop_threads = {
	.store			= &cpu_stopper.thread,
	.thread_should_run	= cpu_stop_should_run,
	.thread_fn		= cpu_stopper_thread,
	.thread_comm		= "migration/%u",
	.create			= cpu_stop_create,
	.park			= cpu_stop_park,
	.selfparking		= true,
};

static int __init cpu_stop_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

		spin_lock_init(&stopper->lock);
		INIT_LIST_HEAD(&stopper->works);
	}

	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
	stop_machine_unpark(raw_smp_processor_id());
	stop_machine_initialized = true;
	return 0;
}
early_initcall(cpu_stop_init);

int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
			    const struct cpumask *cpus)
{
	struct multi_stop_data msdata = {
		.fn = fn,
		.data = data,
		.num_threads = num_online_cpus(),
		.active_cpus = cpus,
	};

	lockdep_assert_cpus_held();

	if (!stop_machine_initialized) {
		/*
		 * Handle the case where stop_machine() is called
		 * early in boot, before the stopper threads have been
		 * initialized.
		 */
		unsigned long flags;
		int ret;

		WARN_ON_ONCE(msdata.num_threads != 1);

		local_irq_save(flags);
		hard_irq_disable();
		ret = (*fn)(data);
		local_irq_restore(flags);

		return ret;
	}

	/* Set the initial state and stop all online cpus. */
	set_state(&msdata, MULTI_STOP_PREPARE);
	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}

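/*
 * Example (illustrative sketch): stop_machine_cpuslocked() is for callers
 * that already hold the CPU hotplug read lock.  Names below are made up;
 * passing NULL for @cpus makes the callback run on the first online CPU
 * while every other online CPU spins with interrupts disabled.
 *
 *	cpus_read_lock();
 *	ret = stop_machine_cpuslocked(apply_update, &update_data, NULL);
 *	cpus_read_unlock();
 */
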
int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
	int ret;

	/* No CPUs can come up or down during this. */
	cpus_read_lock();
	ret = stop_machine_cpuslocked(fn, data, cpus);
	cpus_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);

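/*
 * Example (illustrative, not part of this file): the classic stop_machine()
 * use case is mutating state that other CPUs could otherwise be executing
 * or reading concurrently.  Names below are hypothetical.
 *
 *	static int apply_text_patch(void *arg)
 *	{
 *		struct my_patch *p = arg;
 *
 *		// every other online CPU is spinning with interrupts
 *		// disabled, so nothing can be running the code being
 *		// patched while this executes
 *		write_patch(p);			// hypothetical helper
 *		return 0;
 *	}
 *
 *	err = stop_machine(apply_text_patch, &patch, NULL);
 */
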
/**
 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
 * @fn: the function to run
 * @data: the data ptr for the @fn()
 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
 *
 * This is identical to stop_machine() but can be called from a CPU which
 * is not active.  The local CPU is in the process of hotplug (so no other
 * CPU hotplug can start), is not marked active, and doesn't have enough
 * context to sleep.
 *
 * This function provides stop_machine() functionality for such state by
 * using busy-wait for synchronization and executing @fn directly on the
 * local CPU.
 *
 * CONTEXT:
 * Local CPU is inactive.  Temporarily stops all active CPUs.
 *
 * RETURNS:
 * 0 if all executions of @fn returned 0, or a non-zero return value if
 * any execution returned non-zero.
 */
int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
				  const struct cpumask *cpus)
{
	struct multi_stop_data msdata = { .fn = fn, .data = data,
					    .active_cpus = cpus };
	struct cpu_stop_done done;
	int ret;

	/* Local CPU must be inactive and CPU hotplug in progress. */
	BUG_ON(cpu_active(raw_smp_processor_id()));
	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */

	/* No proper task established and can't sleep - busy wait for lock. */
	while (!mutex_trylock(&stop_cpus_mutex))
		cpu_relax();

	/* Schedule work on other CPUs and execute directly for local CPU */
	set_state(&msdata, MULTI_STOP_PREPARE);
	cpu_stop_init_done(&done, num_active_cpus());
	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
			     &done);
	ret = multi_cpu_stop(&msdata);

	/* Busy wait for completion. */
	while (!completion_done(&done.completion))
		cpu_relax();

	mutex_unlock(&stop_cpus_mutex);
	return ret ?: done.ret;
}

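/*
 * Example (illustrative): stop_machine_from_inactive_cpu() is intended for
 * a CPU that is being brought up and is not yet active, e.g. arch code
 * that has to bring global hardware state into sync before the new CPU is
 * allowed into the scheduler (the x86 MTRR rendezvous is one such user).
 * A sketch with a made-up callback, called on the incoming CPU:
 *
 *	ret = stop_machine_from_inactive_cpu(sync_global_hw_state, NULL, NULL);
 */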