xref: /linux/kernel/sched/membarrier.c (revision 79790b6818e96c58fe2bffee1b418c16e64e7b80)
1c942fddfSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
222e4ebb9SMathieu Desnoyers /*
322e4ebb9SMathieu Desnoyers  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
422e4ebb9SMathieu Desnoyers  *
522e4ebb9SMathieu Desnoyers  * membarrier system call
622e4ebb9SMathieu Desnoyers  */
722e4ebb9SMathieu Desnoyers 
822e4ebb9SMathieu Desnoyers /*
925595eb6SMathieu Desnoyers  * For documentation purposes, here are some membarrier ordering
1025595eb6SMathieu Desnoyers  * scenarios to keep in mind:
1125595eb6SMathieu Desnoyers  *
1225595eb6SMathieu Desnoyers  * A) Userspace thread execution after IPI vs membarrier's memory
1325595eb6SMathieu Desnoyers  *    barrier before sending the IPI
1425595eb6SMathieu Desnoyers  *
1525595eb6SMathieu Desnoyers  * Userspace variables:
1625595eb6SMathieu Desnoyers  *
1725595eb6SMathieu Desnoyers  * int x = 0, y = 0;
1825595eb6SMathieu Desnoyers  *
1925595eb6SMathieu Desnoyers  * The memory barrier at the start of membarrier() on CPU0 is necessary in
2025595eb6SMathieu Desnoyers  * order to enforce the guarantee that any writes occurring on CPU0 before
2125595eb6SMathieu Desnoyers  * the membarrier() is executed will be visible to any code executing on
2225595eb6SMathieu Desnoyers  * CPU1 after the IPI-induced memory barrier:
2325595eb6SMathieu Desnoyers  *
2425595eb6SMathieu Desnoyers  *         CPU0                              CPU1
2525595eb6SMathieu Desnoyers  *
2625595eb6SMathieu Desnoyers  *         x = 1
2725595eb6SMathieu Desnoyers  *         membarrier():
2825595eb6SMathieu Desnoyers  *           a: smp_mb()
2925595eb6SMathieu Desnoyers  *           b: send IPI                       IPI-induced mb
3025595eb6SMathieu Desnoyers  *           c: smp_mb()
3125595eb6SMathieu Desnoyers  *         r2 = y
3225595eb6SMathieu Desnoyers  *                                           y = 1
3325595eb6SMathieu Desnoyers  *                                           barrier()
3425595eb6SMathieu Desnoyers  *                                           r1 = x
3525595eb6SMathieu Desnoyers  *
3625595eb6SMathieu Desnoyers  *                     BUG_ON(r1 == 0 && r2 == 0)
3725595eb6SMathieu Desnoyers  *
3825595eb6SMathieu Desnoyers  * The write to y and load from x by CPU1 are unordered by the hardware,
3925595eb6SMathieu Desnoyers  * so it's possible to have "r1 = x" reordered before "y = 1" at any
4025595eb6SMathieu Desnoyers  * point after (b).  If the memory barrier at (a) is omitted, then "x = 1"
4125595eb6SMathieu Desnoyers  * can be reordered after (a) (although not after (c)), so we get r1 == 0
4225595eb6SMathieu Desnoyers  * and r2 == 0.  This violates the guarantee that membarrier() is
4325595eb6SMathieu Desnoyers  * supposed to provide.
4425595eb6SMathieu Desnoyers  *
4525595eb6SMathieu Desnoyers  * The timing of the memory barrier at (a) has to ensure that it executes
4625595eb6SMathieu Desnoyers  * before the IPI-induced memory barrier on CPU1.
4725595eb6SMathieu Desnoyers  *
4825595eb6SMathieu Desnoyers  * B) Userspace thread execution before IPI vs membarrier's memory
4925595eb6SMathieu Desnoyers  *    barrier after completing the IPI
5025595eb6SMathieu Desnoyers  *
5125595eb6SMathieu Desnoyers  * Userspace variables:
5225595eb6SMathieu Desnoyers  *
5325595eb6SMathieu Desnoyers  * int x = 0, y = 0;
5425595eb6SMathieu Desnoyers  *
5525595eb6SMathieu Desnoyers  * The memory barrier at the end of membarrier() on CPU0 is necessary in
5625595eb6SMathieu Desnoyers  * order to enforce the guarantee that any writes occurring on CPU1 before
5725595eb6SMathieu Desnoyers  * the membarrier() is executed will be visible to any code executing on
5825595eb6SMathieu Desnoyers  * CPU0 after the membarrier():
5925595eb6SMathieu Desnoyers  *
6025595eb6SMathieu Desnoyers  *         CPU0                              CPU1
6125595eb6SMathieu Desnoyers  *
6225595eb6SMathieu Desnoyers  *                                           x = 1
6325595eb6SMathieu Desnoyers  *                                           barrier()
6425595eb6SMathieu Desnoyers  *                                           y = 1
6525595eb6SMathieu Desnoyers  *         r2 = y
6625595eb6SMathieu Desnoyers  *         membarrier():
6725595eb6SMathieu Desnoyers  *           a: smp_mb()
6825595eb6SMathieu Desnoyers  *           b: send IPI                       IPI-induced mb
6925595eb6SMathieu Desnoyers  *           c: smp_mb()
7025595eb6SMathieu Desnoyers  *         r1 = x
7125595eb6SMathieu Desnoyers  *         BUG_ON(r1 == 0 && r2 == 1)
7225595eb6SMathieu Desnoyers  *
7325595eb6SMathieu Desnoyers  * The writes to x and y are unordered by the hardware, so it's possible to
7425595eb6SMathieu Desnoyers  * have "r2 == 1" even though the write to x doesn't execute until (b).  If
7525595eb6SMathieu Desnoyers  * the memory barrier at (c) is omitted, then "r1 = x" can be reordered
7625595eb6SMathieu Desnoyers  * before (b) (although not before (a)), so we get "r1 == 0".  This violates
7725595eb6SMathieu Desnoyers  * the guarantee that membarrier() is supposed to provide.
7825595eb6SMathieu Desnoyers  *
7925595eb6SMathieu Desnoyers  * The timing of the memory barrier at (c) has to ensure that it executes
8025595eb6SMathieu Desnoyers  * after the IPI-induced memory barrier on CPU1.
8125595eb6SMathieu Desnoyers  *
8225595eb6SMathieu Desnoyers  * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
8325595eb6SMathieu Desnoyers  *
8425595eb6SMathieu Desnoyers  *           CPU0                            CPU1
8525595eb6SMathieu Desnoyers  *
8625595eb6SMathieu Desnoyers  *           membarrier():
8725595eb6SMathieu Desnoyers  *           a: smp_mb()
8825595eb6SMathieu Desnoyers  *                                           d: switch to kthread (includes mb)
8925595eb6SMathieu Desnoyers  *           b: read rq->curr->mm == NULL
9025595eb6SMathieu Desnoyers  *                                           e: switch to user (includes mb)
9125595eb6SMathieu Desnoyers  *           c: smp_mb()
9225595eb6SMathieu Desnoyers  *
9325595eb6SMathieu Desnoyers  * Using the scenario from (A), we can show that (a) needs to be paired
9425595eb6SMathieu Desnoyers  * with (e). Using the scenario from (B), we can show that (c) needs to
9525595eb6SMathieu Desnoyers  * be paired with (d).
9625595eb6SMathieu Desnoyers  *
9725595eb6SMathieu Desnoyers  * D) exit_mm vs membarrier
9825595eb6SMathieu Desnoyers  *
9925595eb6SMathieu Desnoyers  * Two thread groups are created, A and B.  Thread group B is created by
10025595eb6SMathieu Desnoyers  * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
10125595eb6SMathieu Desnoyers  * Let's assume we have a single thread within each thread group (Thread A
10225595eb6SMathieu Desnoyers  * and Thread B).  Thread A runs on CPU0, Thread B runs on CPU1.
10325595eb6SMathieu Desnoyers  *
10425595eb6SMathieu Desnoyers  *           CPU0                            CPU1
10525595eb6SMathieu Desnoyers  *
10625595eb6SMathieu Desnoyers  *           membarrier():
10725595eb6SMathieu Desnoyers  *             a: smp_mb()
10825595eb6SMathieu Desnoyers  *                                           exit_mm():
10925595eb6SMathieu Desnoyers  *                                             d: smp_mb()
11025595eb6SMathieu Desnoyers  *                                             e: current->mm = NULL
11125595eb6SMathieu Desnoyers  *             b: read rq->curr->mm == NULL
11225595eb6SMathieu Desnoyers  *             c: smp_mb()
11325595eb6SMathieu Desnoyers  *
11425595eb6SMathieu Desnoyers  * Using scenario (B), we can show that (c) needs to be paired with (d).
11525595eb6SMathieu Desnoyers  *
11625595eb6SMathieu Desnoyers  * E) kthread_{use,unuse}_mm vs membarrier
11725595eb6SMathieu Desnoyers  *
11825595eb6SMathieu Desnoyers  *           CPU0                            CPU1
11925595eb6SMathieu Desnoyers  *
12025595eb6SMathieu Desnoyers  *           membarrier():
12125595eb6SMathieu Desnoyers  *           a: smp_mb()
12225595eb6SMathieu Desnoyers  *                                           kthread_unuse_mm()
12325595eb6SMathieu Desnoyers  *                                             d: smp_mb()
12425595eb6SMathieu Desnoyers  *                                             e: current->mm = NULL
12525595eb6SMathieu Desnoyers  *           b: read rq->curr->mm == NULL
12625595eb6SMathieu Desnoyers  *                                           kthread_use_mm()
12725595eb6SMathieu Desnoyers  *                                             f: current->mm = mm
12825595eb6SMathieu Desnoyers  *                                             g: smp_mb()
12925595eb6SMathieu Desnoyers  *           c: smp_mb()
13025595eb6SMathieu Desnoyers  *
13125595eb6SMathieu Desnoyers  * Using the scenario from (A), we can show that (a) needs to be paired
13225595eb6SMathieu Desnoyers  * with (g). Using the scenario from (B), we can show that (c) needs to
13325595eb6SMathieu Desnoyers  * be paired with (d).
13425595eb6SMathieu Desnoyers  */
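
/*
 * Illustrative only (not part of the kernel sources): a minimal userspace
 * sketch of scenario (A) above using the private expedited command.  It
 * assumes a kernel with membarrier support and calls the syscall directly
 * via syscall(2); error handling and thread setup are omitted.
 *
 *   #define _GNU_SOURCE
 *   #include <linux/membarrier.h>
 *   #include <sys/syscall.h>
 *   #include <unistd.h>
 *
 *   static int membarrier(int cmd, unsigned int flags, int cpu_id)
 *   {
 *           return syscall(__NR_membarrier, cmd, flags, cpu_id);
 *   }
 *
 *   volatile int x, y;
 *
 *   Done once, before the two threads below are started:
 *     membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *
 *   Thread playing CPU0 in scenario (A):
 *     x = 1;
 *     membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *     r2 = y;
 *
 *   Thread playing CPU1 in scenario (A):
 *     y = 1;
 *     asm volatile("" ::: "memory");      compiler barrier(), as in (A)
 *     r1 = x;
 *
 * Per the guarantee documented in scenario (A), the outcome
 * r1 == 0 && r2 == 0 must be impossible.
 */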
13525595eb6SMathieu Desnoyers 
13625595eb6SMathieu Desnoyers /*
13722e4ebb9SMathieu Desnoyers  * Bitmask made from an OR of all commands within enum membarrier_cmd,
13822e4ebb9SMathieu Desnoyers  * except MEMBARRIER_CMD_QUERY.
13922e4ebb9SMathieu Desnoyers  */
14070216e18SMathieu Desnoyers #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
14170216e18SMathieu Desnoyers #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
14270216e18SMathieu Desnoyers 	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
14370216e18SMathieu Desnoyers 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
14470216e18SMathieu Desnoyers #else
14570216e18SMathieu Desnoyers #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
14670216e18SMathieu Desnoyers #endif
14770216e18SMathieu Desnoyers 
1482a36ab71SPeter Oskolkov #ifdef CONFIG_RSEQ
14980923261SMathieu Desnoyers #define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
1502a36ab71SPeter Oskolkov 	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
15180923261SMathieu Desnoyers 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
1522a36ab71SPeter Oskolkov #else
15380923261SMathieu Desnoyers #define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
1542a36ab71SPeter Oskolkov #endif
1552a36ab71SPeter Oskolkov 
15622e4ebb9SMathieu Desnoyers #define MEMBARRIER_CMD_BITMASK						\
157c5f58bd5SMathieu Desnoyers 	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
158c5f58bd5SMathieu Desnoyers 	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
159c5f58bd5SMathieu Desnoyers 	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
16070216e18SMathieu Desnoyers 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
16180923261SMathieu Desnoyers 	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\
162544a4f2eSMichal Clapinski 	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK			\
163544a4f2eSMichal Clapinski 	| MEMBARRIER_CMD_GET_REGISTRATIONS)
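
/*
 * MEMBARRIER_CMD_BITMASK is what MEMBARRIER_CMD_QUERY reports back to
 * userspace (minus MEMBARRIER_CMD_GLOBAL on nohz_full kernels, see
 * sys_membarrier() below).  A hedged sketch of userspace feature
 * detection, assuming the raw syscall(2) interface:
 *
 *   int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0, 0);
 *
 *   if (mask < 0)
 *           ...                     no membarrier support (ENOSYS)
 *   else if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)
 *           ...                     private expedited commands available
 */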
16422e4ebb9SMathieu Desnoyers 
165944d5fe5SLinus Torvalds static DEFINE_MUTEX(membarrier_ipi_mutex);
166944d5fe5SLinus Torvalds #define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
167944d5fe5SLinus Torvalds 
16822e4ebb9SMathieu Desnoyers static void ipi_mb(void *info)
16922e4ebb9SMathieu Desnoyers {
17022e4ebb9SMathieu Desnoyers 	smp_mb();	/* IPIs should be serializing but paranoid. */
17122e4ebb9SMathieu Desnoyers }
17222e4ebb9SMathieu Desnoyers 
173758c9373SAndy Lutomirski static void ipi_sync_core(void *info)
174758c9373SAndy Lutomirski {
175758c9373SAndy Lutomirski 	/*
176758c9373SAndy Lutomirski 	 * The smp_mb() in membarrier after all the IPIs is supposed to
177758c9373SAndy Lutomirski 	 * ensure that memory accesses on remote CPUs that occur before the IPI
178758c9373SAndy Lutomirski 	 * become visible to membarrier()'s caller -- see scenario B in
179758c9373SAndy Lutomirski 	 * the big comment at the top of this file.
180758c9373SAndy Lutomirski 	 *
181758c9373SAndy Lutomirski 	 * A sync_core() would provide this guarantee, but
182758c9373SAndy Lutomirski 	 * sync_core_before_usermode() might end up being deferred until
183758c9373SAndy Lutomirski 	 * after membarrier()'s smp_mb().
184758c9373SAndy Lutomirski 	 */
185758c9373SAndy Lutomirski 	smp_mb();	/* IPIs should be serializing but paranoid. */
186758c9373SAndy Lutomirski 
187758c9373SAndy Lutomirski 	sync_core_before_usermode();
188758c9373SAndy Lutomirski }
189758c9373SAndy Lutomirski 
1902a36ab71SPeter Oskolkov static void ipi_rseq(void *info)
1912a36ab71SPeter Oskolkov {
1922ecedd75SAndy Lutomirski 	/*
1932ecedd75SAndy Lutomirski 	 * Ensure that all stores done by the calling thread are visible
1942ecedd75SAndy Lutomirski 	 * to the current task before the current task resumes.  We could
1952ecedd75SAndy Lutomirski 	 * probably optimize this away on most architectures, but by the
1962ecedd75SAndy Lutomirski 	 * time we've already sent an IPI, the cost of the extra smp_mb()
1972ecedd75SAndy Lutomirski 	 * is negligible.
1982ecedd75SAndy Lutomirski 	 */
1992ecedd75SAndy Lutomirski 	smp_mb();
2002a36ab71SPeter Oskolkov 	rseq_preempt(current);
2012a36ab71SPeter Oskolkov }
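
/*
 * ipi_rseq() is what makes MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ act as an
 * "rseq fence": every targeted thread has its current rseq critical
 * section restarted.  A hedged sketch of the userspace side, assuming the
 * raw syscall(2) interface:
 *
 *   Once per process:
 *     syscall(__NR_membarrier,
 *             MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);
 *
 *   Fence the rseq critical sections of all threads of this process:
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, 0, 0);
 *
 *   Or only the thread of this process currently running on CPU 3:
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
 *             MEMBARRIER_CMD_FLAG_CPU, 3);
 */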
2022a36ab71SPeter Oskolkov 
203227a4aadSMathieu Desnoyers static void ipi_sync_rq_state(void *info)
204227a4aadSMathieu Desnoyers {
205227a4aadSMathieu Desnoyers 	struct mm_struct *mm = (struct mm_struct *) info;
206227a4aadSMathieu Desnoyers 
207227a4aadSMathieu Desnoyers 	if (current->mm != mm)
208227a4aadSMathieu Desnoyers 		return;
209227a4aadSMathieu Desnoyers 	this_cpu_write(runqueues.membarrier_state,
210227a4aadSMathieu Desnoyers 		       atomic_read(&mm->membarrier_state));
211227a4aadSMathieu Desnoyers 	/*
212227a4aadSMathieu Desnoyers 	 * Issue a memory barrier after setting
213227a4aadSMathieu Desnoyers 	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
214227a4aadSMathieu Desnoyers 	 * guarantee that no memory access following registration is reordered
215227a4aadSMathieu Desnoyers 	 * before registration.
216227a4aadSMathieu Desnoyers 	 */
217227a4aadSMathieu Desnoyers 	smp_mb();
218227a4aadSMathieu Desnoyers }
219227a4aadSMathieu Desnoyers 
220227a4aadSMathieu Desnoyers void membarrier_exec_mmap(struct mm_struct *mm)
221227a4aadSMathieu Desnoyers {
222227a4aadSMathieu Desnoyers 	/*
223227a4aadSMathieu Desnoyers 	 * Issue a memory barrier before clearing membarrier_state to
224227a4aadSMathieu Desnoyers 	 * guarantee that no memory access prior to exec is reordered after
225227a4aadSMathieu Desnoyers 	 * clearing this state.
226227a4aadSMathieu Desnoyers 	 */
227227a4aadSMathieu Desnoyers 	smp_mb();
228227a4aadSMathieu Desnoyers 	atomic_set(&mm->membarrier_state, 0);
229227a4aadSMathieu Desnoyers 	/*
230227a4aadSMathieu Desnoyers 	 * Keep the runqueue membarrier_state in sync with this mm
231227a4aadSMathieu Desnoyers 	 * membarrier_state.
232227a4aadSMathieu Desnoyers 	 */
233227a4aadSMathieu Desnoyers 	this_cpu_write(runqueues.membarrier_state, 0);
234227a4aadSMathieu Desnoyers }
235227a4aadSMathieu Desnoyers 
2365bc78502SMathieu Desnoyers void membarrier_update_current_mm(struct mm_struct *next_mm)
2375bc78502SMathieu Desnoyers {
2385bc78502SMathieu Desnoyers 	struct rq *rq = this_rq();
2395bc78502SMathieu Desnoyers 	int membarrier_state = 0;
2405bc78502SMathieu Desnoyers 
2415bc78502SMathieu Desnoyers 	if (next_mm)
2425bc78502SMathieu Desnoyers 		membarrier_state = atomic_read(&next_mm->membarrier_state);
2435bc78502SMathieu Desnoyers 	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
2445bc78502SMathieu Desnoyers 		return;
2455bc78502SMathieu Desnoyers 	WRITE_ONCE(rq->membarrier_state, membarrier_state);
2465bc78502SMathieu Desnoyers }
2475bc78502SMathieu Desnoyers 
248c5f58bd5SMathieu Desnoyers static int membarrier_global_expedited(void)
249c5f58bd5SMathieu Desnoyers {
250c5f58bd5SMathieu Desnoyers 	int cpu;
251c5f58bd5SMathieu Desnoyers 	cpumask_var_t tmpmask;
252c5f58bd5SMathieu Desnoyers 
253c5f58bd5SMathieu Desnoyers 	if (num_online_cpus() == 1)
254c5f58bd5SMathieu Desnoyers 		return 0;
255c5f58bd5SMathieu Desnoyers 
256c5f58bd5SMathieu Desnoyers 	/*
257a14d11a0SAndrea Parri 	 * Matches memory barriers after rq->curr modification in
258c5f58bd5SMathieu Desnoyers 	 * scheduler.
259c5f58bd5SMathieu Desnoyers 	 */
260c5f58bd5SMathieu Desnoyers 	smp_mb();	/* system call entry is not a mb. */
261c5f58bd5SMathieu Desnoyers 
262c172e0a3SMathieu Desnoyers 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
263c172e0a3SMathieu Desnoyers 		return -ENOMEM;
264c5f58bd5SMathieu Desnoyers 
265944d5fe5SLinus Torvalds 	SERIALIZE_IPI();
266c5f58bd5SMathieu Desnoyers 	cpus_read_lock();
267227a4aadSMathieu Desnoyers 	rcu_read_lock();
268c5f58bd5SMathieu Desnoyers 	for_each_online_cpu(cpu) {
269c5f58bd5SMathieu Desnoyers 		struct task_struct *p;
270c5f58bd5SMathieu Desnoyers 
271c5f58bd5SMathieu Desnoyers 		/*
272c5f58bd5SMathieu Desnoyers 		 * Skipping the current CPU is OK even though we can be
273c5f58bd5SMathieu Desnoyers 		 * migrated at any point. The current CPU, at the point
274c5f58bd5SMathieu Desnoyers 		 * where we read raw_smp_processor_id(), is ensured to
275c5f58bd5SMathieu Desnoyers 		 * be in program order with respect to the caller
276c5f58bd5SMathieu Desnoyers 		 * thread. Therefore, we can skip this CPU from the
277c5f58bd5SMathieu Desnoyers 		 * iteration.
278c5f58bd5SMathieu Desnoyers 		 */
279c5f58bd5SMathieu Desnoyers 		if (cpu == raw_smp_processor_id())
280c5f58bd5SMathieu Desnoyers 			continue;
28197fb7a0aSIngo Molnar 
282227a4aadSMathieu Desnoyers 		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
283227a4aadSMathieu Desnoyers 		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
284227a4aadSMathieu Desnoyers 			continue;
285227a4aadSMathieu Desnoyers 
286227a4aadSMathieu Desnoyers 		/*
287618758edSMathieu Desnoyers 		 * Skip the CPU if it runs a kernel thread which is not using
288618758edSMathieu Desnoyers 		 * a task mm.
289227a4aadSMathieu Desnoyers 		 */
290154abafcSEric W. Biederman 		p = rcu_dereference(cpu_rq(cpu)->curr);
291618758edSMathieu Desnoyers 		if (!p->mm)
292227a4aadSMathieu Desnoyers 			continue;
293227a4aadSMathieu Desnoyers 
294c5f58bd5SMathieu Desnoyers 		__cpumask_set_cpu(cpu, tmpmask);
295c5f58bd5SMathieu Desnoyers 	}
296c5f58bd5SMathieu Desnoyers 	rcu_read_unlock();
297c172e0a3SMathieu Desnoyers 
298c5f58bd5SMathieu Desnoyers 	preempt_disable();
299c5f58bd5SMathieu Desnoyers 	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
300c5f58bd5SMathieu Desnoyers 	preempt_enable();
301c172e0a3SMathieu Desnoyers 
302c5f58bd5SMathieu Desnoyers 	free_cpumask_var(tmpmask);
303c5f58bd5SMathieu Desnoyers 	cpus_read_unlock();
304c5f58bd5SMathieu Desnoyers 
305c5f58bd5SMathieu Desnoyers 	/*
306c5f58bd5SMathieu Desnoyers 	 * Memory barrier on the caller thread _after_ we finished
307a14d11a0SAndrea Parri 	 * waiting for the last IPI. Matches memory barriers before
308c5f58bd5SMathieu Desnoyers 	 * rq->curr modification in scheduler.
309c5f58bd5SMathieu Desnoyers 	 */
310c5f58bd5SMathieu Desnoyers 	smp_mb();	/* exit from system call is not a mb */
311c5f58bd5SMathieu Desnoyers 	return 0;
312c5f58bd5SMathieu Desnoyers }
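
/*
 * Hedged usage sketch (illustrative, not part of the kernel sources):
 * membarrier_global_expedited() does not check the caller's own
 * registration; it IPIs every CPU whose runqueue carries
 * MEMBARRIER_STATE_GLOBAL_EXPEDITED, i.e. CPUs currently running threads
 * of processes that registered.  Via the raw syscall(2) interface:
 *
 *   In each process that wants to be targeted (e.g. at library init):
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0, 0);
 *
 *   In the process issuing the barrier:
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0, 0);
 */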
313c5f58bd5SMathieu Desnoyers 
3142a36ab71SPeter Oskolkov static int membarrier_private_expedited(int flags, int cpu_id)
31522e4ebb9SMathieu Desnoyers {
31622e4ebb9SMathieu Desnoyers 	cpumask_var_t tmpmask;
317c6d68c1cSMathieu Desnoyers 	struct mm_struct *mm = current->mm;
3182a36ab71SPeter Oskolkov 	smp_call_func_t ipi_func = ipi_mb;
31922e4ebb9SMathieu Desnoyers 
3202a36ab71SPeter Oskolkov 	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
32170216e18SMathieu Desnoyers 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
32270216e18SMathieu Desnoyers 			return -EINVAL;
323c6d68c1cSMathieu Desnoyers 		if (!(atomic_read(&mm->membarrier_state) &
32470216e18SMathieu Desnoyers 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
325a961e409SMathieu Desnoyers 			return -EPERM;
326758c9373SAndy Lutomirski 		ipi_func = ipi_sync_core;
3274ff4c745SAndrea Parri 		prepare_sync_core_cmd(mm);
3282a36ab71SPeter Oskolkov 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
3292a36ab71SPeter Oskolkov 		if (!IS_ENABLED(CONFIG_RSEQ))
3302a36ab71SPeter Oskolkov 			return -EINVAL;
3312a36ab71SPeter Oskolkov 		if (!(atomic_read(&mm->membarrier_state) &
3322a36ab71SPeter Oskolkov 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
3332a36ab71SPeter Oskolkov 			return -EPERM;
3342a36ab71SPeter Oskolkov 		ipi_func = ipi_rseq;
33570216e18SMathieu Desnoyers 	} else {
3362a36ab71SPeter Oskolkov 		WARN_ON_ONCE(flags);
337c6d68c1cSMathieu Desnoyers 		if (!(atomic_read(&mm->membarrier_state) &
33870216e18SMathieu Desnoyers 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
33970216e18SMathieu Desnoyers 			return -EPERM;
34070216e18SMathieu Desnoyers 	}
341a961e409SMathieu Desnoyers 
342e45cdc71SAndy Lutomirski 	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
343e45cdc71SAndy Lutomirski 	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
344a961e409SMathieu Desnoyers 		return 0;
34522e4ebb9SMathieu Desnoyers 
34622e4ebb9SMathieu Desnoyers 	/*
347a14d11a0SAndrea Parri 	 * Matches memory barriers after rq->curr modification in
34822e4ebb9SMathieu Desnoyers 	 * scheduler.
349*cd9b2901SAndrea Parri 	 *
350*cd9b2901SAndrea Parri 	 * On RISC-V, this barrier pairing is also needed for the
351*cd9b2901SAndrea Parri 	 * SYNC_CORE command when switching between processes, cf.
352*cd9b2901SAndrea Parri 	 * the inline comments in membarrier_arch_switch_mm().
35322e4ebb9SMathieu Desnoyers 	 */
35422e4ebb9SMathieu Desnoyers 	smp_mb();	/* system call entry is not a mb. */
35522e4ebb9SMathieu Desnoyers 
3562a36ab71SPeter Oskolkov 	if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
357c172e0a3SMathieu Desnoyers 		return -ENOMEM;
35822e4ebb9SMathieu Desnoyers 
359944d5fe5SLinus Torvalds 	SERIALIZE_IPI();
36022e4ebb9SMathieu Desnoyers 	cpus_read_lock();
3612a36ab71SPeter Oskolkov 
3622a36ab71SPeter Oskolkov 	if (cpu_id >= 0) {
3632a36ab71SPeter Oskolkov 		struct task_struct *p;
3642a36ab71SPeter Oskolkov 
3652a36ab71SPeter Oskolkov 		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
3662a36ab71SPeter Oskolkov 			goto out;
3672a36ab71SPeter Oskolkov 		rcu_read_lock();
3682a36ab71SPeter Oskolkov 		p = rcu_dereference(cpu_rq(cpu_id)->curr);
3692a36ab71SPeter Oskolkov 		if (!p || p->mm != mm) {
3702a36ab71SPeter Oskolkov 			rcu_read_unlock();
3712a36ab71SPeter Oskolkov 			goto out;
3722a36ab71SPeter Oskolkov 		}
3732a36ab71SPeter Oskolkov 		rcu_read_unlock();
3742a36ab71SPeter Oskolkov 	} else {
3752a36ab71SPeter Oskolkov 		int cpu;
3762a36ab71SPeter Oskolkov 
377227a4aadSMathieu Desnoyers 		rcu_read_lock();
37822e4ebb9SMathieu Desnoyers 		for_each_online_cpu(cpu) {
37922e4ebb9SMathieu Desnoyers 			struct task_struct *p;
38022e4ebb9SMathieu Desnoyers 
381154abafcSEric W. Biederman 			p = rcu_dereference(cpu_rq(cpu)->curr);
382c172e0a3SMathieu Desnoyers 			if (p && p->mm == mm)
38322e4ebb9SMathieu Desnoyers 				__cpumask_set_cpu(cpu, tmpmask);
38422e4ebb9SMathieu Desnoyers 		}
385227a4aadSMathieu Desnoyers 		rcu_read_unlock();
3862a36ab71SPeter Oskolkov 	}
387c172e0a3SMathieu Desnoyers 
388e45cdc71SAndy Lutomirski 	if (cpu_id >= 0) {
389e45cdc71SAndy Lutomirski 		/*
390e45cdc71SAndy Lutomirski 		 * smp_call_function_single() will call ipi_func() if cpu_id
391e45cdc71SAndy Lutomirski 		 * is the calling CPU.
392e45cdc71SAndy Lutomirski 		 */
3932a36ab71SPeter Oskolkov 		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
394e45cdc71SAndy Lutomirski 	} else {
395e45cdc71SAndy Lutomirski 		/*
396e45cdc71SAndy Lutomirski 		 * For regular membarrier, we can save a few cycles by
397e45cdc71SAndy Lutomirski 		 * skipping the current cpu -- we're about to do smp_mb()
398e45cdc71SAndy Lutomirski 		 * below, and if we migrate to a different cpu, this cpu
399e45cdc71SAndy Lutomirski 		 * and the new cpu will execute a full barrier in the
400e45cdc71SAndy Lutomirski 		 * scheduler.
401e45cdc71SAndy Lutomirski 		 *
402e45cdc71SAndy Lutomirski 		 * For SYNC_CORE, we do need a barrier on the current cpu --
403e45cdc71SAndy Lutomirski 		 * otherwise, if we are migrated and replaced by a different
404e45cdc71SAndy Lutomirski 		 * task in the same mm just before, during, or after
405e45cdc71SAndy Lutomirski 		 * membarrier, we will end up with some thread in the mm
406e45cdc71SAndy Lutomirski 		 * running without a core sync.
407e45cdc71SAndy Lutomirski 		 *
408e45cdc71SAndy Lutomirski 		 * For RSEQ, don't rseq_preempt() the caller.  User code
409e45cdc71SAndy Lutomirski 		 * is not supposed to issue syscalls at all from inside an
410e45cdc71SAndy Lutomirski 		 * rseq critical section.
411e45cdc71SAndy Lutomirski 		 */
412e45cdc71SAndy Lutomirski 		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
413e45cdc71SAndy Lutomirski 			preempt_disable();
414e45cdc71SAndy Lutomirski 			smp_call_function_many(tmpmask, ipi_func, NULL, true);
41554167607SMathieu Desnoyers 			preempt_enable();
416e45cdc71SAndy Lutomirski 		} else {
417e45cdc71SAndy Lutomirski 			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
418e45cdc71SAndy Lutomirski 		}
419e45cdc71SAndy Lutomirski 	}
420c172e0a3SMathieu Desnoyers 
4212a36ab71SPeter Oskolkov out:
4222a36ab71SPeter Oskolkov 	if (cpu_id < 0)
42322e4ebb9SMathieu Desnoyers 		free_cpumask_var(tmpmask);
42422e4ebb9SMathieu Desnoyers 	cpus_read_unlock();
42522e4ebb9SMathieu Desnoyers 
42622e4ebb9SMathieu Desnoyers 	/*
42722e4ebb9SMathieu Desnoyers 	 * Memory barrier on the caller thread _after_ we finished
428a14d11a0SAndrea Parri 	 * waiting for the last IPI. Matches memory barriers before
42922e4ebb9SMathieu Desnoyers 	 * rq->curr modification in scheduler.
43022e4ebb9SMathieu Desnoyers 	 */
43122e4ebb9SMathieu Desnoyers 	smp_mb();	/* exit from system call is not a mb */
43297fb7a0aSIngo Molnar 
433a961e409SMathieu Desnoyers 	return 0;
434a961e409SMathieu Desnoyers }
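
/*
 * Hedged usage sketch (illustrative, not part of the kernel sources): the
 * MEMBARRIER_FLAG_SYNC_CORE path above IPIs every CPU running this mm,
 * including the caller's, so each of them issues a core serializing
 * instruction before returning to userspace.  The typical consumer is a
 * JIT that modifies code another thread may be about to execute; the
 * helpers below are hypothetical:
 *
 *   Once per process:
 *     syscall(__NR_membarrier,
 *             MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
 *
 *   JIT thread:
 *     patch_instructions(buf);                    hypothetical helper
 *     syscall(__NR_membarrier,
 *             MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
 *     publish_code_pointer(buf);                  hypothetical helper; other
 *                                                 threads may now jump to buf
 */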
435a961e409SMathieu Desnoyers 
436227a4aadSMathieu Desnoyers static int sync_runqueues_membarrier_state(struct mm_struct *mm)
437227a4aadSMathieu Desnoyers {
438227a4aadSMathieu Desnoyers 	int membarrier_state = atomic_read(&mm->membarrier_state);
439227a4aadSMathieu Desnoyers 	cpumask_var_t tmpmask;
440227a4aadSMathieu Desnoyers 	int cpu;
441227a4aadSMathieu Desnoyers 
442227a4aadSMathieu Desnoyers 	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
443227a4aadSMathieu Desnoyers 		this_cpu_write(runqueues.membarrier_state, membarrier_state);
444227a4aadSMathieu Desnoyers 
445227a4aadSMathieu Desnoyers 		/*
446227a4aadSMathieu Desnoyers 		 * For single mm user, we can simply issue a memory barrier
447227a4aadSMathieu Desnoyers 		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
448227a4aadSMathieu Desnoyers 		 * mm and in the current runqueue to guarantee that no memory
449227a4aadSMathieu Desnoyers 		 * access following registration is reordered before
450227a4aadSMathieu Desnoyers 		 * registration.
451227a4aadSMathieu Desnoyers 		 */
452227a4aadSMathieu Desnoyers 		smp_mb();
453227a4aadSMathieu Desnoyers 		return 0;
454227a4aadSMathieu Desnoyers 	}
455227a4aadSMathieu Desnoyers 
456227a4aadSMathieu Desnoyers 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
457227a4aadSMathieu Desnoyers 		return -ENOMEM;
458227a4aadSMathieu Desnoyers 
459227a4aadSMathieu Desnoyers 	/*
460227a4aadSMathieu Desnoyers 	 * For mm with multiple users, we need to ensure all future
461227a4aadSMathieu Desnoyers 	 * scheduler executions will observe @mm's new membarrier
462227a4aadSMathieu Desnoyers 	 * state.
463227a4aadSMathieu Desnoyers 	 */
464227a4aadSMathieu Desnoyers 	synchronize_rcu();
465227a4aadSMathieu Desnoyers 
466227a4aadSMathieu Desnoyers 	/*
467227a4aadSMathieu Desnoyers 	 * For each cpu runqueue, if the task's mm matches @mm, ensure that all
468c034f48eSRandy Dunlap 	 * @mm's membarrier state set bits are also set in the runqueue's
469227a4aadSMathieu Desnoyers 	 * membarrier state. This ensures that a runqueue scheduling
470227a4aadSMathieu Desnoyers 	 * between threads which are users of @mm has its membarrier state
471227a4aadSMathieu Desnoyers 	 * updated.
472227a4aadSMathieu Desnoyers 	 */
473944d5fe5SLinus Torvalds 	SERIALIZE_IPI();
474227a4aadSMathieu Desnoyers 	cpus_read_lock();
475227a4aadSMathieu Desnoyers 	rcu_read_lock();
476227a4aadSMathieu Desnoyers 	for_each_online_cpu(cpu) {
477227a4aadSMathieu Desnoyers 		struct rq *rq = cpu_rq(cpu);
478227a4aadSMathieu Desnoyers 		struct task_struct *p;
479227a4aadSMathieu Desnoyers 
480c172e0a3SMathieu Desnoyers 		p = rcu_dereference(rq->curr);
481227a4aadSMathieu Desnoyers 		if (p && p->mm == mm)
482227a4aadSMathieu Desnoyers 			__cpumask_set_cpu(cpu, tmpmask);
483227a4aadSMathieu Desnoyers 	}
484227a4aadSMathieu Desnoyers 	rcu_read_unlock();
485227a4aadSMathieu Desnoyers 
486ce29ddc4SMathieu Desnoyers 	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
487227a4aadSMathieu Desnoyers 
488227a4aadSMathieu Desnoyers 	free_cpumask_var(tmpmask);
489227a4aadSMathieu Desnoyers 	cpus_read_unlock();
490227a4aadSMathieu Desnoyers 
491227a4aadSMathieu Desnoyers 	return 0;
492227a4aadSMathieu Desnoyers }
493227a4aadSMathieu Desnoyers 
494c5f58bd5SMathieu Desnoyers static int membarrier_register_global_expedited(void)
495c5f58bd5SMathieu Desnoyers {
496c5f58bd5SMathieu Desnoyers 	struct task_struct *p = current;
497c5f58bd5SMathieu Desnoyers 	struct mm_struct *mm = p->mm;
498227a4aadSMathieu Desnoyers 	int ret;
499c5f58bd5SMathieu Desnoyers 
500c5f58bd5SMathieu Desnoyers 	if (atomic_read(&mm->membarrier_state) &
501c5f58bd5SMathieu Desnoyers 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
502c5f58bd5SMathieu Desnoyers 		return 0;
503c5f58bd5SMathieu Desnoyers 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
504227a4aadSMathieu Desnoyers 	ret = sync_runqueues_membarrier_state(mm);
505227a4aadSMathieu Desnoyers 	if (ret)
506227a4aadSMathieu Desnoyers 		return ret;
507c5f58bd5SMathieu Desnoyers 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
508c5f58bd5SMathieu Desnoyers 		  &mm->membarrier_state);
50997fb7a0aSIngo Molnar 
510c5f58bd5SMathieu Desnoyers 	return 0;
511c5f58bd5SMathieu Desnoyers }
512c5f58bd5SMathieu Desnoyers 
51370216e18SMathieu Desnoyers static int membarrier_register_private_expedited(int flags)
514a961e409SMathieu Desnoyers {
515a961e409SMathieu Desnoyers 	struct task_struct *p = current;
516a961e409SMathieu Desnoyers 	struct mm_struct *mm = p->mm;
517227a4aadSMathieu Desnoyers 	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
518227a4aadSMathieu Desnoyers 	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
519227a4aadSMathieu Desnoyers 	    ret;
52070216e18SMathieu Desnoyers 
5212a36ab71SPeter Oskolkov 	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
52270216e18SMathieu Desnoyers 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
52370216e18SMathieu Desnoyers 			return -EINVAL;
524227a4aadSMathieu Desnoyers 		ready_state =
525227a4aadSMathieu Desnoyers 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
5262a36ab71SPeter Oskolkov 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
5272a36ab71SPeter Oskolkov 		if (!IS_ENABLED(CONFIG_RSEQ))
5282a36ab71SPeter Oskolkov 			return -EINVAL;
5292a36ab71SPeter Oskolkov 		ready_state =
5302a36ab71SPeter Oskolkov 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
5312a36ab71SPeter Oskolkov 	} else {
5322a36ab71SPeter Oskolkov 		WARN_ON_ONCE(flags);
53370216e18SMathieu Desnoyers 	}
534a961e409SMathieu Desnoyers 
535a961e409SMathieu Desnoyers 	/*
536a961e409SMathieu Desnoyers 	 * We need to consider threads belonging to different thread
537a961e409SMathieu Desnoyers 	 * groups, which use the same mm. (CLONE_VM but not
538a961e409SMathieu Desnoyers 	 * CLONE_THREAD).
539a961e409SMathieu Desnoyers 	 */
540227a4aadSMathieu Desnoyers 	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
541c5f58bd5SMathieu Desnoyers 		return 0;
54270216e18SMathieu Desnoyers 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
543227a4aadSMathieu Desnoyers 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
5442a36ab71SPeter Oskolkov 	if (flags & MEMBARRIER_FLAG_RSEQ)
5452a36ab71SPeter Oskolkov 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
546227a4aadSMathieu Desnoyers 	atomic_or(set_state, &mm->membarrier_state);
547227a4aadSMathieu Desnoyers 	ret = sync_runqueues_membarrier_state(mm);
548227a4aadSMathieu Desnoyers 	if (ret)
549227a4aadSMathieu Desnoyers 		return ret;
550227a4aadSMathieu Desnoyers 	atomic_or(ready_state, &mm->membarrier_state);
55197fb7a0aSIngo Molnar 
552c5f58bd5SMathieu Desnoyers 	return 0;
55322e4ebb9SMathieu Desnoyers }
55422e4ebb9SMathieu Desnoyers 
555544a4f2eSMichal Clapinski static int membarrier_get_registrations(void)
556544a4f2eSMichal Clapinski {
557544a4f2eSMichal Clapinski 	struct task_struct *p = current;
558544a4f2eSMichal Clapinski 	struct mm_struct *mm = p->mm;
559544a4f2eSMichal Clapinski 	int registrations_mask = 0, membarrier_state, i;
560544a4f2eSMichal Clapinski 	static const int states[] = {
561544a4f2eSMichal Clapinski 		MEMBARRIER_STATE_GLOBAL_EXPEDITED |
562544a4f2eSMichal Clapinski 			MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
563544a4f2eSMichal Clapinski 		MEMBARRIER_STATE_PRIVATE_EXPEDITED |
564544a4f2eSMichal Clapinski 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
565544a4f2eSMichal Clapinski 		MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
566544a4f2eSMichal Clapinski 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
567544a4f2eSMichal Clapinski 		MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
568544a4f2eSMichal Clapinski 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
569544a4f2eSMichal Clapinski 	};
570544a4f2eSMichal Clapinski 	static const int registration_cmds[] = {
571544a4f2eSMichal Clapinski 		MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
572544a4f2eSMichal Clapinski 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
573544a4f2eSMichal Clapinski 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
574544a4f2eSMichal Clapinski 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
575544a4f2eSMichal Clapinski 	};
576544a4f2eSMichal Clapinski 	BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
577544a4f2eSMichal Clapinski 
578544a4f2eSMichal Clapinski 	membarrier_state = atomic_read(&mm->membarrier_state);
579544a4f2eSMichal Clapinski 	for (i = 0; i < ARRAY_SIZE(states); ++i) {
580544a4f2eSMichal Clapinski 		if (membarrier_state & states[i]) {
581544a4f2eSMichal Clapinski 			registrations_mask |= registration_cmds[i];
582544a4f2eSMichal Clapinski 			membarrier_state &= ~states[i];
583544a4f2eSMichal Clapinski 		}
584544a4f2eSMichal Clapinski 	}
585544a4f2eSMichal Clapinski 	WARN_ON_ONCE(membarrier_state != 0);
586544a4f2eSMichal Clapinski 	return registrations_mask;
587544a4f2eSMichal Clapinski }
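
/*
 * Hedged usage sketch (illustrative, not part of the kernel sources):
 * MEMBARRIER_CMD_GET_REGISTRATIONS lets e.g. a library discover which
 * registrations the process already holds instead of tracking that state
 * itself:
 *
 *   int regs = syscall(__NR_membarrier, MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);
 *
 *   if (regs >= 0 && !(regs & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED))
 *           syscall(__NR_membarrier,
 *                   MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 */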
588544a4f2eSMichal Clapinski 
58922e4ebb9SMathieu Desnoyers /**
59022e4ebb9SMathieu Desnoyers  * sys_membarrier - issue memory barriers on a set of threads
59122e4ebb9SMathieu Desnoyers  * @cmd:    Takes command values defined in enum membarrier_cmd.
5922a36ab71SPeter Oskolkov  * @flags:  Currently needs to be 0 for all commands other than
5932a36ab71SPeter Oskolkov  *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
5942a36ab71SPeter Oskolkov  *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
5952a36ab71SPeter Oskolkov  *          contains the CPU on which to interrupt (= restart)
5962a36ab71SPeter Oskolkov  *          the RSEQ critical section.
5972a36ab71SPeter Oskolkov  * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
5982a36ab71SPeter Oskolkov  *          RSEQ CS should be interrupted (@cmd must be
5992a36ab71SPeter Oskolkov  *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
60022e4ebb9SMathieu Desnoyers  *
60122e4ebb9SMathieu Desnoyers  * If this system call is not implemented, -ENOSYS is returned. If the
60222e4ebb9SMathieu Desnoyers  * command specified does not exist, is not available on the running
60322e4ebb9SMathieu Desnoyers  * kernel, or if the command argument is invalid, this system call
60422e4ebb9SMathieu Desnoyers  * returns -EINVAL. For a given command, with flags argument set to 0,
605227a4aadSMathieu Desnoyers  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
606227a4aadSMathieu Desnoyers  * always return the same value until reboot. In addition, it can return
607227a4aadSMathieu Desnoyers  * -ENOMEM if there is not enough memory available to perform the system
608227a4aadSMathieu Desnoyers  * call.
60922e4ebb9SMathieu Desnoyers  *
61022e4ebb9SMathieu Desnoyers  * All memory accesses performed in program order from each targeted thread
61122e4ebb9SMathieu Desnoyers  * are guaranteed to be ordered with respect to sys_membarrier(). If we use
61222e4ebb9SMathieu Desnoyers  * the semantic "barrier()" to represent a compiler barrier forcing memory
61322e4ebb9SMathieu Desnoyers  * accesses to be performed in program order across the barrier, and
61422e4ebb9SMathieu Desnoyers  * smp_mb() to represent explicit memory barriers forcing full memory
61522e4ebb9SMathieu Desnoyers  * ordering across the barrier, we have the following ordering table for
61622e4ebb9SMathieu Desnoyers  * each pair of barrier(), sys_membarrier() and smp_mb():
61722e4ebb9SMathieu Desnoyers  *
61822e4ebb9SMathieu Desnoyers  * The pair ordering is detailed as (O: ordered, X: not ordered):
61922e4ebb9SMathieu Desnoyers  *
62022e4ebb9SMathieu Desnoyers  *                        barrier()   smp_mb() sys_membarrier()
62122e4ebb9SMathieu Desnoyers  *        barrier()          X           X            O
62222e4ebb9SMathieu Desnoyers  *        smp_mb()           X           O            O
62322e4ebb9SMathieu Desnoyers  *        sys_membarrier()   O           O            O
62422e4ebb9SMathieu Desnoyers  */
6252a36ab71SPeter Oskolkov SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
62622e4ebb9SMathieu Desnoyers {
6272a36ab71SPeter Oskolkov 	switch (cmd) {
6282a36ab71SPeter Oskolkov 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
6292a36ab71SPeter Oskolkov 		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
6302a36ab71SPeter Oskolkov 			return -EINVAL;
6312a36ab71SPeter Oskolkov 		break;
6322a36ab71SPeter Oskolkov 	default:
63322e4ebb9SMathieu Desnoyers 		if (unlikely(flags))
63422e4ebb9SMathieu Desnoyers 			return -EINVAL;
6352a36ab71SPeter Oskolkov 	}
6362a36ab71SPeter Oskolkov 
6372a36ab71SPeter Oskolkov 	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
6382a36ab71SPeter Oskolkov 		cpu_id = -1;
6392a36ab71SPeter Oskolkov 
64022e4ebb9SMathieu Desnoyers 	switch (cmd) {
64122e4ebb9SMathieu Desnoyers 	case MEMBARRIER_CMD_QUERY:
64222e4ebb9SMathieu Desnoyers 	{
64322e4ebb9SMathieu Desnoyers 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
64422e4ebb9SMathieu Desnoyers 
64522e4ebb9SMathieu Desnoyers 		if (tick_nohz_full_enabled())
646c5f58bd5SMathieu Desnoyers 			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
64722e4ebb9SMathieu Desnoyers 		return cmd_mask;
64822e4ebb9SMathieu Desnoyers 	}
649c5f58bd5SMathieu Desnoyers 	case MEMBARRIER_CMD_GLOBAL:
650c5f58bd5SMathieu Desnoyers 		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
65122e4ebb9SMathieu Desnoyers 		if (tick_nohz_full_enabled())
65222e4ebb9SMathieu Desnoyers 			return -EINVAL;
65322e4ebb9SMathieu Desnoyers 		if (num_online_cpus() > 1)
65478d125d3SPaul E. McKenney 			synchronize_rcu();
65522e4ebb9SMathieu Desnoyers 		return 0;
656c5f58bd5SMathieu Desnoyers 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
657c5f58bd5SMathieu Desnoyers 		return membarrier_global_expedited();
658c5f58bd5SMathieu Desnoyers 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
659c5f58bd5SMathieu Desnoyers 		return membarrier_register_global_expedited();
66022e4ebb9SMathieu Desnoyers 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
6612a36ab71SPeter Oskolkov 		return membarrier_private_expedited(0, cpu_id);
662a961e409SMathieu Desnoyers 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
66370216e18SMathieu Desnoyers 		return membarrier_register_private_expedited(0);
66470216e18SMathieu Desnoyers 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
6652a36ab71SPeter Oskolkov 		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
66670216e18SMathieu Desnoyers 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
66770216e18SMathieu Desnoyers 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
6682a36ab71SPeter Oskolkov 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
6692a36ab71SPeter Oskolkov 		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
6702a36ab71SPeter Oskolkov 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
6712a36ab71SPeter Oskolkov 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
672544a4f2eSMichal Clapinski 	case MEMBARRIER_CMD_GET_REGISTRATIONS:
673544a4f2eSMichal Clapinski 		return membarrier_get_registrations();
67422e4ebb9SMathieu Desnoyers 	default:
67522e4ebb9SMathieu Desnoyers 		return -EINVAL;
67622e4ebb9SMathieu Desnoyers 	}
67722e4ebb9SMathieu Desnoyers }
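
/*
 * Hedged usage sketch (illustrative, not part of the kernel sources): the
 * pair ordering table above is what lets read-mostly userspace algorithms
 * such as liburcu keep only a compiler barrier on their fast path and have
 * the slow path "upgrade" it with sys_membarrier().  Roughly, after a
 * one-time MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (the per-thread
 * reader_active[] flag and the grace-period bookkeeping are illustrative
 * assumptions, not an actual liburcu implementation):
 *
 *   Reader fast path:
 *     reader_active[self] = 1;
 *     asm volatile("" ::: "memory");      barrier() only
 *     ... dereference the shared pointer ...
 *     asm volatile("" ::: "memory");
 *     reader_active[self] = 0;
 *
 *   Updater slow path:
 *     publish the new version of the data;
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *     wait until every reader_active[] slot has been observed as 0;
 *     syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *     reclaim the old version;
 */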
678