xref: /linux/kernel/sched/membarrier.c (revision 6b3f7af57881f6d6250c6dcc4d910fe8e855a607)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4  *
5  * membarrier system call
6  */
7 #include <uapi/linux/membarrier.h>
8 #include "sched.h"
9 
10 /*
11  * For documentation purposes, here are some membarrier ordering
12  * scenarios to keep in mind:
13  *
14  * A) Userspace thread execution after IPI vs membarrier's memory
15  *    barrier before sending the IPI
16  *
17  * Userspace variables:
18  *
19  * int x = 0, y = 0;
20  *
21  * The memory barrier at the start of membarrier() on CPU0 is necessary in
22  * order to enforce the guarantee that any writes occurring on CPU0 before
23  * the membarrier() is executed will be visible to any code executing on
24  * CPU1 after the IPI-induced memory barrier:
25  *
26  *         CPU0                              CPU1
27  *
28  *         x = 1
29  *         membarrier():
30  *           a: smp_mb()
31  *           b: send IPI                       IPI-induced mb
32  *           c: smp_mb()
33  *         r2 = y
34  *                                           y = 1
35  *                                           barrier()
36  *                                           r1 = x
37  *
38  *                     BUG_ON(r1 == 0 && r2 == 0)
39  *
40  * The write to y and load from x by CPU1 are unordered by the hardware,
41  * so it's possible to have "r1 = x" reordered before "y = 1" at any
42  * point after (b).  If the memory barrier at (a) is omitted, then "x = 1"
43  * can be reordered after (a) (although not after (c)), so we get r1 == 0
44  * and r2 == 0.  This violates the guarantee that membarrier() is
45  * supposed to provide.
46  *
47  * The timing of the memory barrier at (a) has to ensure that it executes
48  * before the IPI-induced memory barrier on CPU1.
49  *
50  * B) Userspace thread execution before IPI vs membarrier's memory
51  *    barrier after completing the IPI
52  *
53  * Userspace variables:
54  *
55  * int x = 0, y = 0;
56  *
57  * The memory barrier at the end of membarrier() on CPU0 is necessary in
58  * order to enforce the guarantee that any writes occurring on CPU1 before
59  * the membarrier() is executed will be visible to any code executing on
60  * CPU0 after the membarrier():
61  *
62  *         CPU0                              CPU1
63  *
64  *                                           x = 1
65  *                                           barrier()
66  *                                           y = 1
67  *         r2 = y
68  *         membarrier():
69  *           a: smp_mb()
70  *           b: send IPI                       IPI-induced mb
71  *           c: smp_mb()
72  *         r1 = x
73  *         BUG_ON(r1 == 0 && r2 == 1)
74  *
75  * The writes to x and y are unordered by the hardware, so it's possible to
76  * have "r2 = 1" even though the write to x doesn't execute until (b).  If
77  * the memory barrier at (c) is omitted then "r1 = x" can be reordered
78  * before (b) (although not before (a)), so we get "r1 = 0".  This violates
79  * the guarantee that membarrier() is supposed to provide.
80  *
81  * The timing of the memory barrier at (c) has to ensure that it executes
82  * after the IPI-induced memory barrier on CPU1.
83  *
84  * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
85  *
86  *           CPU0                            CPU1
87  *
88  *           membarrier():
89  *           a: smp_mb()
90  *                                           d: switch to kthread (includes mb)
91  *           b: read rq->curr->mm == NULL
92  *                                           e: switch to user (includes mb)
93  *           c: smp_mb()
94  *
95  * Using the scenario from (A), we can show that (a) needs to be paired
96  * with (e). Using the scenario from (B), we can show that (c) needs to
97  * be paired with (d).
98  *
99  * D) exit_mm vs membarrier
100  *
101  * Two thread groups are created, A and B.  Thread group B is created by
102  * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
103  * Let's assume we have a single thread within each thread group (Thread A
104  * and Thread B).  Thread A runs on CPU0, Thread B runs on CPU1.
105  *
106  *           CPU0                            CPU1
107  *
108  *           membarrier():
109  *             a: smp_mb()
110  *                                           exit_mm():
111  *                                             d: smp_mb()
112  *                                             e: current->mm = NULL
113  *             b: read rq->curr->mm == NULL
114  *             c: smp_mb()
115  *
116  * Using scenario (B), we can show that (c) needs to be paired with (d).
117  *
118  * E) kthread_{use,unuse}_mm vs membarrier
119  *
120  *           CPU0                            CPU1
121  *
122  *           membarrier():
123  *           a: smp_mb()
124  *                                           kthread_unuse_mm()
125  *                                             d: smp_mb()
126  *                                             e: current->mm = NULL
127  *           b: read rq->curr->mm == NULL
128  *                                           kthread_use_mm()
129  *                                             f: current->mm = mm
130  *                                             g: smp_mb()
131  *           c: smp_mb()
132  *
133  * Using the scenario from (A), we can show that (a) needs to be paired
134  * with (g). Using the scenario from (B), we can show that (c) needs to
135  * be paired with (d).
136  */
137 
/*
 * SYNC_CORE commands.  The mask is 0 unless
 * CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE is set.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |			\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

/* RSEQ commands.  The mask is 0 unless CONFIG_RSEQ is set. */
#ifdef CONFIG_RSEQ
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ |			\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
#endif

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.  The SYNC_CORE and RSEQ commands only
 * contribute when the corresponding config option is enabled.
 */
#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL |					\
	 MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
	 MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
	 MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
	 MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK |		\
	 MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK |			\
	 MEMBARRIER_CMD_GET_REGISTRATIONS)
166 
167 /*
168  * Scoped guard for memory barriers on entry and exit.
169  * Matches memory barriers before & after rq->curr modification in scheduler.
170  */
171 DEFINE_LOCK_GUARD_0(mb, smp_mb(), smp_mb())
172 static DEFINE_MUTEX(membarrier_ipi_mutex);
173 static DEFINE_PER_CPU(struct mutex, membarrier_cpu_mutexes);
174 
175 #define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
176 #define SERIALIZE_IPI_CPU(cpu_id) guard(mutex)(&per_cpu(membarrier_cpu_mutexes, cpu_id))
177 
178 static int __init membarrier_init(void)
179 {
180 	int i;
181 
182 	for_each_possible_cpu(i)
183 		mutex_init(&per_cpu(membarrier_cpu_mutexes, i));
184 	return 0;
185 }
186 core_initcall(membarrier_init);
187 
/* IPI callback: issue a full memory barrier on the CPU it runs on. */
static void ipi_mb(void *info)
{
	/* IPIs should be serializing but paranoid. */
	smp_mb();
}
192 
/*
 * IPI callback for the SYNC_CORE command: full memory barrier followed by
 * sync_core_before_usermode().
 */
static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses on remote CPUs that occur before the
	 * IPI become visible to membarrier()'s caller -- see scenario B in
	 * the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().  Hence the explicit barrier here.
	 */
	/* IPIs should be serializing but paranoid. */
	smp_mb();

	sync_core_before_usermode();
}
209 
210 static void ipi_rseq(void *info)
211 {
212 	/*
213 	 * Ensure that all stores done by the calling thread are visible
214 	 * to the current task before the current task resumes.  We could
215 	 * probably optimize this away on most architectures, but by the
216 	 * time we've already sent an IPI, the cost of the extra smp_mb()
217 	 * is negligible.
218 	 */
219 	smp_mb();
220 	/*
221 	 * Legacy mode requires that IDs are written and the critical section is
222 	 * evaluated. V2 optimized mode handles the critical section and IDs are
223 	 * only updated if they change as a consequence of preemption after
224 	 * return from this IPI.
225 	 */
226 	if (rseq_v2(current))
227 		rseq_sched_switch_event(current);
228 	else
229 		rseq_force_update();
230 }
231 
232 static void ipi_sync_rq_state(void *info)
233 {
234 	struct mm_struct *mm = (struct mm_struct *) info;
235 
236 	if (current->mm != mm)
237 		return;
238 	this_cpu_write(runqueues.membarrier_state,
239 		       atomic_read(&mm->membarrier_state));
240 	/*
241 	 * Issue a memory barrier after setting
242 	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
243 	 * guarantee that no memory access following registration is reordered
244 	 * before registration.
245 	 */
246 	smp_mb();
247 }
248 
249 void membarrier_exec_mmap(struct mm_struct *mm)
250 {
251 	/*
252 	 * Issue a memory barrier before clearing membarrier_state to
253 	 * guarantee that no memory access prior to exec is reordered after
254 	 * clearing this state.
255 	 */
256 	smp_mb();
257 	atomic_set(&mm->membarrier_state, 0);
258 	/*
259 	 * Keep the runqueue membarrier_state in sync with this mm
260 	 * membarrier_state.
261 	 */
262 	this_cpu_write(runqueues.membarrier_state, 0);
263 }
264 
265 void membarrier_update_current_mm(struct mm_struct *next_mm)
266 {
267 	struct rq *rq = this_rq();
268 	int membarrier_state = 0;
269 
270 	if (next_mm)
271 		membarrier_state = atomic_read(&next_mm->membarrier_state);
272 	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
273 		return;
274 	WRITE_ONCE(rq->membarrier_state, membarrier_state);
275 }
276 
277 static int membarrier_global_expedited(void)
278 {
279 	cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL;
280 	int cpu;
281 
282 	if (num_online_cpus() == 1)
283 		return 0;
284 
285 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
286 		return -ENOMEM;
287 
288 	guard(mb)();
289 	SERIALIZE_IPI();
290 	guard(cpus_read_lock)();
291 
292 	rcu_read_lock();
293 	for_each_online_cpu(cpu) {
294 		struct task_struct *p;
295 
296 		/*
297 		 * Skipping the current CPU is OK even through we can be
298 		 * migrated at any point. The current CPU, at the point
299 		 * where we read raw_smp_processor_id(), is ensured to
300 		 * be in program order with respect to the caller
301 		 * thread. Therefore, we can skip this CPU from the
302 		 * iteration.
303 		 */
304 		if (cpu == raw_smp_processor_id())
305 			continue;
306 
307 		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
308 		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
309 			continue;
310 
311 		/*
312 		 * Skip the CPU if it runs a kernel thread which is not using
313 		 * a task mm.
314 		 */
315 		p = rcu_dereference(cpu_rq(cpu)->curr);
316 		if (!p->mm)
317 			continue;
318 
319 		__cpumask_set_cpu(cpu, tmpmask);
320 	}
321 	rcu_read_unlock();
322 
323 	preempt_disable();
324 	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
325 	preempt_enable();
326 
327 	return 0;
328 }
329 
330 static int membarrier_private_expedited(int flags, int cpu_id)
331 {
332 	struct mm_struct *mm = current->mm;
333 	smp_call_func_t ipi_func = ipi_mb;
334 
335 	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
336 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
337 			return -EINVAL;
338 		if (!(atomic_read(&mm->membarrier_state) &
339 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
340 			return -EPERM;
341 		ipi_func = ipi_sync_core;
342 		prepare_sync_core_cmd(mm);
343 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
344 		if (!IS_ENABLED(CONFIG_RSEQ))
345 			return -EINVAL;
346 		if (!(atomic_read(&mm->membarrier_state) &
347 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
348 			return -EPERM;
349 		ipi_func = ipi_rseq;
350 	} else {
351 		WARN_ON_ONCE(flags);
352 		if (!(atomic_read(&mm->membarrier_state) &
353 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
354 			return -EPERM;
355 	}
356 
357 	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
358 	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
359 		return 0;
360 
361 	/*
362 	 * Matches memory barriers after rq->curr modification in
363 	 * scheduler.
364 	 *
365 	 * On RISC-V, this barrier pairing is also needed for the
366 	 * SYNC_CORE command when switching between processes, cf.
367 	 * the inline comments in membarrier_arch_switch_mm().
368 	 *
369 	 * Memory barrier on the caller thread _after_ we finished
370 	 * waiting for the last IPI. Matches memory barriers before
371 	 * rq->curr modification in scheduler.
372 	 */
373 	guard(mb)();
374 	if (cpu_id >= 0) {
375 		if (cpu_id >= nr_cpu_ids || !cpu_possible(cpu_id))
376 			return 0;
377 
378 		SERIALIZE_IPI_CPU(cpu_id);
379 		guard(cpus_read_lock)();
380 		struct task_struct *p;
381 
382 		if (!cpu_online(cpu_id))
383 			return 0;
384 
385 		rcu_read_lock();
386 		p = rcu_dereference(cpu_rq(cpu_id)->curr);
387 		if (!p || p->mm != mm) {
388 			rcu_read_unlock();
389 			return 0;
390 		}
391 		rcu_read_unlock();
392 		/*
393 		 * smp_call_function_single() will call ipi_func() if cpu_id
394 		 * is the calling CPU.
395 		 */
396 		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
397 	} else {
398 		cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL;
399 		int cpu;
400 
401 		if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
402 			return -ENOMEM;
403 
404 		SERIALIZE_IPI();
405 		guard(cpus_read_lock)();
406 
407 		rcu_read_lock();
408 		for_each_online_cpu(cpu) {
409 			struct task_struct *p;
410 
411 			p = rcu_dereference(cpu_rq(cpu)->curr);
412 			if (p && p->mm == mm)
413 				__cpumask_set_cpu(cpu, tmpmask);
414 		}
415 		rcu_read_unlock();
416 		/*
417 		 * For regular membarrier, we can save a few cycles by
418 		 * skipping the current cpu -- we're about to do smp_mb()
419 		 * below, and if we migrate to a different cpu, this cpu
420 		 * and the new cpu will execute a full barrier in the
421 		 * scheduler.
422 		 *
423 		 * For SYNC_CORE, we do need a barrier on the current cpu --
424 		 * otherwise, if we are migrated and replaced by a different
425 		 * task in the same mm just before, during, or after
426 		 * membarrier, we will end up with some thread in the mm
427 		 * running without a core sync.
428 		 *
429 		 * For RSEQ, don't invoke rseq_sched_switch_event() on the
430 		 * caller.  User code is not supposed to issue syscalls at
431 		 * all from inside an rseq critical section.
432 		 */
433 		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
434 			preempt_disable();
435 			smp_call_function_many(tmpmask, ipi_func, NULL, true);
436 			preempt_enable();
437 		} else {
438 			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
439 		}
440 	}
441 
442 	return 0;
443 }
444 
445 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
446 {
447 	int membarrier_state = atomic_read(&mm->membarrier_state);
448 	cpumask_var_t tmpmask;
449 	int cpu;
450 
451 	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
452 		this_cpu_write(runqueues.membarrier_state, membarrier_state);
453 
454 		/*
455 		 * For single mm user, we can simply issue a memory barrier
456 		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
457 		 * mm and in the current runqueue to guarantee that no memory
458 		 * access following registration is reordered before
459 		 * registration.
460 		 */
461 		smp_mb();
462 		return 0;
463 	}
464 
465 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
466 		return -ENOMEM;
467 
468 	/*
469 	 * For mm with multiple users, we need to ensure all future
470 	 * scheduler executions will observe @mm's new membarrier
471 	 * state.
472 	 */
473 	synchronize_rcu();
474 
475 	/*
476 	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
477 	 * @mm's membarrier state set bits are also set in the runqueue's
478 	 * membarrier state. This ensures that a runqueue scheduling
479 	 * between threads which are users of @mm has its membarrier state
480 	 * updated.
481 	 */
482 	SERIALIZE_IPI();
483 	cpus_read_lock();
484 	rcu_read_lock();
485 	for_each_online_cpu(cpu) {
486 		struct rq *rq = cpu_rq(cpu);
487 		struct task_struct *p;
488 
489 		p = rcu_dereference(rq->curr);
490 		if (p && p->mm == mm)
491 			__cpumask_set_cpu(cpu, tmpmask);
492 	}
493 	rcu_read_unlock();
494 
495 	on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
496 
497 	free_cpumask_var(tmpmask);
498 	cpus_read_unlock();
499 
500 	return 0;
501 }
502 
503 static int membarrier_register_global_expedited(void)
504 {
505 	struct task_struct *p = current;
506 	struct mm_struct *mm = p->mm;
507 	int ret;
508 
509 	if (atomic_read(&mm->membarrier_state) &
510 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
511 		return 0;
512 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
513 	ret = sync_runqueues_membarrier_state(mm);
514 	if (ret)
515 		return ret;
516 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
517 		  &mm->membarrier_state);
518 
519 	return 0;
520 }
521 
522 static int membarrier_register_private_expedited(int flags)
523 {
524 	struct task_struct *p = current;
525 	struct mm_struct *mm = p->mm;
526 	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
527 	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
528 	    ret;
529 
530 	if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
531 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
532 			return -EINVAL;
533 		ready_state =
534 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
535 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
536 		if (!IS_ENABLED(CONFIG_RSEQ))
537 			return -EINVAL;
538 		ready_state =
539 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
540 	} else {
541 		WARN_ON_ONCE(flags);
542 	}
543 
544 	/*
545 	 * We need to consider threads belonging to different thread
546 	 * groups, which use the same mm. (CLONE_VM but not
547 	 * CLONE_THREAD).
548 	 */
549 	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
550 		return 0;
551 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
552 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
553 	if (flags & MEMBARRIER_FLAG_RSEQ)
554 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
555 	atomic_or(set_state, &mm->membarrier_state);
556 	ret = sync_runqueues_membarrier_state(mm);
557 	if (ret)
558 		return ret;
559 	atomic_or(ready_state, &mm->membarrier_state);
560 
561 	return 0;
562 }
563 
564 static int membarrier_get_registrations(void)
565 {
566 	struct task_struct *p = current;
567 	struct mm_struct *mm = p->mm;
568 	int registrations_mask = 0, membarrier_state, i;
569 	static const int states[] = {
570 		MEMBARRIER_STATE_GLOBAL_EXPEDITED |
571 			MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
572 		MEMBARRIER_STATE_PRIVATE_EXPEDITED |
573 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
574 		MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
575 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
576 		MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
577 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
578 	};
579 	static const int registration_cmds[] = {
580 		MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
581 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
582 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
583 		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
584 	};
585 	BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
586 
587 	membarrier_state = atomic_read(&mm->membarrier_state);
588 	for (i = 0; i < ARRAY_SIZE(states); ++i) {
589 		if (membarrier_state & states[i]) {
590 			registrations_mask |= registration_cmds[i];
591 			membarrier_state &= ~states[i];
592 		}
593 	}
594 	WARN_ON_ONCE(membarrier_state != 0);
595 	return registrations_mask;
596 }
597 
598 /**
599  * sys_membarrier - issue memory barriers on a set of threads
600  * @cmd:    Takes command values defined in enum membarrier_cmd.
601  * @flags:  Currently needs to be 0 for all commands other than
602  *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
603  *          case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
604  *          contains the CPU on which to interrupt (= restart)
605  *          the RSEQ critical section.
606  * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
607  *          RSEQ CS should be interrupted (@cmd must be
608  *          MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
609  *
610  * If this system call is not implemented, -ENOSYS is returned. If the
611  * command specified does not exist, is not available on the running
612  * kernel, or if the command argument is invalid, this system call
613  * returns -EINVAL. For a given command, with flags argument set to 0,
614  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
615  * always return the same value until reboot. In addition, it can return
616  * -ENOMEM if there is not enough memory available to perform the system
617  * call.
618  *
619  * All memory accesses performed in program order from each targeted thread
620  * are guaranteed to be ordered with respect to sys_membarrier(). If we use
621  * the semantic "barrier()" to represent a compiler barrier forcing memory
622  * accesses to be performed in program order across the barrier, and
623  * smp_mb() to represent explicit memory barriers forcing full memory
624  * ordering across the barrier, we have the following ordering table for
625  * each pair of barrier(), sys_membarrier() and smp_mb():
626  *
627  * The pair ordering is detailed as (O: ordered, X: not ordered):
628  *
629  *                        barrier()   smp_mb() sys_membarrier()
630  *        barrier()          X           X            O
631  *        smp_mb()           X           O            O
632  *        sys_membarrier()   O           O            O
633  */
634 SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
635 {
636 	switch (cmd) {
637 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
638 		if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
639 			return -EINVAL;
640 		break;
641 	default:
642 		if (unlikely(flags))
643 			return -EINVAL;
644 	}
645 
646 	if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
647 		cpu_id = -1;
648 
649 	switch (cmd) {
650 	case MEMBARRIER_CMD_QUERY:
651 	{
652 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
653 
654 		if (tick_nohz_full_enabled())
655 			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
656 		return cmd_mask;
657 	}
658 	case MEMBARRIER_CMD_GLOBAL:
659 		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
660 		if (tick_nohz_full_enabled())
661 			return -EINVAL;
662 		if (num_online_cpus() > 1)
663 			synchronize_rcu();
664 		return 0;
665 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
666 		return membarrier_global_expedited();
667 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
668 		return membarrier_register_global_expedited();
669 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
670 		return membarrier_private_expedited(0, cpu_id);
671 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
672 		return membarrier_register_private_expedited(0);
673 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
674 		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
675 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
676 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
677 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
678 		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
679 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
680 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
681 	case MEMBARRIER_CMD_GET_REGISTRATIONS:
682 		return membarrier_get_registrations();
683 	default:
684 		return -EINVAL;
685 	}
686 }
687