xref: /linux/kernel/sched/membarrier.c (revision 9d106c6dd81bb26ad7fc3ee89cb1d62557c8e2c9)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
4  *
5  * membarrier system call
6  */
7 #include "sched.h"
8 
/*
 * MEMBARRIER_CMD_BITMASK is the bitwise OR of every command value in
 * enum membarrier_cmd other than MEMBARRIER_CMD_QUERY.
 *
 * The SYNC_CORE commands are only part of the mask when
 * CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE is defined; otherwise their
 * contribution is 0.
 */
#if defined(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE |		\
	 MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK |		\
	 MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
	 MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
	 MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
	 MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
	 MEMBARRIER_CMD_GLOBAL)
27 
/*
 * IPI callback used by the expedited membarrier commands: execute a full
 * memory barrier on the CPU that runs this function.
 *
 * @info: unused.
 */
static void ipi_mb(void *info)
{
	/* IPIs should already be serializing; the explicit barrier is paranoia. */
	smp_mb();
}
32 
33 static void ipi_sync_rq_state(void *info)
34 {
35 	struct mm_struct *mm = (struct mm_struct *) info;
36 
37 	if (current->mm != mm)
38 		return;
39 	this_cpu_write(runqueues.membarrier_state,
40 		       atomic_read(&mm->membarrier_state));
41 	/*
42 	 * Issue a memory barrier after setting
43 	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
44 	 * guarantee that no memory access following registration is reordered
45 	 * before registration.
46 	 */
47 	smp_mb();
48 }
49 
50 void membarrier_exec_mmap(struct mm_struct *mm)
51 {
52 	/*
53 	 * Issue a memory barrier before clearing membarrier_state to
54 	 * guarantee that no memory access prior to exec is reordered after
55 	 * clearing this state.
56 	 */
57 	smp_mb();
58 	atomic_set(&mm->membarrier_state, 0);
59 	/*
60 	 * Keep the runqueue membarrier_state in sync with this mm
61 	 * membarrier_state.
62 	 */
63 	this_cpu_write(runqueues.membarrier_state, 0);
64 }
65 
66 static int membarrier_global_expedited(void)
67 {
68 	int cpu;
69 	cpumask_var_t tmpmask;
70 
71 	if (num_online_cpus() == 1)
72 		return 0;
73 
74 	/*
75 	 * Matches memory barriers around rq->curr modification in
76 	 * scheduler.
77 	 */
78 	smp_mb();	/* system call entry is not a mb. */
79 
80 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
81 		return -ENOMEM;
82 
83 	cpus_read_lock();
84 	rcu_read_lock();
85 	for_each_online_cpu(cpu) {
86 		struct task_struct *p;
87 
88 		/*
89 		 * Skipping the current CPU is OK even through we can be
90 		 * migrated at any point. The current CPU, at the point
91 		 * where we read raw_smp_processor_id(), is ensured to
92 		 * be in program order with respect to the caller
93 		 * thread. Therefore, we can skip this CPU from the
94 		 * iteration.
95 		 */
96 		if (cpu == raw_smp_processor_id())
97 			continue;
98 
99 		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
100 		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
101 			continue;
102 
103 		/*
104 		 * Skip the CPU if it runs a kernel thread. The scheduler
105 		 * leaves the prior task mm in place as an optimization when
106 		 * scheduling a kthread.
107 		 */
108 		p = rcu_dereference(cpu_rq(cpu)->curr);
109 		if (p->flags & PF_KTHREAD)
110 			continue;
111 
112 		__cpumask_set_cpu(cpu, tmpmask);
113 	}
114 	rcu_read_unlock();
115 
116 	preempt_disable();
117 	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
118 	preempt_enable();
119 
120 	free_cpumask_var(tmpmask);
121 	cpus_read_unlock();
122 
123 	/*
124 	 * Memory barrier on the caller thread _after_ we finished
125 	 * waiting for the last IPI. Matches memory barriers around
126 	 * rq->curr modification in scheduler.
127 	 */
128 	smp_mb();	/* exit from system call is not a mb */
129 	return 0;
130 }
131 
132 static int membarrier_private_expedited(int flags)
133 {
134 	int cpu;
135 	cpumask_var_t tmpmask;
136 	struct mm_struct *mm = current->mm;
137 
138 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
139 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
140 			return -EINVAL;
141 		if (!(atomic_read(&mm->membarrier_state) &
142 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
143 			return -EPERM;
144 	} else {
145 		if (!(atomic_read(&mm->membarrier_state) &
146 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
147 			return -EPERM;
148 	}
149 
150 	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
151 		return 0;
152 
153 	/*
154 	 * Matches memory barriers around rq->curr modification in
155 	 * scheduler.
156 	 */
157 	smp_mb();	/* system call entry is not a mb. */
158 
159 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
160 		return -ENOMEM;
161 
162 	cpus_read_lock();
163 	rcu_read_lock();
164 	for_each_online_cpu(cpu) {
165 		struct task_struct *p;
166 
167 		/*
168 		 * Skipping the current CPU is OK even through we can be
169 		 * migrated at any point. The current CPU, at the point
170 		 * where we read raw_smp_processor_id(), is ensured to
171 		 * be in program order with respect to the caller
172 		 * thread. Therefore, we can skip this CPU from the
173 		 * iteration.
174 		 */
175 		if (cpu == raw_smp_processor_id())
176 			continue;
177 		p = rcu_dereference(cpu_rq(cpu)->curr);
178 		if (p && p->mm == mm)
179 			__cpumask_set_cpu(cpu, tmpmask);
180 	}
181 	rcu_read_unlock();
182 
183 	preempt_disable();
184 	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
185 	preempt_enable();
186 
187 	free_cpumask_var(tmpmask);
188 	cpus_read_unlock();
189 
190 	/*
191 	 * Memory barrier on the caller thread _after_ we finished
192 	 * waiting for the last IPI. Matches memory barriers around
193 	 * rq->curr modification in scheduler.
194 	 */
195 	smp_mb();	/* exit from system call is not a mb */
196 
197 	return 0;
198 }
199 
200 static int sync_runqueues_membarrier_state(struct mm_struct *mm)
201 {
202 	int membarrier_state = atomic_read(&mm->membarrier_state);
203 	cpumask_var_t tmpmask;
204 	int cpu;
205 
206 	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
207 		this_cpu_write(runqueues.membarrier_state, membarrier_state);
208 
209 		/*
210 		 * For single mm user, we can simply issue a memory barrier
211 		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
212 		 * mm and in the current runqueue to guarantee that no memory
213 		 * access following registration is reordered before
214 		 * registration.
215 		 */
216 		smp_mb();
217 		return 0;
218 	}
219 
220 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
221 		return -ENOMEM;
222 
223 	/*
224 	 * For mm with multiple users, we need to ensure all future
225 	 * scheduler executions will observe @mm's new membarrier
226 	 * state.
227 	 */
228 	synchronize_rcu();
229 
230 	/*
231 	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
232 	 * @mm's membarrier state set bits are also set in in the runqueue's
233 	 * membarrier state. This ensures that a runqueue scheduling
234 	 * between threads which are users of @mm has its membarrier state
235 	 * updated.
236 	 */
237 	cpus_read_lock();
238 	rcu_read_lock();
239 	for_each_online_cpu(cpu) {
240 		struct rq *rq = cpu_rq(cpu);
241 		struct task_struct *p;
242 
243 		p = rcu_dereference(rq->curr);
244 		if (p && p->mm == mm)
245 			__cpumask_set_cpu(cpu, tmpmask);
246 	}
247 	rcu_read_unlock();
248 
249 	preempt_disable();
250 	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
251 	preempt_enable();
252 
253 	free_cpumask_var(tmpmask);
254 	cpus_read_unlock();
255 
256 	return 0;
257 }
258 
259 static int membarrier_register_global_expedited(void)
260 {
261 	struct task_struct *p = current;
262 	struct mm_struct *mm = p->mm;
263 	int ret;
264 
265 	if (atomic_read(&mm->membarrier_state) &
266 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
267 		return 0;
268 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
269 	ret = sync_runqueues_membarrier_state(mm);
270 	if (ret)
271 		return ret;
272 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
273 		  &mm->membarrier_state);
274 
275 	return 0;
276 }
277 
278 static int membarrier_register_private_expedited(int flags)
279 {
280 	struct task_struct *p = current;
281 	struct mm_struct *mm = p->mm;
282 	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
283 	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
284 	    ret;
285 
286 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
287 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
288 			return -EINVAL;
289 		ready_state =
290 			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
291 	}
292 
293 	/*
294 	 * We need to consider threads belonging to different thread
295 	 * groups, which use the same mm. (CLONE_VM but not
296 	 * CLONE_THREAD).
297 	 */
298 	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
299 		return 0;
300 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
301 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
302 	atomic_or(set_state, &mm->membarrier_state);
303 	ret = sync_runqueues_membarrier_state(mm);
304 	if (ret)
305 		return ret;
306 	atomic_or(ready_state, &mm->membarrier_state);
307 
308 	return 0;
309 }
310 
311 /**
312  * sys_membarrier - issue memory barriers on a set of threads
313  * @cmd:   Takes command values defined in enum membarrier_cmd.
314  * @flags: Currently needs to be 0. For future extensions.
315  *
316  * If this system call is not implemented, -ENOSYS is returned. If the
317  * command specified does not exist, not available on the running
318  * kernel, or if the command argument is invalid, this system call
319  * returns -EINVAL. For a given command, with flags argument set to 0,
320  * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
321  * always return the same value until reboot. In addition, it can return
322  * -ENOMEM if there is not enough memory available to perform the system
323  * call.
324  *
325  * All memory accesses performed in program order from each targeted thread
326  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
327  * the semantic "barrier()" to represent a compiler barrier forcing memory
328  * accesses to be performed in program order across the barrier, and
329  * smp_mb() to represent explicit memory barriers forcing full memory
330  * ordering across the barrier, we have the following ordering table for
331  * each pair of barrier(), sys_membarrier() and smp_mb():
332  *
333  * The pair ordering is detailed as (O: ordered, X: not ordered):
334  *
335  *                        barrier()   smp_mb() sys_membarrier()
336  *        barrier()          X           X            O
337  *        smp_mb()           X           O            O
338  *        sys_membarrier()   O           O            O
339  */
340 SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
341 {
342 	if (unlikely(flags))
343 		return -EINVAL;
344 	switch (cmd) {
345 	case MEMBARRIER_CMD_QUERY:
346 	{
347 		int cmd_mask = MEMBARRIER_CMD_BITMASK;
348 
349 		if (tick_nohz_full_enabled())
350 			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
351 		return cmd_mask;
352 	}
353 	case MEMBARRIER_CMD_GLOBAL:
354 		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
355 		if (tick_nohz_full_enabled())
356 			return -EINVAL;
357 		if (num_online_cpus() > 1)
358 			synchronize_rcu();
359 		return 0;
360 	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
361 		return membarrier_global_expedited();
362 	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
363 		return membarrier_register_global_expedited();
364 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
365 		return membarrier_private_expedited(0);
366 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
367 		return membarrier_register_private_expedited(0);
368 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
369 		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
370 	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
371 		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
372 	default:
373 		return -EINVAL;
374 	}
375 }
376