xref: /linux/kernel/sched/membarrier.c (revision 92485487ba834f6665ec13dfb2c69e80cd0c7c37)
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/syscalls.h>
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>

#include "sched.h"	/* for cpu_rq(). */

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#define MEMBARRIER_CMD_BITMASK	\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should already be serializing, but be paranoid. */
}

static int membarrier_global_expedited(void)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
		rcu_read_unlock();
	}
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(void)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (!(atomic_read(&current->mm->membarrier_state)
			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
		return -EPERM;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
		rcu_read_unlock();
	}
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
		/*
		 * For a single-threaded process that is the only user
		 * of its mm, we can simply issue a memory barrier after
		 * setting MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee
		 * that no memory access following registration is
		 * reordered before registration.
		 */
		smp_mb();
	} else {
		/*
		 * For an mm shared by multiple threads or processes, we
		 * need to ensure all future scheduler executions will
		 * observe the new membarrier state for this mm.
		 */
		synchronize_sched();
	}
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);
	return 0;
}

static int membarrier_register_private_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm (CLONE_VM but not CLONE_THREAD);
	 * see the illustrative sketch after this function.
	 */
	if (atomic_read(&mm->membarrier_state)
			& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
		/*
		 * Ensure all future scheduler executions will observe the
		 * new membarrier state for this process.
		 */
		synchronize_sched();
	}
	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
			&mm->membarrier_state);
	return 0;
}
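
/*
 * Illustrative userspace sketch (editorial addition, not part of this
 * source file): two tasks in different thread groups sharing one mm via
 * clone(CLONE_VM) without CLONE_THREAD. Because the registration state
 * lives in mm->membarrier_state, a single registration covers both
 * tasks, and MEMBARRIER_CMD_PRIVATE_EXPEDITED issued by either targets
 * the other. The helper names below (sibling_fn, STACK_SZ) are invented
 * for the example; the membarrier commands and clone() flags are real.
 *
 *	#define _GNU_SOURCE
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <sys/wait.h>
 *	#include <sched.h>
 *	#include <signal.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	#define STACK_SZ	(1024 * 1024)
 *
 *	static int sibling_fn(void *arg)
 *	{
 *		// Same mm, different thread group: still covered by the
 *		// parent's registration below.
 *		return syscall(__NR_membarrier,
 *			       MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 *	}
 *
 *	int main(void)
 *	{
 *		char *stack = malloc(STACK_SZ);
 *
 *		// One registration for the shared mm.
 *		syscall(__NR_membarrier,
 *			MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
 *		// The stack grows down on common architectures: pass its top.
 *		clone(sibling_fn, stack + STACK_SZ, CLONE_VM | SIGCHLD, NULL);
 *		waitpid(-1, NULL, 0);
 *		return 0;
 *	}
 */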

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with the flags argument set to
 * 0, this system call is guaranteed to always return the same value
 * until reboot.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * barrier() to represent a compiler barrier forcing memory accesses to be
 * performed in program order across the barrier, and smp_mb() to represent
 * explicit memory barriers forcing full memory ordering across the barrier,
 * we have the following ordering table for each pair of barrier(),
 * sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_sched();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited();
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited();
	default:
		return -EINVAL;
	}
}
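
A minimal userspace sketch of the command sequence documented in the sys_membarrier() comment above (editorial illustration, not part of this file): query the supported commands, register the caller's mm for private expedited membarrier, then issue expedited barriers as needed. Per the ordering table, a sys_membarrier() call on the slow path pairs with a plain barrier() (compiler barrier) on the fast path of the other threads. The sketch assumes the UAPI <linux/membarrier.h> header and the raw syscall(2) interface; the membarrier() helper is local to the example.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

/* Thin wrapper around the raw system call. */
static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 ||
	    !(mask & MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier unsupported\n");
		return 1;
	}
	/* One-time registration for this process's mm. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		return 1;
	/*
	 * Orders memory accesses of every thread currently running on
	 * behalf of this mm, without those threads issuing a full
	 * barrier themselves.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		return 1;
	return 0;
}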