xref: /freebsd/sys/kern/kern_membarrier.c (revision 4a69fc16a583face922319c476f3e739d9ce9140)
/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#define MEMBARRIER_SUPPORTED_CMDS	(			\
    MEMBARRIER_CMD_GLOBAL |					\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

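/*
 * Per-CPU actions executed by the rendezvous IPIs below: a
 * sequentially consistent fence, and for the SYNC_CORE commands also
 * serialization of the instruction stream via cpu_sync_core().
 */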
static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

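/*
 * Execute func on every CPU in *csp via a rendezvous IPI, with full
 * fences on the initiating CPU before and after the rendezvous, so
 * that the caller's prior and subsequent accesses are ordered
 * against the remote executions of func.
 */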
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}

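/*
 * Helper for MEMBARRIER_CMD_GLOBAL: mark CPU c in *csp once it is
 * known to have passed a serialization point since the scan started.
 * This is the case when the CPU is running its idle thread, or when
 * its pc_switchtime differs from the value recorded on the first
 * (init) pass, i.e. it went through a context switch in the
 * meantime.
 */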
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Synchronize with the context switch to ensure that the
	 * update of pc_curthread with a non-idle thread pointer is
	 * visible before pc_switchtime is read.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB: We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path.  E.g., on amd64
 * we typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

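	/*
	 * MEMBARRIER_CMD_QUERY is defined as zero, so it contributes
	 * no bits and passes the supported-commands check above.
	 */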
	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
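	/*
	 * Non-expedited global barrier: rather than IPI all CPUs,
	 * poll until every CPU has been observed either idle or
	 * having context-switched (detected via pc_switchtime) since
	 * the first scan, sleeping a tick between scans.  The
	 * pause_sig() timeout (EWOULDBLOCK) is the expected outcome
	 * and is not treated as an error.
	 */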
	case MEMBARRIER_CMD_GLOBAL:
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missing
			 * from the pmap active mask but could have
			 * switched to or from this address space in
			 * the meantime.  This is fine at least on
			 * amd64 because threads always use the slow
			 * (IRETQ) path to return from a syscall after
			 * a context switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
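
/*
 * Example (a minimal userland sketch, not part of this file): it
 * assumes the libc syscall stub with the prototype
 * int membarrier(int cmd, unsigned flags, int cpu_id) and the
 * command constants from <sys/membarrier.h>.  A process must
 * register before issuing an expedited barrier; otherwise the
 * command fails with EPERM, as implemented above.
 *
 *	#include <sys/membarrier.h>
 *	#include <err.h>
 *
 *	int
 *	main(void)
 *	{
 *		int cmds;
 *
 *		// MEMBARRIER_CMD_QUERY returns the supported commands.
 *		cmds = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
 *		if (cmds == -1)
 *			err(1, "membarrier query");
 *		if ((cmds & MEMBARRIER_CMD_PRIVATE_EXPEDITED) == 0)
 *			errx(1, "private expedited not supported");
 *
 *		// Register: sets P2_MEMBAR_PRIVE on the process.
 *		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
 *		    0, 0) == -1)
 *			err(1, "membarrier register");
 *
 *		// Fence all CPUs running this address space.
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) == -1)
 *			err(1, "membarrier");
 *		return (0);
 *	}
 */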