/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

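/*
 * Mask of the commands supported by this implementation; it is also
 * the value returned to the caller of MEMBARRIER_CMD_QUERY.
 */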
#define MEMBARRIER_SUPPORTED_CMDS	(			\
    MEMBARRIER_CMD_GLOBAL |					\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

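/*
 * Execute func on every CPU in *csp via an IPI rendezvous.  The
 * rendezvous is bracketed by seq_cst fences so that the issuing CPU
 * also gets full-barrier semantics relative to the target CPUs.
 */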
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}

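/*
 * Helper for MEMBARRIER_CMD_GLOBAL: mark CPU c as done in *csp once it
 * is known to have passed through a context switch, either because it
 * is running its idle thread or because its pc_switchtime changed from
 * the value recorded in swt[] on the first pass.
 */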
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Sync with the context switch code to ensure that the update
	 * of pc_curthread to a non-idle thread pointer is visible
	 * before pc_switchtime is read.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB: We execute the requested action (seq_cst fence and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path; e.g. on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
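		/*
		 * Non-expedited global barrier: poll, sleeping for a
		 * tick between passes, until every other CPU has been
		 * observed either idle or having context-switched
		 * since its switch time was snapshotted on the first
		 * pass.
		 */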
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
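			/*
			 * IPI only the CPUs that may be running
			 * threads of this address space, as tracked
			 * by the pmap active mask.
			 */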
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that are missing
			 * from the pmap active mask but could have
			 * been switched from or to in the meantime.
			 * This is fine at least on amd64 because
			 * threads always use the slow (IRETQ) path to
			 * return from a syscall after a context
			 * switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
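
/*
 * Illustrative userspace sketch (not kernel code): the private
 * expedited commands are expected to be used by registering once and
 * then pairing an expedited barrier on the slow side with plain
 * compiler barriers on the fast side.  This assumes a membarrier()
 * wrapper taking the same (cmd, flags, cpu_id) arguments that
 * sys_membarrier() above decodes:
 *
 *	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	...
 *	atomic_store_explicit(&flag, 1, memory_order_relaxed);
 *	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *
 * The expedited call IPIs every CPU running a thread of the process,
 * giving full-barrier semantics on each of them.
 */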