/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#define	MEMBARRIER_SUPPORTED_CMDS	( \
    MEMBARRIER_CMD_GLOBAL | \
    MEMBARRIER_CMD_GLOBAL_EXPEDITED | \
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | \
    MEMBARRIER_CMD_PRIVATE_EXPEDITED | \
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED | \
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE | \
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
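
/*
 * Usage note (a sketch, not normative): the expedited commands below
 * require prior per-process registration and fail with EPERM
 * otherwise, so a typical userspace sequence is
 *
 *	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	...
 *	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *
 * MEMBARRIER_CMD_GLOBAL needs no registration, but may sleep until
 * every CPU is known to have passed through a context switch.
 */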

/*
 * IPI handler: execute a sequentially consistent fence on the
 * interrupted CPU.
 */
static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

/*
 * IPI handler: as above, but additionally serialize the instruction
 * stream of the interrupted CPU.
 */
static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

/*
 * Fence on the current CPU, run func on every CPU in *csp via
 * rendezvous IPIs, then fence again, so that the caller's memory
 * accesses are ordered with respect to all target CPUs.
 */
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}
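
/*
 * Helper for MEMBARRIER_CMD_GLOBAL: mark CPU c in *csp once it is
 * known to have passed a serialization point since the scan started,
 * i.e. it is running the idle thread, or its pc_switchtime has
 * changed from the value recorded on the first (init) pass.
 */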
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Sync with the context switch code to ensure that the
	 * override of pc_curthread with a non-idle thread pointer is
	 * visible before pc_switchtime is read.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB: We execute the requested action (seq_cst fence and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return.  E.g., on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;
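
	/*
	 * The expedited commands IPI only those CPUs that are
	 * currently running a thread of an opted-in process
	 * (GLOBAL_EXPEDITED), or that are active in this process'
	 * address space (the PRIVATE_EXPEDITED variants below).
	 */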
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missed
			 * from the pmap active mask but could have
			 * been switched from or to in the meantime.
			 * This is fine at least on amd64 because
			 * threads always use the slow (IRETQ) path to
			 * return from a syscall after a context
			 * switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
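
/*
 * System call entry point: forward the unpacked userspace arguments
 * to kern_membarrier(), which performs all validation.
 */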
int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}