/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
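
/*
 * membarrier(2): make every CPU that currently runs a thread of
 * interest execute a full memory barrier (and, for the SYNC_CORE
 * commands, a core-serializing operation) before the syscall returns.
 * Depending on the command, the scope is system-wide
 * (MEMBARRIER_CMD_GLOBAL), all processes registered for global
 * expedited barriers, or the calling process only (the private
 * commands).  The command set mirrors the Linux membarrier(2)
 * interface.
 */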

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#define	MEMBARRIER_SUPPORTED_CMDS	(			\
    MEMBARRIER_CMD_GLOBAL |					\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}
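
/*
 * Helper for MEMBARRIER_CMD_GLOBAL: mark CPU c in *csp once it is
 * known to have passed through a context switch (or to be running its
 * idle thread) since the initial pc_switchtime snapshot stored in
 * swt[] when init is true.
 */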
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Sync with the context switch to ensure that the override of
	 * pc_curthread with a non-idle thread pointer is visible before
	 * the read of pc_switchtime.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB: We execute the requested action (seq_cst fence and possibly
 * cpu_sync_core()) on the current CPU as well.  There is no guarantee
 * that the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path; e.g. on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
		/*
		 * Non-expedited global barrier: instead of IPIs, wait
		 * until every other CPU has been observed either idle
		 * or having context-switched since the initial
		 * pc_switchtime snapshot (see check_cpu_switched()).
		 */
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

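	/*
	 * Expedited global barrier: IPI only the CPUs currently
	 * running a thread of some process registered with
	 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (P2_MEMBAR_GLOBE).
	 */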
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missing
			 * from the pmap active mask but could have been
			 * switched from or to in the meantime.  This is
			 * fine at least on amd64, because threads
			 * always use the slow (IRETQ) path to return
			 * from a syscall after a context switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
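
/*
 * Illustrative userland usage (a sketch only, not part of this file).
 * It assumes a membarrier() libc wrapper with the prototype
 * int membarrier(int cmd, unsigned flags, int cpu_id) visible through
 * <sys/membarrier.h>, as described in membarrier(2); adjust the
 * include or use the raw syscall if that assumption does not hold.
 *
 *	#include <sys/membarrier.h>
 *	#include <err.h>
 *
 *	void
 *	barrier_setup(void)
 *	{
 *		// Register once; afterwards the process may issue
 *		// private expedited barriers.
 *		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
 *		    0, 0) == -1)
 *			err(1, "membarrier(REGISTER_PRIVATE_EXPEDITED)");
 *	}
 *
 *	void
 *	barrier_all_threads(void)
 *	{
 *		// All other threads of this process observe a full
 *		// memory barrier before this call returns.
 *		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) == -1)
 *			err(1, "membarrier(PRIVATE_EXPEDITED)");
 *	}
 */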