/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#define	MEMBARRIER_SUPPORTED_CMDS	(		\
    MEMBARRIER_CMD_GLOBAL |				\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |			\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |		\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |			\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |		\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |	\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}

static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Sync with context switch to ensure that the override of
	 * pc_curthread with a non-idle thread pointer is visible
	 * before pc_switchtime is read.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB:  We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path.  E.g., on amd64
 * we typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missed
			 * from the pmap active mask but could have
			 * been switched from or to in the meantime.
			 * This is fine at least on amd64 because
			 * threads always use the slow (IRETQ) path to
			 * return from a syscall after a context
			 * switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
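/*
 * Illustrative userspace usage (a sketch, not part of this file): a
 * process is expected to register for an expedited command once and
 * then issue barriers with the matching non-register command;
 * otherwise the command fails with EPERM, as enforced above.  The
 * sketch assumes the membarrier(2) wrapper and the command constants
 * exposed through <sys/membarrier.h>, and omits error handling.
 *
 *	#include <sys/membarrier.h>
 *
 *	void
 *	setup(void)
 *	{
 *		membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
 *	}
 *
 *	void
 *	slow_path_barrier(void)
 *	{
 *		membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
 *	}
 *
 * slow_path_barrier() IPIs every CPU currently running this address
 * space (see MEMBARRIER_CMD_PRIVATE_EXPEDITED above), letting the
 * fast-path side of an asymmetric fence use only a compiler barrier.
 */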