/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>

#define	MEMBARRIER_SUPPORTED_CMDS	(				\
    MEMBARRIER_CMD_GLOBAL |						\
    MEMBARRIER_CMD_GLOBAL_EXPEDITED |					\
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |				\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED |					\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |				\
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |			\
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)

static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

/*
 * Fence on the issuing CPU, run func on every CPU in *csp via an SMP
 * rendezvous, then fence again.
 */
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}

/*
 * Mark CPU c in *csp once it is known to be idle or to have passed
 * through a context switch since the initial scan.  On the first pass
 * (init == true), only record the pc_switchtime snapshot for later
 * comparison.
 */
static void
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return;

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return;
	}

	/*
	 * Sync with context switch to ensure that override of
	 * pc_curthread with non-idle thread pointer is visible before
	 * reading of pc_switchtime.
	 */
	atomic_thread_fence_acq();

	sw = pc->pc_switchtime;
	if (init)
		swt[c] = sw;
	else if (sw != swt[c])
		CPU_SET(c, csp);
}

/*
 * XXXKIB: We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path; e.g. on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool first;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;
	error = 0;

	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		for (first = true; error == 0; first = false) {
			CPU_FOREACH(c)
				check_cpu_switched(c, &cs, swt, first);
			if (CPU_CMP(&cs, &all_cpus) == 0)
				break;
			error = pause_sig("mmbr", 1);
			if (error == EWOULDBLOCK)
				error = 0;
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missing
			 * from the pmap active mask but could have
			 * been switched from or to in the meantime.
			 * This is fine at least on amd64 because
			 * threads always use the slow (IRETQ) path to
			 * return from a syscall after a context
			 * switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);

			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}