/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef __UTIL_UDMA_BARRIER_H
#define __UTIL_UDMA_BARRIER_H

#include <pthread.h>

/* Barriers for DMA.

   These barriers are explicitly only for use with user DMA operations. If you
   are looking for barriers to use with cache-coherent multi-threaded
   consistency then look in stdatomic.h. If you need both kinds of
   synchronization for the same address then use an atomic operation followed
   by one of these barriers.

   When reasoning about these barriers there are two objects:
     - CPU attached address space (the CPU memory could be a range of things:
       cached/uncached/non-temporal CPU DRAM, uncached MMIO space in another
       device, pMEM). Generally speaking the ordering is only relative
       to the local CPU's view of the system. Eg if the local CPU
       is not guaranteed to see a write from another CPU then it is also
       OK for the DMA device not to see the write after the barrier.
     - A DMA initiator on a bus. For instance a PCI-E device issuing
       MemRd/MemWr TLPs.

   The ordering guarantee is always stated between those two streams. Eg what
   happens if a MemRd TLP arrives via PCI-E relative to a CPU write to the
   same memory location.

   The providers have a very regular and predictable use of these barriers;
   to make things very clear each narrow use is given a name, and the proper
   name should be used in the provider as a form of documentation.
*/

/* Ensure that the device's view of memory matches the CPU's view of memory.
   This should be placed before any MMIO store that could trigger the device
   to begin doing DMA, such as a device doorbell ring.

   eg
     *dma_buf = 1;
     udma_to_device_barrier();
     mmio_write(DO_DMA_REG, dma_buf);
   Must ensure that the device sees the '1'.

   This is required to fence writes created by the libibverbs user. Those
   writes could be to any CPU mapped memory object with any cacheability mode.

   NOTE: x86 has historically used a weaker semantic for this barrier, and
   only fenced normal stores to normal memory. libibverbs users using other
   memory types or non-temporal stores are required to use SFENCE in their own
   code prior to calling verbs to start a DMA.
*/
#if defined(__i386__)
#define udma_to_device_barrier() asm volatile("" ::: "memory")
#elif defined(__x86_64__)
#define udma_to_device_barrier() asm volatile("" ::: "memory")
#elif defined(__PPC64__)
#define udma_to_device_barrier() asm volatile("sync" ::: "memory")
#elif defined(__PPC__)
#define udma_to_device_barrier() asm volatile("sync" ::: "memory")
#elif defined(__ia64__)
#define udma_to_device_barrier() asm volatile("mf" ::: "memory")
#elif defined(__sparc_v9__)
#define udma_to_device_barrier() asm volatile("membar #StoreStore" ::: "memory")
#elif defined(__aarch64__)
#define udma_to_device_barrier() asm volatile("dsb st" ::: "memory")
#elif defined(__sparc__) || defined(__s390x__)
#define udma_to_device_barrier() asm volatile("" ::: "memory")
#else
#error No architecture specific memory barrier defines found!
#endif
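
/* Illustrative sketch only, not part of the API: how a provider might use
   udma_to_device_barrier() when ringing a doorbell. mmio_write() and
   DO_DMA_REG are the same hypothetical doorbell helper and register named in
   the comment above; a real provider has its own descriptor layout and MMIO
   accessors.

     static void start_dma_example(uint32_t *dma_buf)
     {
        // Descriptor in coherent CPU memory that the device will read
        *dma_buf = 1;

        // Make the '1' visible to the device before the doorbell store
        // can trigger it to start DMA
        udma_to_device_barrier();

        // Hypothetical MMIO doorbell write that starts the DMA
        mmio_write(DO_DMA_REG, dma_buf);
     }
*/
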
/* Ensure that all ordered stores from the device are observable from the
   CPU. This only makes sense after something that observes an ordered store
   from the device - eg by reading an MMIO register or seeing that CPU memory
   is updated.

   This guarantees that all reads that follow the barrier see the ordered
   stores that preceded the observation.

   For instance, this would be used after testing a valid bit in a memory
   region that is a DMA target, to ensure that the following reads see the
   data written before the MemWr TLP that set the valid bit.
*/
#if defined(__i386__)
#define udma_from_device_barrier() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
#elif defined(__x86_64__)
#define udma_from_device_barrier() asm volatile("lfence" ::: "memory")
#elif defined(__PPC64__)
#define udma_from_device_barrier() asm volatile("lwsync" ::: "memory")
#elif defined(__PPC__)
#define udma_from_device_barrier() asm volatile("sync" ::: "memory")
#elif defined(__ia64__)
#define udma_from_device_barrier() asm volatile("mf" ::: "memory")
#elif defined(__sparc_v9__)
#define udma_from_device_barrier() asm volatile("membar #LoadLoad" ::: "memory")
#elif defined(__aarch64__)
#define udma_from_device_barrier() asm volatile("dsb ld" ::: "memory")
#elif defined(__sparc__) || defined(__s390x__)
#define udma_from_device_barrier() asm volatile("" ::: "memory")
#else
#error No architecture specific memory barrier defines found!
#endif

/* Order writes to CPU memory so that a DMA device cannot view writes after
   the barrier without also seeing all writes before the barrier. This does
   not guarantee any writes are visible to DMA.

   This would be used in cases where a DMA buffer might have a valid bit and
   data; the barrier is placed after writing the data but before writing the
   valid bit, to ensure the DMA device cannot observe a set valid bit with
   unwritten data.

   Compared to udma_to_device_barrier() this barrier is not required to fence
   anything but normal stores to normal malloc memory. Usage should be:

     write_wqe
        udma_to_device_barrier();       // Get user memory ready for DMA
        wqe->addr = ...;
        wqe->flags = ...;
        udma_ordering_write_barrier();  // Guarantee WQE written in order
        wqe->valid = 1;
*/
#define udma_ordering_write_barrier() udma_to_device_barrier()
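
/* Illustrative sketch only: the valid-bit producer/consumer pattern described
   above, written out as C. The struct layouts (example_wqe/example_cqe with a
   'valid' flag) are invented for the example and assume <stdint.h> types.

     struct example_wqe { uint64_t addr; uint32_t flags; uint32_t valid; };
     struct example_cqe { uint32_t status; uint32_t valid; };

     // Producer: fill a work queue entry, then publish it via its valid bit
     static void post_wqe_example(struct example_wqe *wqe, uint64_t addr)
     {
        udma_to_device_barrier();       // Get user memory ready for DMA
        wqe->addr = addr;
        wqe->flags = 0;
        udma_ordering_write_barrier();  // Guarantee WQE written in order
        wqe->valid = 1;
     }

     // Consumer: read a completion the device DMAs into CPU memory
     static int poll_cqe_example(const struct example_cqe *cqe)
     {
        if (!cqe->valid)
                return 0;
        // The device's write of 'valid' has been observed; fence so the
        // following reads see the data DMA'd before it
        udma_from_device_barrier();
        return cqe->status;
     }
*/
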
/* Promptly flush writes to MMIO Write Combining memory.
   This should be used after a write to WC memory. This is both a barrier
   and a hint to the CPU to flush any buffers to reduce latency to TLP
   generation.

   This is not required to have any effect on CPU memory.

   If done while holding a lock then the ordering of MMIO writes across CPUs
   must be guaranteed to follow the natural ordering implied by the lock.

   This must also act as a barrier that prevents write combining, eg
     *wc_mem = 1;
     mmio_flush_writes();
     *wc_mem = 2;
   Must always produce two MemWr TLPs, '1' and '2'. Without the barrier
   the CPU is allowed to produce a single TLP '2'.

   Note that there is no order guarantee for writes to WC memory without
   barriers.

   This is intended to be used in conjunction with WC memory to generate large
   PCI-E MemWr TLPs from the CPU.
*/
#if defined(__i386__)
#define mmio_flush_writes() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
#elif defined(__x86_64__)
#define mmio_flush_writes() asm volatile("sfence" ::: "memory")
#elif defined(__PPC64__)
#define mmio_flush_writes() asm volatile("sync" ::: "memory")
#elif defined(__PPC__)
#define mmio_flush_writes() asm volatile("sync" ::: "memory")
#elif defined(__ia64__)
#define mmio_flush_writes() asm volatile("fwb" ::: "memory")
#elif defined(__sparc_v9__)
#define mmio_flush_writes() asm volatile("membar #StoreStore" ::: "memory")
#elif defined(__aarch64__)
#define mmio_flush_writes() asm volatile("dsb st" ::: "memory")
#elif defined(__sparc__) || defined(__s390x__)
#define mmio_flush_writes() asm volatile("" ::: "memory")
#else
#error No architecture specific memory barrier defines found!
#endif

/* Prevent WC writes from being re-ordered relative to other MMIO
   writes. This should be used before a write to WC memory.

   This must act as a barrier to prevent write re-ordering from different
   memory types:
     *mmio_mem = 1;
     mmio_flush_writes();
     *wc_mem = 2;
   Must always produce a TLP '1' followed by '2'.

   This barrier implies udma_to_device_barrier().

   This is intended to be used in conjunction with WC memory to generate large
   PCI-E MemWr TLPs from the CPU.
*/
#define mmio_wc_start() mmio_flush_writes()

/* Keep MMIO writes in order.
   Currently we lack writel macros that universally guarantee MMIO
   writes happen in order, like the kernel does. Even worse, many
   providers haphazardly open code writes to MMIO memory, omitting even
   volatile.

   Until this can be fixed with a proper writel macro, this barrier
   is a stand-in to indicate places where MMIO writes should be switched
   to some future writel.
*/
#define mmio_ordered_writes_hack() mmio_flush_writes()
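
/* Illustrative sketch only: writing a two-part doorbell record to a WC mapped
   BAR page, bracketed by mmio_wc_start() and mmio_flush_writes(). The wc_db
   pointer and its layout are assumptions for the example; the point is the
   bracketing of the WC stores.

     static void wc_doorbell_example(volatile uint64_t *wc_db,
                                     uint64_t low, uint64_t high)
     {
        // Order these WC stores after any earlier MMIO/UC writes
        mmio_wc_start();

        // The CPU may merge both stores into one larger MemWr TLP
        wc_db[0] = low;
        wc_db[1] = high;

        // Close the write-combining window so the doorbell record
        // reaches the device promptly
        mmio_flush_writes();
     }
*/
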
/* Write Combining Spinlock primitive

   Any access to a multi-value WC region must ensure that multiple CPUs do not
   write to the same values concurrently; these macros make that
   straightforward and efficient if the chosen exclusion is a spinlock.

   The spinlock guarantees that the WC writes issued within the critical
   section are made visible as TLPs to the device. The TLPs must be seen by
   the device strictly in the order that the spinlocks are acquired, and
   combining WC writes between different critical sections is not permitted.

   Use of these macros allows the fencing inside the spinlock to be combined
   with the fencing required for DMA.
*/
static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
{
	pthread_spin_lock(lock);
#if !defined(__i386__) && !defined(__x86_64__)
	/* For x86 the serialization within the spin lock is enough to
	 * strongly order WC and other memory types. */
	mmio_wc_start();
#endif
}

static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
{
	/* It is possible that on x86 the atomic in the lock is strong enough
	 * to force-flush the WC buffers quickly, and this SFENCE can be
	 * omitted too. */
	mmio_flush_writes();
	pthread_spin_unlock(lock);
}

#endif /* __UTIL_UDMA_BARRIER_H */
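
/* Illustrative sketch only: the WC doorbell write from the earlier example,
   issued under the WC spinlock so concurrent CPUs cannot interleave or
   combine their doorbell records. The wc_db pointer and layout are again
   assumptions; mmio_wc_spinlock() provides the mmio_wc_start() ordering (the
   lock itself suffices on x86) and mmio_wc_spinunlock() ends with
   mmio_flush_writes().

     static void wc_doorbell_locked_example(pthread_spinlock_t *lock,
                                            volatile uint64_t *wc_db,
                                            uint64_t low, uint64_t high)
     {
        mmio_wc_spinlock(lock);

        wc_db[0] = low;
        wc_db[1] = high;

        // The unlock flushes the WC buffers, so the record is pushed to
        // the device before another CPU can acquire the lock
        mmio_wc_spinunlock(lock);
     }
*/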