/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/smr.h>

#include <vm/uma.h>
/*
 * This is a novel safe memory reclamation technique inspired by
 * epoch based reclamation from Samy Al Bahra's concurrency kit which
 * in turn was based on work described in:
 *	Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *	of Cambridge Computing Laboratory.
 * And shares some similarities with:
 *	Wang, Stamler, Parmer. 2016. Parallel Sections: Scaling System-Level
 *	Data-Structures.
 *
 * This is not an implementation of hazard pointers or related
 * techniques.  The term safe memory reclamation is used as a
 * generic descriptor for algorithms that defer frees to avoid
 * use-after-free errors with lockless data structures.
 *
 * The basic approach is to maintain a monotonic write sequence
 * number that is updated on some application-defined granularity.
 * Readers record the most recent write sequence number they have
 * observed.  A shared read sequence number records the lowest
 * sequence number observed by any reader as of the last poll.  Any
 * write older than this value has been observed by all readers
 * and memory can be reclaimed.  Like Epoch we also detect idle
 * readers by storing an invalid sequence number in the per-cpu
 * state when the read section exits.  Like Parsec we establish
 * a global write clock that is used to mark memory on free.
 *
 * The write and read sequence numbers can be thought of as a two
 * handed clock with readers always advancing towards writers.  SMR
 * maintains the invariant that all readers can safely access memory
 * that was visible at the time they loaded their copy of the sequence
 * number.  Periodically the read sequence or hand is polled and
 * advanced as far towards the write sequence as active readers allow.
 * Memory which was freed between the old and new global read sequence
 * number can now be reclaimed.  When the system is idle the two hands
 * meet and no deferred memory is outstanding.  Readers never advance
 * any sequence number, they only observe them.  The shared read
 * sequence number is consequently never higher than the write sequence.
 * A stored sequence number that falls outside of this range has expired
 * and needs no scan to reclaim.
 *
 * A notable distinction between this SMR and Epoch, qsbr, rcu, etc. is
 * that advancing the sequence number is decoupled from detecting its
 * observation.  This results in a more granular assignment of sequence
 * numbers even when read latencies prevent some or all expiration.
 * It also allows writers to advance the sequence number and defer the
 * poll for expiration until a later time when it is likely to
 * complete without waiting.  The batch granularity and free-to-use
 * latency is dynamic and can be significantly smaller than in more
 * strict systems.
 *
 * This mechanism is primarily intended to be used in coordination with
 * UMA.  By integrating with the allocator we avoid all of the callout
 * queue machinery and are provided with an efficient way to batch
 * sequence advancement and waiting.  The allocator accumulates a full
 * per-cpu cache of memory before advancing the sequence.  It then
 * delays waiting for this sequence to expire until the memory is
 * selected for reuse.  In this way we only increment the sequence
 * value once for n=cache-size frees and the waits are done long
 * after the sequence has expired, so they need only be verified
 * to account for pathological conditions and to advance the read
 * sequence.  Tying the sequence number to the bucket size has the
 * nice property that as the zone gets busier the buckets get larger
 * and the sequence writes become fewer.  If the coherency traffic of
 * advancing the write sequence number becomes too costly we can
 * advance it once for every N buckets in exchange for higher
 * free-to-use latency and consequently higher memory consumption.
 *
 * If the read overhead of accessing the shared cacheline becomes
 * especially burdensome an invariant TSC could be used in place of the
 * sequence.  The algorithm would then only need to maintain the minimum
 * observed tsc.  This would trade potential cache synchronization
 * overhead for local serialization and cpu timestamp overhead.
 */
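/*
 * As an illustrative sketch only (not part of this file or of the SMR
 * implementation), a consumer of the interface declared in sys/smr.h
 * might pair a lockless reader with a deferred free roughly as follows.
 * The foo structure, every foo_* name, the foo_smr handle and the M_FOO
 * malloc type are hypothetical:
 *
 *	struct foo *obj;
 *	smr_seq_t goal;
 *
 *	Reader: access the object inside an SMR read section.
 *		smr_enter(foo_smr);
 *		obj = atomic_load_ptr(&foo_head);
 *		if (obj != NULL)
 *			foo_consume(obj);
 *		smr_exit(foo_smr);
 *
 *	Writer: unlink the object, mark the removal with a new write
 *	sequence number, and free only after every reader that could
 *	still see the object has exited its read section.
 *		atomic_store_ptr(&foo_head, NULL);
 *		goal = smr_advance(foo_smr);
 *		...
 *		smr_wait(foo_smr, goal);
 *		free(obj, M_FOO);
 */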
/*
 * A simplified diagram:
 *
 *    0                                                     UINT_MAX
 *    | -------------------- sequence number space -------------------- |
 *        ^ rd seq                            ^ wr seq
 *        | ----- valid sequence numbers ---- |
 *          ^cpuA  ^cpuC
 *    | -- free -- | --------- deferred frees -------- | ---- free ---- |
 *
 *
 * In this example cpuA has the lowest sequence number and poll can
 * advance rd seq.  cpuB is idle (and so not pictured); it records no
 * sequence number and is considered to observe wr seq.
 *
 * Freed memory that is tagged with a sequence number between rd seq and
 * wr seq can not be safely reclaimed because cpuA may hold a reference to
 * it.  Any other memory is guaranteed to be unreferenced.
 *
 * Any writer is free to advance wr seq at any time; however, it may busy
 * poll in pathological cases.
 */

static uma_zone_t smr_shared_zone;
static uma_zone_t smr_zone;

#ifndef INVARIANTS
#define	SMR_SEQ_INIT	1		/* All valid sequence numbers are odd. */
#define	SMR_SEQ_INCR	2

/*
 * SMR_SEQ_MAX_DELTA is the maximum distance allowed between rd_seq and
 * wr_seq.  For the modular arithmetic to work a value of UINT_MAX / 2
 * would be possible, but it is checked after we increment the wr_seq so
 * a safety margin is left to prevent overflow.
 *
 * We will block until SMR_SEQ_MAX_ADVANCE sequence numbers have progressed
 * to prevent integer wrapping.  See smr_advance() for more details.
 */
#define	SMR_SEQ_MAX_DELTA	(UINT_MAX / 4)
#define	SMR_SEQ_MAX_ADVANCE	(SMR_SEQ_MAX_DELTA - 1024)
#else
/* We want to test the wrapping feature in invariants kernels. */
#define	SMR_SEQ_INCR		(UINT_MAX / 10000)
#define	SMR_SEQ_INIT		(UINT_MAX - 100000)
/* Force extra polls to test the integer overflow detection. */
#define	SMR_SEQ_MAX_DELTA	(1000)
#define	SMR_SEQ_MAX_ADVANCE	(SMR_SEQ_MAX_DELTA / 2)
#endif
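/*
 * A worked example of the wrap-around arithmetic, for illustration only.
 * The SMR_SEQ_LT()/SMR_SEQ_GT() comparisons come from sys/smr.h and are
 * assumed here to be signed-difference comparisons of unsigned sequence
 * numbers, in the style of the TCP SEQ_LT() macros:
 *
 *	Suppose wr_seq has wrapped around to 10 while a stale goal of
 *	UINT_MAX - 5 is still outstanding.  As plain unsigned integers
 *	10 < UINT_MAX - 5, but the modular difference
 *	10 - (UINT_MAX - 5) equals 16, which is positive when
 *	interpreted as a signed value, so SMR_SEQ_GT(10, UINT_MAX - 5)
 *	correctly reports that 10 is the newer sequence number.
 *
 * This ordering only holds while the live sequence numbers span less
 * than half of the 32-bit space, which is why SMR_SEQ_MAX_DELTA stays
 * well below UINT_MAX / 2 and why smr_advance() forces a wait once the
 * writers get too far ahead of the readers.
 */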
/*
 * Advance the write sequence and return the new value for use as the
 * wait goal.  This guarantees that any changes made by the calling
 * thread prior to this call will be visible to all threads after
 * rd_seq meets or exceeds the return value.
 *
 * This function may busy loop if the readers are roughly 1 billion
 * sequence numbers behind the writers.
 */
smr_seq_t
smr_advance(smr_t smr)
{
	smr_shared_t s;
	smr_seq_t goal;

	/*
	 * It is illegal to enter while in an smr section.
	 */
	KASSERT(curthread->td_critnest == 0,
	    ("smr_advance: Not allowed in a critical section."));

	/*
	 * Modifications not done in an smr section need to be visible
	 * before advancing the seq.
	 */
	atomic_thread_fence_rel();

	/*
	 * Increment the shared write sequence by 2.  Since it is
	 * initialized to 1 this means the only valid values are
	 * odd and an observed value of 0 in a particular CPU means
	 * it is not currently in a read section.
	 */
	s = smr->c_shared;
	goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;

	/*
	 * Force a synchronization here if the goal is getting too
	 * far ahead of the read sequence number.  This keeps the
	 * wrap-detecting arithmetic working in pathological cases.
	 */
	if (goal - atomic_load_int(&s->s_rd_seq) >= SMR_SEQ_MAX_DELTA)
		smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);

	return (goal);
}
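/*
 * A hypothetical illustration (not part of this file) of the batching
 * described in the comment at the top of the file: many deferred frees
 * can share a single goal obtained from one smr_advance() call, and the
 * wait can be postponed until the memory is about to be reused.  The
 * foo_batch structure, foo_smr handle and FOO_BATCH_SIZE are invented
 * for this sketch:
 *
 *	struct foo_batch {
 *		smr_seq_t	fb_goal;
 *		void		*fb_items[FOO_BATCH_SIZE];
 *		int		fb_count;
 *	};
 *
 *	Once the batch fills up, stamp it with a single new sequence
 *	number rather than advancing once per free:
 *		batch->fb_goal = smr_advance(foo_smr);
 *
 *	Much later, immediately before recycling the batch:
 *		smr_wait(foo_smr, batch->fb_goal);
 */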
/*
 * Poll to determine whether all readers have observed the 'goal' write
 * sequence number.
 *
 * If wait is true this will spin until the goal is met.
 *
 * This routine will update the minimum observed read sequence number in
 * s_rd_seq if it does a scan.  It may not do a scan if another call has
 * advanced s_rd_seq beyond the caller's goal already.
 *
 * Returns true if the goal is met and false if not.
 */
bool
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
	smr_shared_t s;
	smr_t c;
	smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
	int i;
	bool success;

	/*
	 * It is illegal to enter while in an smr section.
	 */
	KASSERT(!wait || curthread->td_critnest == 0,
	    ("smr_poll: Blocking not allowed in a critical section."));

	/*
	 * Use a critical section so that we can avoid ABA races
	 * caused by long preemption sleeps.
	 */
	success = true;
	critical_enter();
	s = smr->c_shared;

	/*
	 * The acquire barrier ensures that s_wr_seq is loaded after
	 * s_rd_seq so that we can not observe an updated read sequence
	 * that is larger than the write sequence.
	 */
	s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
	s_wr_seq = smr_current(smr);

	/*
	 * Detect whether the goal is valid and has already been observed.
	 *
	 * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
	 * it to be valid.  If it is not then the caller held on to it and
	 * the integer wrapped.  If we wrapped back within range the caller
	 * will harmlessly scan.
	 *
	 * A valid goal must be greater than s_rd_seq or we have not verified
	 * that it has been observed and must fall through to polling.
	 */
	if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
		goto out;

	/*
	 * Loop until all cores have observed the goal sequence or have
	 * gone inactive.  Keep track of the oldest sequence currently
	 * active as rd_seq.
	 */
	rd_seq = s_wr_seq;
	CPU_FOREACH(i) {
		c = zpcpu_get_cpu(smr, i);
		c_seq = SMR_SEQ_INVALID;
		for (;;) {
			c_seq = atomic_load_int(&c->c_seq);
			if (c_seq == SMR_SEQ_INVALID)
				break;

			/*
			 * There is a race described in smr.h:smr_enter that
			 * can lead to a stale seq value but not stale data
			 * access.  If we find a value out of range here we
			 * pin it to the current min to prevent it from
			 * advancing until that stale section has expired.
			 *
			 * The race is created when a cpu loads the s_wr_seq
			 * value in a local register and then another thread
			 * advances s_wr_seq and calls smr_poll() which will
			 * observe no value yet in c_seq and advance s_rd_seq
			 * up to s_wr_seq which is beyond the register
			 * cached value.  This is only likely to happen on
			 * a hypervisor or with a system management interrupt.
			 */
			if (SMR_SEQ_LT(c_seq, s_rd_seq))
				c_seq = s_rd_seq;

			/*
			 * If the sequence number meets the goal we are
			 * done with this cpu.
			 */
			if (SMR_SEQ_GEQ(c_seq, goal))
				break;

			/*
			 * If we're not waiting we will still scan the rest
			 * of the cpus and update s_rd_seq before returning
			 * an error.
			 */
			if (!wait) {
				success = false;
				break;
			}
			cpu_spinwait();
		}

		/*
		 * Limit the minimum observed rd_seq whether we met the goal
		 * or not.
		 */
		if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
			rd_seq = c_seq;
	}

	/*
	 * Advance the rd_seq as long as we observed the most recent one.
	 */
	s_rd_seq = atomic_load_int(&s->s_rd_seq);
	do {
		if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
			break;
	} while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);

out:
	critical_exit();

	return (success);
}
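/*
 * For illustration only: a hypothetical reclaim path built on the batch
 * sketch above might first try an opportunistic, non-blocking poll and
 * only fall back to blocking when the memory is truly needed.  Here
 * smr_wait() is taken to be the blocking form of smr_poll(), as its use
 * in smr_advance() and smr_destroy() suggests, and foo_batch_recycle()
 * is an invented helper:
 *
 *	Opportunistic check; scans but does not spin:
 *		if (smr_poll(foo_smr, batch->fb_goal, false))
 *			foo_batch_recycle(batch);
 *
 *	Blocking check, used only when the memory must be reused now:
 *		smr_wait(foo_smr, batch->fb_goal);
 *		foo_batch_recycle(batch);
 */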
smr_t
smr_create(const char *name)
{
	smr_t smr, c;
	smr_shared_t s;
	int i;

	s = uma_zalloc(smr_shared_zone, M_WAITOK);
	smr = uma_zalloc(smr_zone, M_WAITOK);

	s->s_name = name;
	s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;

	/* Initialize all CPUs, not just those running. */
	for (i = 0; i <= mp_maxid; i++) {
		c = zpcpu_get_cpu(smr, i);
		c->c_seq = SMR_SEQ_INVALID;
		c->c_shared = s;
	}
	atomic_thread_fence_seq_cst();

	return (smr);
}

void
smr_destroy(smr_t smr)
{

	smr_synchronize(smr);
	uma_zfree(smr_shared_zone, smr->c_shared);
	uma_zfree(smr_zone, smr);
}
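/*
 * A minimal lifecycle sketch (hypothetical, for illustration): an smr
 * handle is typically created once for a given lockless data structure
 * and destroyed only after no readers or writers can reach it anymore.
 *
 *	foo_smr = smr_create("foo");
 *	...
 *	readers use smr_enter()/smr_exit(); writers use smr_advance()
 *	and smr_poll()/smr_wait() as sketched earlier
 *	...
 *	smr_destroy(foo_smr);
 *
 * smr_destroy() performs a final smr_synchronize(), so every previously
 * advanced goal has been observed before the per-cpu state is freed.
 */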
/*
 * Initialize the UMA zones used to allocate the shared and per-cpu
 * SMR state.
 */
void
smr_init(void)
{

	smr_shared_zone = uma_zcreate("SMR SHARED", sizeof(struct smr_shared),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, 0);
	smr_zone = uma_zcreate("SMR CPU", sizeof(struct smr),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, UMA_ZONE_PCPU);
}