/*
 * Copyright 2011-2015 Samy Al Bahra.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The implementation here is inspired from the work described in:
 *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *   of Cambridge Computing Laboratory.
 */

#include <ck_backoff.h>
#include <ck_cc.h>
#include <ck_epoch.h>
#include <ck_pr.h>
#include <ck_stack.h>
#include <ck_stdbool.h>
#include <ck_string.h>

/*
 * Only three distinct values are used for reclamation, but reclamation occurs
 * at e+2 rather than e+1. Any thread in a "critical section" would have
 * acquired some snapshot (e) of the global epoch value (e_g) and set an active
 * flag. Any hazardous references will only occur after a full memory barrier.
 * For example, assume an initial e_g value of 1, e value of 0 and active value
 * of 0.
 *
 * ck_epoch_begin(...)
 *   e = e_g
 *   active = 1
 *   memory_barrier();
 *
 * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or
 * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread
 * has already observed the value of "1" (or the value we are incrementing
 * from). This guarantees us that for any given value e_g, any threads within
 * critical sections (referred to as "active" threads from here on) would have
 * an e value of e_g-1 or e_g. This also means that hazardous references may be
 * shared in both e_g-1 and e_g even if they are logically deleted in e_g.
 *
 * For example, assume all threads have an e value of e_g. Another thread may
 * increment e_g to e_g+1. Older threads may have a reference to an object
 * which is only deleted in e_g+1. It could be that reader threads are
 * executing some hash table look-ups, while some other writer thread (which
 * causes the epoch counter tick) actually deletes the same items that reader
 * threads are looking up (this writer thread having an e value of e_g+1).
 * This is possible if the writer thread re-observes the epoch after the
 * counter tick.
 *
 * Pseudo-code for writer:
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 * Pseudo-code for reader:
 *   for (;;) {
 *     x = ht_lookup(x)
 *     ck_pr_inc(&x->value);
 *   }
 *
 * Of course, it is also possible for references logically deleted at e_g-1 to
 * still be accessed at e_g as threads are "active" at the same time
 * (real-world time) mutating shared objects.
 *
 * Now, if the epoch counter is ticked to e_g+1, then no new hazardous
 * references could exist to objects logically deleted at e_g-1. The reason for
 * this is that at e_g+1, all epoch read-side critical sections started at
 * e_g-1 must have been completed. If any epoch read-side critical sections at
 * e_g-1 were still active, then we would never increment to e_g+1 (active != 0
 * ^ e != e_g). Additionally, e_g may still have hazardous references to
 * objects logically deleted at e_g-1 which means objects logically deleted at
 * e_g-1 cannot be deleted at e_g+1 unless all threads have observed e_g+1
 * (since it is valid for active threads to be at e_g and threads at e_g still
 * require safe memory accesses).
 *
 * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2.
 * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares
 * hazardous references with e_g, no active threads are at e_g or e_g-1. This
 * means no hazardous references could exist to objects deleted at e_g-1 (at
 * e_g+2).
 *
 * To summarize these important points,
 *   1) Active threads will always have a value of e_g or e_g-1.
 *   2) Items that are logically deleted at e_g or e_g-1 cannot be physically
 *      deleted.
 *   3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2
 *      or at e_g+1 if no threads are at e_g.
 *
 * Last but not least, if we are at e_g+2, then no active thread is at e_g
 * which means it is safe to apply modulo-3 arithmetic to the e_g value in
 * order to re-use e_g to represent the e_g+3 state. This means it is
 * sufficient to represent e_g using only the values 0, 1 or 2. Every time a
 * thread re-visits an e_g (which can be determined with a non-empty deferral
 * list) it can assume the objects in the e_g deferral list involved at least
 * three e_g transitions and are thus safe for physical deletion.
 *
 * Blocking semantics for epoch reclamation have additional restrictions.
 * Though we only require three deferral lists, reasonable blocking semantics
 * must be able to more gracefully handle bursty write work-loads which could
 * easily cause e_g wrap-around if modulo-3 arithmetic is used. This allows for
 * easy-to-trigger live-lock situations. The work-around to this is to not
 * apply modulo arithmetic to e_g but only to deferral list indexing.
 */
#define CK_EPOCH_GRACE 3U

/*
 * CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used
 * as a mask), and it must be at least 3 (see the comments above).
 */
#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0)
#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3"
#endif
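
/*
 * A rough usage sketch of the interface implemented below, assuming the
 * declarations exported by ck_epoch.h; ht_lookup, ht_delete, key and the
 * object x are illustrative placeholders rather than part of this library.
 *
 *	ck_epoch_t epoch;
 *	ck_epoch_record_t record;
 *
 *	ck_epoch_init(&epoch);
 *	ck_epoch_register(&epoch, &record, NULL);
 *
 *	Read side:
 *		ck_epoch_begin(&record, NULL);
 *		x = ht_lookup(key);		<- x stays safe to dereference
 *		ck_epoch_end(&record, NULL);
 *
 *	Write side:
 *		ht_delete(x);			<- logical deletion
 *		ck_epoch_synchronize(&record);	<- wait for a grace period
 *		free(x);			<- physical deletion
 */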

enum {
	CK_EPOCH_STATE_USED = 0,
	CK_EPOCH_STATE_FREE = 1
};

CK_STACK_CONTAINER(struct ck_epoch_record, record_next,
    ck_epoch_record_container)
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

#define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1)

bool
_ck_epoch_delref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
	struct ck_epoch_ref *current, *other;
	unsigned int i = section->bucket;

	current = &record->local.bucket[i];
	current->count--;

	if (current->count > 0)
		return false;

	/*
	 * If the current bucket no longer has any references, then
	 * determine whether we have already transitioned into a newer
	 * epoch. If so, then make sure to update our shared snapshot
	 * to allow for forward progress.
	 *
	 * If no other active bucket exists, then the record will go
	 * inactive in order to allow for forward progress.
	 */
	other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
	if (other->count > 0 &&
	    ((int)(current->epoch - other->epoch) < 0)) {
		/*
		 * The other epoch value is actually the newest,
		 * transition to it.
		 */
		ck_pr_store_uint(&record->epoch, other->epoch);
	}

	return true;
}

void
_ck_epoch_addref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
	struct ck_epoch *global = record->global;
	struct ck_epoch_ref *ref;
	unsigned int epoch, i;

	epoch = ck_pr_load_uint(&global->epoch);
	i = epoch & CK_EPOCH_SENSE_MASK;
	ref = &record->local.bucket[i];

	if (ref->count++ == 0) {
#ifndef CK_MD_TSO
		struct ck_epoch_ref *previous;

		/*
		 * The system has already ticked. If another non-zero bucket
		 * exists, make sure to order our observations with respect
		 * to it. Otherwise, it is possible to acquire a reference
		 * from the previous epoch generation.
		 *
		 * On TSO architectures, the monotonicity of the global counter
		 * and load-{store, load} ordering are sufficient to guarantee
		 * this ordering.
		 */
		previous = &record->local.bucket[(i + 1) &
		    CK_EPOCH_SENSE_MASK];
		if (previous->count > 0)
			ck_pr_fence_acqrel();
#endif /* !CK_MD_TSO */

		/*
		 * If this is a new reference into the current
		 * bucket then cache the associated epoch value.
		 */
		ref->epoch = epoch;
	}

	section->bucket = i;
	return;
}

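/*
 * The two functions above are the section-tracking slow paths behind
 * ck_epoch_begin() and ck_epoch_end() when a ck_epoch_section_t is supplied,
 * which is what permits nested read-side sections that straddle an epoch
 * tick. A hedged sketch of such nesting (the section variables are
 * illustrative):
 *
 *	ck_epoch_section_t outer, inner;
 *
 *	ck_epoch_begin(&record, &outer);	references one sense bucket
 *	...					the global epoch may tick here
 *	ck_epoch_begin(&record, &inner);	may reference the other bucket
 *	ck_epoch_end(&record, &inner);
 *	ck_epoch_end(&record, &outer);		record goes inactive only here
 */
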
void
ck_epoch_init(struct ck_epoch *global)
{

	ck_stack_init(&global->records);
	global->epoch = 1;
	global->n_free = 0;
	ck_pr_fence_store();
	return;
}

struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global, void *ct)
{
	struct ck_epoch_record *record;
	ck_stack_entry_t *cursor;
	unsigned int state;

	if (ck_pr_load_uint(&global->n_free) == 0)
		return NULL;

	CK_STACK_FOREACH(&global->records, cursor) {
		record = ck_epoch_record_container(cursor);

		if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) {
			/* Serialize with respect to deferral list clean-up. */
			ck_pr_fence_load();
			state = ck_pr_fas_uint(&record->state,
			    CK_EPOCH_STATE_USED);
			if (state == CK_EPOCH_STATE_FREE) {
				ck_pr_dec_uint(&global->n_free);
				ck_pr_store_ptr(&record->ct, ct);

				/*
				 * The context pointer is ordered by a
				 * subsequent protected section.
				 */
				return record;
			}
		}
	}

	return NULL;
}

void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
    void *ct)
{
	size_t i;

	record->global = global;
	record->state = CK_EPOCH_STATE_USED;
	record->active = 0;
	record->epoch = 0;
	record->n_dispatch = 0;
	record->n_peak = 0;
	record->n_pending = 0;
	record->ct = ct;
	memset(&record->local, 0, sizeof record->local);

	for (i = 0; i < CK_EPOCH_LENGTH; i++)
		ck_stack_init(&record->pending[i]);

	ck_pr_fence_store();
	ck_stack_push_upmc(&global->records, &record->record_next);
	return;
}

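/*
 * A common caller pattern, sketched here under the ck_epoch.h interface, is
 * to attempt to recycle an unused record before allocating and registering a
 * fresh one; the allocator and error handling are illustrative only.
 *
 *	ck_epoch_record_t *record;
 *
 *	record = ck_epoch_recycle(&epoch, NULL);
 *	if (record == NULL) {
 *		record = malloc(sizeof *record);
 *		if (record == NULL)
 *			abort();
 *
 *		ck_epoch_register(&epoch, record, NULL);
 *	}
 */
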
void
ck_epoch_unregister(struct ck_epoch_record *record)
{
	struct ck_epoch *global = record->global;
	size_t i;

	record->active = 0;
	record->epoch = 0;
	record->n_dispatch = 0;
	record->n_peak = 0;
	record->n_pending = 0;
	memset(&record->local, 0, sizeof record->local);

	for (i = 0; i < CK_EPOCH_LENGTH; i++)
		ck_stack_init(&record->pending[i]);

	ck_pr_store_ptr(&record->ct, NULL);
	ck_pr_fence_store();
	ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
	ck_pr_inc_uint(&global->n_free);
	return;
}

static struct ck_epoch_record *
ck_epoch_scan(struct ck_epoch *global,
    struct ck_epoch_record *cr,
    unsigned int epoch,
    bool *af)
{
	ck_stack_entry_t *cursor;

	if (cr == NULL) {
		cursor = CK_STACK_FIRST(&global->records);
		*af = false;
	} else {
		cursor = &cr->record_next;
		*af = true;
	}

	while (cursor != NULL) {
		unsigned int state, active;

		cr = ck_epoch_record_container(cursor);

		state = ck_pr_load_uint(&cr->state);
		if (state & CK_EPOCH_STATE_FREE) {
			cursor = CK_STACK_NEXT(cursor);
			continue;
		}

		active = ck_pr_load_uint(&cr->active);
		*af |= active;

		if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch)
			return cr;

		cursor = CK_STACK_NEXT(cursor);
	}

	return NULL;
}

static unsigned int
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred)
{
	unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
	ck_stack_entry_t *head, *next, *cursor;
	unsigned int n_pending, n_peak;
	unsigned int i = 0;

	head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
	for (cursor = head; cursor != NULL; cursor = next) {
		struct ck_epoch_entry *entry =
		    ck_epoch_entry_container(cursor);

		next = CK_STACK_NEXT(cursor);
		if (deferred != NULL)
			ck_stack_push_spnc(deferred, &entry->stack_entry);
		else
			entry->function(entry);

		i++;
	}

	n_peak = ck_pr_load_uint(&record->n_peak);
	n_pending = ck_pr_load_uint(&record->n_pending);

	/* We don't require accuracy around peak calculation. */
	if (n_pending > n_peak)
		ck_pr_store_uint(&record->n_peak, n_pending);

	if (i > 0) {
		ck_pr_add_uint(&record->n_dispatch, i);
		ck_pr_sub_uint(&record->n_pending, i);
	}

	return i;
}

/*
 * Reclaim all objects associated with a record.
 */
void
ck_epoch_reclaim(struct ck_epoch_record *record)
{
	unsigned int epoch;

	for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
		ck_epoch_dispatch(record, epoch, NULL);

	return;
}

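/*
 * The per-epoch pending stacks drained by ck_epoch_dispatch() are populated
 * by ck_epoch_call() in ck_epoch.h. A hedged sketch of deferring destruction
 * of an object with an embedded ck_epoch_entry_t (the object type and
 * destructor below are illustrative):
 *
 *	struct object {
 *		int value;
 *		ck_epoch_entry_t epoch_entry;
 *	};
 *	CK_EPOCH_CONTAINER(struct object, epoch_entry, object_container)
 *
 *	static void
 *	object_destroy(ck_epoch_entry_t *e)
 *	{
 *
 *		free(object_container(e));
 *		return;
 *	}
 *
 *	ck_epoch_call(&record, &o->epoch_entry, object_destroy);
 *	ck_epoch_barrier(&record);	blocks until o is safe to destroy
 */
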
CK_CC_FORCE_INLINE static void
epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
    ck_epoch_wait_cb_t *cb, void *ct)
{

	if (cb != NULL)
		cb(global, cr, ct);

	return;
}

/*
 * This function must not be called within a read-side section.
 */
void
ck_epoch_synchronize_wait(struct ck_epoch *global,
    ck_epoch_wait_cb_t *cb, void *ct)
{
	struct ck_epoch_record *cr;
	unsigned int delta, epoch, goal, i;
	bool active;

	ck_pr_fence_memory();

	/*
	 * The observation of the global epoch must be ordered with respect to
	 * all prior operations. The re-ordering of loads is permitted given
	 * the monotonicity of the global epoch counter.
	 *
	 * If UINT_MAX concurrent mutations were to occur then it is possible
	 * to encounter an ABA-issue. If this is a concern, consider tuning
	 * write-side concurrency.
	 */
	delta = epoch = ck_pr_load_uint(&global->epoch);
	goal = epoch + CK_EPOCH_GRACE;

	for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) {
		bool r;

		/*
		 * Determine whether all threads have observed the current
		 * epoch with respect to the updates on invocation.
		 */
		while (cr = ck_epoch_scan(global, cr, delta, &active),
		    cr != NULL) {
			unsigned int e_d;

			ck_pr_stall();

			/*
			 * Another writer may have already observed a grace
			 * period.
			 */
			e_d = ck_pr_load_uint(&global->epoch);
			if (e_d == delta) {
				epoch_block(global, cr, cb, ct);
				continue;
			}

			/*
			 * If the epoch has been updated, we may have already
			 * met our goal.
			 */
			delta = e_d;
			if ((goal > epoch) & (delta >= goal))
				goto leave;

			epoch_block(global, cr, cb, ct);

			/*
			 * If the epoch has been updated, then a grace period
			 * requires that all threads are observed idle at the
			 * same epoch.
			 */
			cr = NULL;
		}

		/*
		 * If we have observed all threads as inactive, then we assume
		 * we are at a grace period.
		 */
		if (active == false)
			break;

		/*
		 * Increment current epoch. CAS semantics are used to eliminate
		 * increment operations for synchronization that occurs for the
		 * same global epoch value snapshot.
		 *
		 * If we can guarantee there will only be one active barrier or
		 * epoch tick at a given time, then it is sufficient to use an
		 * increment operation. In a multi-barrier workload, however,
		 * it is possible to overflow the epoch value if we apply
		 * modulo-3 arithmetic.
		 */
		r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1,
		    &delta);

		/* Order subsequent thread active checks. */
		ck_pr_fence_atomic_load();

		/*
		 * If CAS has succeeded, then set delta to latest snapshot.
		 * Otherwise, we have just acquired latest snapshot.
		 */
		delta = delta + r;
	}

	/*
	 * A majority of use-cases will not require full barrier semantics.
	 * However, if non-temporal instructions are used, full barrier
	 * semantics are necessary.
	 */
leave:
	ck_pr_fence_memory();
	return;
}

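/*
 * The wait callback allows a caller to block or back off while the grace
 * period is held up by a slow reader. A minimal sketch, assuming the
 * callback shape used by epoch_block() above; sched_yield() stands in for
 * whatever backoff the caller prefers.
 *
 *	static void
 *	epoch_wait(ck_epoch_t *global, ck_epoch_record_t *cr, void *ct)
 *	{
 *
 *		(void)global;
 *		(void)cr;
 *		(void)ct;
 *		sched_yield();
 *	}
 *
 *	ck_epoch_synchronize_wait(&epoch, epoch_wait, NULL);
 */
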
void
ck_epoch_synchronize(struct ck_epoch_record *record)
{

	ck_epoch_synchronize_wait(record->global, NULL, NULL);
	return;
}

void
ck_epoch_barrier(struct ck_epoch_record *record)
{

	ck_epoch_synchronize(record);
	ck_epoch_reclaim(record);
	return;
}

void
ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
    void *ct)
{

	ck_epoch_synchronize_wait(record->global, cb, ct);
	ck_epoch_reclaim(record);
	return;
}

/*
 * It may be worth it to actually apply these deferral semantics to an epoch
 * that was observed at ck_epoch_call time. The problem is that the latter
 * would require a full fence.
 *
 * ck_epoch_call will dispatch to the latest epoch snapshot that was observed.
 * There are cases where it will fail to reclaim as early as it could. If this
 * becomes a problem, we could actually use a heap for epoch buckets but that
 * is far from ideal too.
 */
bool
ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
{
	bool active;
	unsigned int epoch;
	struct ck_epoch_record *cr = NULL;
	struct ck_epoch *global = record->global;
	unsigned int n_dispatch;

	epoch = ck_pr_load_uint(&global->epoch);

	/* Serialize epoch snapshots with respect to global epoch. */
	ck_pr_fence_memory();

	/*
	 * At this point, epoch is the current global epoch value.
	 * There may or may not be active threads which observed epoch - 1.
	 * (ck_epoch_scan() will tell us that). However, there should be
	 * no active threads which observed epoch - 2.
	 *
	 * Note that checking epoch - 2 is necessary, as race conditions can
	 * allow another thread to increment the global epoch before this
	 * thread runs.
	 */
	n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred);

	cr = ck_epoch_scan(global, cr, epoch, &active);
	if (cr != NULL)
		return (n_dispatch > 0);

	/* We are at a grace period if all threads are inactive. */
	if (active == false) {
		record->epoch = epoch;
		for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
			ck_epoch_dispatch(record, epoch, deferred);

		return true;
	}

	/*
	 * If an active thread exists, rely on epoch observation.
	 *
	 * All the active threads entered the epoch section during
	 * the current epoch. Therefore, we can now run the handlers
	 * for the immediately preceding epoch and attempt to
	 * advance the epoch if it hasn't been already.
	 */
	(void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);

	ck_epoch_dispatch(record, epoch - 1, deferred);
	return true;
}

bool
ck_epoch_poll(struct ck_epoch_record *record)
{

	return ck_epoch_poll_deferred(record, NULL);
}
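
/*
 * Unlike ck_epoch_barrier(), ck_epoch_poll() never blocks: it dispatches
 * whatever deferrals have become safe and reports whether it made
 * reclamation progress. A hedged sketch of a writer amortizing reclamation
 * across its update loop (remove_and_defer_object() is an illustrative
 * placeholder that logically deletes an object and passes it to
 * ck_epoch_call()):
 *
 *	for (;;) {
 *		remove_and_defer_object(&record);
 *
 *		(void)ck_epoch_poll(&record);	best-effort, non-blocking
 *	}
 */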