/*
 * Copyright 2011-2015 Samy Al Bahra.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The implementation here is inspired by the work described in:
 *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *   of Cambridge Computing Laboratory.
 */

#include <ck_backoff.h>
#include <ck_cc.h>
#include <ck_epoch.h>
#include <ck_pr.h>
#include <ck_stack.h>
#include <ck_stdbool.h>
#include <ck_string.h>

/*
 * Only three distinct values are used for reclamation, but reclamation occurs
 * at e+2 rather than e+1. Any thread in a "critical section" would have
 * acquired some snapshot (e) of the global epoch value (e_g) and set an active
 * flag. Any hazardous references will only occur after a full memory barrier.
 * For example, assume an initial e_g value of 1, e value of 0 and active value
 * of 0.
 *
 * ck_epoch_begin(...)
 *   e = e_g
 *   active = 1
 *   memory_barrier();
 *
 * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or
 * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread
 * has already observed the value of "1" (or the value we are incrementing
 * from). This guarantees that for any given value of e_g, any threads within
 * critical sections (referred to as "active" threads from here on) have an
 * e value of e_g-1 or e_g. This also means that hazardous references may be
 * shared in both e_g-1 and e_g even if they are logically deleted in e_g.
 *
 * For example, assume all threads have an e value of e_g. Another thread may
 * increment e_g to e_g+1. Older threads may have a reference to an object
 * which is only deleted in e_g+1. It could be that reader threads are
 * executing some hash table look-ups, while some other writer thread (which
 * causes the epoch counter tick) actually deletes the same items that reader
 * threads are looking up (this writer thread having an e value of e_g+1).
 * This is possible if the writer thread re-observes the epoch after the
 * counter tick.
 *
 * Pseudo-code for writer:
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 * Pseudo-code for reader:
 *   for (;;) {
 *     x = ht_lookup(x)
 *     ck_pr_inc(&x->value);
 *   }
 *
 * Of course, it is also possible for references logically deleted at e_g-1 to
 * still be accessed at e_g as threads are "active" at the same time
 * (real-world time) mutating shared objects.
 *
 * Now, if the epoch counter is ticked to e_g+1, then no new hazardous
 * references could exist to objects logically deleted at e_g-1. The reason for
 * this is that at e_g+1, all epoch read-side critical sections started at
 * e_g-1 must have been completed. If any epoch read-side critical sections at
 * e_g-1 were still active, then we would never increment to e_g+1 (a thread
 * with active != 0 and e != e_g prevents the increment). Additionally, e_g may
 * still have hazardous references to objects logically deleted at e_g-1,
 * which means objects logically deleted at e_g-1 cannot be deleted at e_g+1
 * unless all threads have observed e_g+1 (since it is valid for active threads
 * to be at e_g and threads at e_g still require safe memory accesses).
 *
 * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2.
 * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares
 * hazardous references with e_g, no active threads are at e_g or e_g-1. This
 * means no hazardous references could exist to objects deleted at e_g-1 (at
 * e_g+2).
 *
 * To summarize these important points,
 *   1) Active threads will always have a value of e_g or e_g-1.
 *   2) Items that are logically deleted at e_g or e_g-1 cannot be physically
 *      deleted.
 *   3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2
 *      or at e_g+1 if no threads are at e_g.
 *
 * Last but not least, if we are at e_g+2, then no active thread is at e_g,
 * which means it is safe to apply modulo-3 arithmetic to the e_g value in
 * order to re-use e_g to represent the e_g+3 state. This means it is
 * sufficient to represent e_g using only the values 0, 1 or 2. Every time a
 * thread re-visits an e_g value (which can be determined by a non-empty
 * deferral list) it can assume that objects in the e_g deferral list have
 * survived at least three e_g transitions and are thus safe for physical
 * deletion.
 *
 * Blocking semantics for epoch reclamation have additional restrictions.
 * Though we only require three deferral lists, reasonable blocking semantics
 * must handle bursty write work-loads gracefully; such work-loads could
 * easily cause e_g wrap-around if modulo-3 arithmetic were applied to it,
 * making live-lock situations easy to trigger. The work-around is to not
 * apply modulo arithmetic to e_g itself but only to deferral-list indexing.
 */
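
/*
 * Illustrative sketch (not part of the implementation): a minimal write-side
 * reclamation pattern against the public API. The object type, its embedded
 * epoch_entry field and the object_destroy() callback are hypothetical:
 *
 *   ck_epoch_begin(record, NULL);
 *   unlink object from the shared structure     (logical deletion)
 *   ck_epoch_end(record, NULL);
 *
 *   ck_epoch_call(record, &object->epoch_entry, object_destroy);
 *   ck_epoch_poll(record);       (non-blocking attempt to dispatch callbacks)
 *
 * If blocking is acceptable, ck_epoch_barrier(record) may be used in place of
 * ck_epoch_poll(record); it waits for a grace period and then dispatches all
 * callbacks pending on the record.
 */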

#define CK_EPOCH_GRACE 3U

/*
 * CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used
 * as a mask) and it must be at least 3 (see the comments above).
 */
#if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0)
#error "CK_EPOCH_LENGTH must be a power of 2 and >= 3"
#endif
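
/*
 * Illustrative note (not part of the implementation): the global epoch
 * counter itself is never reduced modulo anything; only deferral-list
 * indexing wraps, via e & (CK_EPOCH_LENGTH - 1) in ck_epoch_dispatch().
 * For example, if CK_EPOCH_LENGTH is 4, epoch values 0, 1, 2, 3, 4, 5
 * select pending buckets 0, 1, 2, 3, 0, 1.
 */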

enum {
        CK_EPOCH_STATE_USED = 0,
        CK_EPOCH_STATE_FREE = 1
};

CK_STACK_CONTAINER(struct ck_epoch_record, record_next,
    ck_epoch_record_container)
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

#define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1)

bool
_ck_epoch_delref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
        struct ck_epoch_ref *current, *other;
        unsigned int i = section->bucket;

        current = &record->local.bucket[i];
        current->count--;

        if (current->count > 0)
                return false;

        /*
         * If the current bucket no longer has any references, then
         * determine whether we have already transitioned into a newer
         * epoch. If so, then make sure to update our shared snapshot
         * to allow for forward progress.
         *
         * If no other active bucket exists, then the record will go
         * inactive in order to allow for forward progress.
         */
        other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
        if (other->count > 0 &&
            ((int)(current->epoch - other->epoch) < 0)) {
                /*
                 * The other epoch value is actually the newest,
                 * transition to it.
                 */
                ck_pr_store_uint(&record->epoch, other->epoch);
        }

        return true;
}

void
_ck_epoch_addref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
        struct ck_epoch *global = record->global;
        struct ck_epoch_ref *ref;
        unsigned int epoch, i;

        epoch = ck_pr_load_uint(&global->epoch);
        i = epoch & CK_EPOCH_SENSE_MASK;
        ref = &record->local.bucket[i];

        if (ref->count++ == 0) {
#ifndef CK_MD_TSO
                struct ck_epoch_ref *previous;

                /*
                 * The system has already ticked. If another non-zero bucket
                 * exists, make sure to order our observations with respect
                 * to it. Otherwise, it is possible to acquire a reference
                 * from the previous epoch generation.
                 *
                 * On TSO architectures, the monotonicity of the global
                 * counter and load-{store, load} ordering are sufficient to
                 * guarantee this ordering.
                 */
                previous = &record->local.bucket[(i + 1) &
                    CK_EPOCH_SENSE_MASK];
                if (previous->count > 0)
                        ck_pr_fence_acqrel();
#endif /* !CK_MD_TSO */

                /*
                 * If this is a new reference into the current bucket, then
                 * cache the associated epoch value.
                 */
                ref->epoch = epoch;
        }

        section->bucket = i;
        return;
}
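
/*
 * Illustrative sketch (not part of the implementation): the two sense
 * buckets above allow overlapping read-side sections on the same record
 * to straddle an epoch tick. A section-based reader looks roughly like
 * the following (the lookup call and object name are hypothetical):
 *
 *   ck_epoch_section_t section;
 *
 *   ck_epoch_begin(record, &section);
 *   object = lookup(key);        (safe to dereference until the end call)
 *   ck_epoch_end(record, &section);
 *
 * Passing NULL for the section is also valid and skips the per-bucket
 * reference counting for the simple, non-overlapping case.
 */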

void
ck_epoch_init(struct ck_epoch *global)
{

        ck_stack_init(&global->records);
        global->epoch = 1;
        global->n_free = 0;
        ck_pr_fence_store();
        return;
}

struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global, void *ct)
{
        struct ck_epoch_record *record;
        ck_stack_entry_t *cursor;
        unsigned int state;

        if (ck_pr_load_uint(&global->n_free) == 0)
                return NULL;

        CK_STACK_FOREACH(&global->records, cursor) {
                record = ck_epoch_record_container(cursor);

                if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) {
                        /* Serialize with respect to deferral list clean-up. */
                        ck_pr_fence_load();
                        state = ck_pr_fas_uint(&record->state,
                            CK_EPOCH_STATE_USED);
                        if (state == CK_EPOCH_STATE_FREE) {
                                ck_pr_dec_uint(&global->n_free);
                                ck_pr_store_ptr(&record->ct, ct);

                                /*
                                 * The context pointer is ordered by a
                                 * subsequent protected section.
                                 */
                                return record;
                        }
                }
        }

        return NULL;
}

void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
    void *ct)
{
        size_t i;

        record->global = global;
        record->state = CK_EPOCH_STATE_USED;
        record->active = 0;
        record->epoch = 0;
        record->n_dispatch = 0;
        record->n_peak = 0;
        record->n_pending = 0;
        record->ct = ct;
        memset(&record->local, 0, sizeof record->local);

        for (i = 0; i < CK_EPOCH_LENGTH; i++)
                ck_stack_init(&record->pending[i]);

        ck_pr_fence_store();
        ck_stack_push_upmc(&global->records, &record->record_next);
        return;
}

void
ck_epoch_unregister(struct ck_epoch_record *record)
{
        struct ck_epoch *global = record->global;
        size_t i;

        record->active = 0;
        record->epoch = 0;
        record->n_dispatch = 0;
        record->n_peak = 0;
        record->n_pending = 0;
        memset(&record->local, 0, sizeof record->local);

        for (i = 0; i < CK_EPOCH_LENGTH; i++)
                ck_stack_init(&record->pending[i]);

        ck_pr_store_ptr(&record->ct, NULL);
        ck_pr_fence_store();
        ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
        ck_pr_inc_uint(&global->n_free);
        return;
}
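
/*
 * Illustrative sketch (not part of the implementation): callers typically
 * try to recycle a free record before allocating a new one; the allocator
 * and global_epoch names below are hypothetical:
 *
 *   record = ck_epoch_recycle(&global_epoch, NULL);
 *   if (record == NULL) {
 *     record = allocate_record();
 *     ck_epoch_register(&global_epoch, record, NULL);
 *   }
 *
 * On thread exit, ck_epoch_unregister(record) marks the record free so a
 * later thread can recycle it rather than registering a new one.
 */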

static struct ck_epoch_record *
ck_epoch_scan(struct ck_epoch *global,
    struct ck_epoch_record *cr,
    unsigned int epoch,
    bool *af)
{
        ck_stack_entry_t *cursor;

        if (cr == NULL) {
                cursor = CK_STACK_FIRST(&global->records);
                *af = false;
        } else {
                cursor = &cr->record_next;
                *af = true;
        }

        while (cursor != NULL) {
                unsigned int state, active;

                cr = ck_epoch_record_container(cursor);

                state = ck_pr_load_uint(&cr->state);
                if (state & CK_EPOCH_STATE_FREE) {
                        cursor = CK_STACK_NEXT(cursor);
                        continue;
                }

                active = ck_pr_load_uint(&cr->active);
                *af |= active;

                if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch)
                        return cr;

                cursor = CK_STACK_NEXT(cursor);
        }

        return NULL;
}

static unsigned int
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e,
    ck_stack_t *deferred)
{
        unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
        ck_stack_entry_t *head, *next, *cursor;
        unsigned int n_pending, n_peak;
        unsigned int i = 0;

        head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
        for (cursor = head; cursor != NULL; cursor = next) {
                struct ck_epoch_entry *entry =
                    ck_epoch_entry_container(cursor);

                next = CK_STACK_NEXT(cursor);
                if (deferred != NULL)
                        ck_stack_push_spnc(deferred, &entry->stack_entry);
                else
                        entry->function(entry);

                i++;
        }

        n_peak = ck_pr_load_uint(&record->n_peak);
        n_pending = ck_pr_load_uint(&record->n_pending);

        /* We don't require accuracy around peak calculation. */
        if (n_pending > n_peak)
                ck_pr_store_uint(&record->n_peak, n_pending);

        if (i > 0) {
                ck_pr_add_uint(&record->n_dispatch, i);
                ck_pr_sub_uint(&record->n_pending, i);
        }

        return i;
}

/*
 * Reclaim all objects associated with a record.
 */
void
ck_epoch_reclaim(struct ck_epoch_record *record)
{
        unsigned int epoch;

        for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
                ck_epoch_dispatch(record, epoch, NULL);

        return;
}

CK_CC_FORCE_INLINE static void
epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
    ck_epoch_wait_cb_t *cb, void *ct)
{

        if (cb != NULL)
                cb(global, cr, ct);

        return;
}

/*
 * This function must not be called within a read-side section.
 */
void
ck_epoch_synchronize_wait(struct ck_epoch *global,
    ck_epoch_wait_cb_t *cb, void *ct)
{
        struct ck_epoch_record *cr;
        unsigned int delta, epoch, goal, i;
        bool active;

        ck_pr_fence_memory();

        /*
         * The observation of the global epoch must be ordered with respect to
         * all prior operations. The re-ordering of loads is permitted given
         * monotonicity of the global epoch counter.
         *
         * If UINT_MAX concurrent mutations were to occur then it is possible
         * to encounter an ABA-issue. If this is a concern, consider tuning
         * write-side concurrency.
         */
        delta = epoch = ck_pr_load_uint(&global->epoch);
        goal = epoch + CK_EPOCH_GRACE;

        for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) {
                bool r;

                /*
                 * Determine whether all threads have observed the current
                 * epoch with respect to the updates on invocation.
                 */
                while (cr = ck_epoch_scan(global, cr, delta, &active),
                    cr != NULL) {
                        unsigned int e_d;

                        ck_pr_stall();

                        /*
                         * Another writer may have already observed a grace
                         * period.
                         */
                        e_d = ck_pr_load_uint(&global->epoch);
                        if (e_d == delta) {
                                epoch_block(global, cr, cb, ct);
                                continue;
                        }

                        /*
                         * If the epoch has been updated, we may have already
                         * met our goal.
                         */
                        delta = e_d;
                        if ((goal > epoch) & (delta >= goal))
                                goto leave;

                        epoch_block(global, cr, cb, ct);

                        /*
                         * If the epoch has been updated, then a grace period
                         * requires that all threads are observed idle at the
                         * same epoch.
                         */
                        cr = NULL;
                }

                /*
                 * If we have observed all threads as inactive, then we assume
                 * we are at a grace period.
                 */
                if (active == false)
                        break;

                /*
                 * Increment the current epoch. CAS semantics are used to
                 * eliminate increment operations for synchronization that
                 * occurs for the same global epoch value snapshot.
                 *
                 * If we can guarantee there will only be one active barrier or
                 * epoch tick at a given time, then it is sufficient to use an
                 * increment operation. In a multi-barrier workload, however,
                 * it is possible to overflow the epoch value if we apply
                 * modulo-3 arithmetic.
                 */
                r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1,
                    &delta);

                /* Order subsequent thread active checks. */
                ck_pr_fence_atomic_load();

                /*
                 * If the CAS has succeeded, then set delta to the latest
                 * snapshot. Otherwise, we have just acquired the latest
                 * snapshot.
                 */
                delta = delta + r;
        }

        /*
         * A majority of use-cases will not require full barrier semantics.
         * However, if non-temporal instructions are used, full barrier
         * semantics are necessary.
         */
leave:
        ck_pr_fence_memory();
        return;
}
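
/*
 * Illustrative sketch (not part of the implementation): the wait callback
 * passed to ck_epoch_synchronize_wait() is invoked whenever the grace period
 * is blocked on an active record, which allows the caller to yield or apply
 * back-off rather than spin. The callback name and the yield primitive below
 * are hypothetical:
 *
 *   static void
 *   yield_cb(ck_epoch_t *global, ck_epoch_record_t *blocking, void *ct)
 *   {
 *
 *     sched_yield();
 *     return;
 *   }
 *
 *   ck_epoch_synchronize_wait(&global_epoch, yield_cb, NULL);
 */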

void
ck_epoch_synchronize(struct ck_epoch_record *record)
{

        ck_epoch_synchronize_wait(record->global, NULL, NULL);
        return;
}

void
ck_epoch_barrier(struct ck_epoch_record *record)
{

        ck_epoch_synchronize(record);
        ck_epoch_reclaim(record);
        return;
}

void
ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
    void *ct)
{

        ck_epoch_synchronize_wait(record->global, cb, ct);
        ck_epoch_reclaim(record);
        return;
}

/*
 * It may be worth it to actually apply these deferral semantics to an epoch
 * that was observed at ck_epoch_call time. The problem is that the latter
 * would require a full fence.
 *
 * ck_epoch_call will dispatch to the latest epoch snapshot that was observed.
 * There are cases where it will fail to reclaim as early as it could. If this
 * becomes a problem, we could actually use a heap for epoch buckets, but that
 * is far from ideal too.
 */
bool
ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
{
        bool active;
        unsigned int epoch;
        struct ck_epoch_record *cr = NULL;
        struct ck_epoch *global = record->global;
        unsigned int n_dispatch;

        epoch = ck_pr_load_uint(&global->epoch);

        /* Serialize epoch snapshots with respect to global epoch. */
        ck_pr_fence_memory();

        /*
         * At this point, epoch is the current global epoch value.
         * There may or may not be active threads which observed epoch - 1.
         * (ck_epoch_scan() will tell us that.) However, there should be
         * no active threads which observed epoch - 2.
         *
         * Note that checking epoch - 2 is necessary, as race conditions can
         * allow another thread to increment the global epoch before this
         * thread runs.
         */
        n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred);

        cr = ck_epoch_scan(global, cr, epoch, &active);
        if (cr != NULL)
                return (n_dispatch > 0);

        /* We are at a grace period if all threads are inactive. */
        if (active == false) {
                record->epoch = epoch;
                for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
                        ck_epoch_dispatch(record, epoch, deferred);

                return true;
        }

        /*
         * If an active thread exists, rely on epoch observation.
         *
         * All the active threads entered the epoch section during
         * the current epoch. Therefore, we can now run the handlers
         * for the immediately preceding epoch and attempt to
         * advance the epoch if it hasn't been already.
         */
        (void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);

        ck_epoch_dispatch(record, epoch - 1, deferred);
        return true;
}

bool
ck_epoch_poll(struct ck_epoch_record *record)
{

        return ck_epoch_poll_deferred(record, NULL);
}
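
/*
 * Illustrative sketch (not part of the implementation): ck_epoch_poll_deferred
 * lets the caller collect expired entries instead of running their callbacks
 * in place, for example so destruction can happen outside a latency-sensitive
 * path. The drain loop below mirrors ck_epoch_dispatch() and is only a sketch:
 *
 *   ck_stack_t expired;
 *   ck_stack_entry_t *cursor, *next;
 *
 *   ck_stack_init(&expired);
 *   (void)ck_epoch_poll_deferred(record, &expired);
 *
 *   for (cursor = CK_STACK_FIRST(&expired); cursor != NULL; cursor = next) {
 *     struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor);
 *
 *     next = CK_STACK_NEXT(cursor);
 *     entry->function(entry);
 *   }
 */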