| /* |
| * Copyright 2011-2015 Samy Al Bahra. |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| */ |
| |
| /* |
| * The implementation here is inspired by the work described in: |
| * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University |
| * of Cambridge Computing Laboratory. |
| */ |
| |
| #include <ck_backoff.h> |
| #include <ck_cc.h> |
| #include <ck_epoch.h> |
| #include <ck_pr.h> |
| #include <ck_stack.h> |
| #include <ck_stdbool.h> |
| #include <ck_string.h> |
| |
| /* |
| * Only three distinct values are used for reclamation, but reclamation occurs |
| * at e+2 rather than e+1. Any thread in a "critical section" would have |
| * acquired some snapshot (e) of the global epoch value (e_g) and set an active |
| * flag. Any hazardous references will only occur after a full memory barrier. |
| * For example, assume an initial e_g value of 1, e value of 0 and active value |
| * of 0. |
| * |
| * ck_epoch_begin(...) |
| * e = e_g |
| * active = 1 |
| * memory_barrier(); |
| * |
| * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or |
| * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread |
| * has already observed the value of "1" (or the value we are incrementing |
| * from). This guarantees that for any given value of e_g, any threads within |
| * critical sections (referred to as "active" threads from here on) would have |
| * an e value of e_g-1 or e_g. This also means that hazardous references may be |
| * shared in both e_g-1 and e_g even if they are logically deleted in e_g. |
| * |
| * For example, assume all threads have an e value of e_g. Another thread may |
| * increment e_g to e_g+1. Older threads may have a reference to an object |
| * which is only deleted in e_g+1. It could be that reader threads are |
| * executing some hash table look-ups, while some other writer thread (which |
| * causes an epoch counter tick) actually deletes the same items that reader |
| * threads are looking up (this writer thread having an e value of e_g+1). |
| * This is possible if the writer thread re-observes the epoch after the |
| * counter tick. |
| * |
| * Pseudo-code for writer: |
| * ck_epoch_begin() |
| * ht_delete(x) |
| * ck_epoch_end() |
| * ck_epoch_begin() |
| * ht_delete(x) |
| * ck_epoch_end() |
| * |
| * Pseudo-code for reader: |
| * for (;;) { |
| * x = ht_lookup(x) |
| * ck_pr_inc(&x->value); |
| * } |
| * |
| * Of course, it is also possible for references logically deleted at e_g-1 to |
| * still be accessed at e_g as threads are "active" at the same time |
| * (real-world time) mutating shared objects. |
| * |
| * Now, if the epoch counter is ticked to e_g+1, then no new hazardous |
| * references could exist to objects logically deleted at e_g-1. The reason for |
| * this is that at e_g+1, all epoch read-side critical sections started at |
| * e_g-1 must have been completed. If any epoch read-side critical sections at |
| * e_g-1 were still active, then we would never increment to e_g+1 (that is, |
| * active != 0 && e != e_g). Additionally, e_g may still have hazardous references to |
| * objects logically deleted at e_g-1 which means objects logically deleted at |
| * e_g-1 cannot be deleted at e_g+1 unless all threads have observed e_g+1 |
| * (since it is valid for active threads to be at e_g and threads at e_g still |
| * require safe memory accesses). |
| * |
| * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2. |
| * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares |
| * hazardous references with e_g, no active threads are at e_g or e_g-1. This |
| * means no hazardous references could exist to objects deleted at e_g-1 (at |
| * e_g+2). |
| * |
| * To summarize these important points, |
| * 1) Active threads will always have a value of e_g or e_g-1. |
| * 2) Items that are logically deleted at e_g or e_g-1 cannot be physically |
| * deleted. |
| * 3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2 |
| * or at e_g+1 if no threads are at e_g. |
| * |
| * Last but not least, if we are at e_g+2, then no active thread is at e_g |
| * which means it is safe to apply modulo-3 arithmetic to the e_g value in |
| * order to re-use e_g to represent the e_g+3 state. This means it is |
| * sufficient to represent e_g using only the values 0, 1 or 2. Every time a |
| * thread re-visits an e_g (which can be determined with a non-empty deferral |
| * list) it can assume objects in the e_g deferral list involved at least |
| * three e_g transitions and are thus safe for physical deletion. |
| * |
| * Blocking semantics for epoch reclamation have additional restrictions. |
| * Though we only require three deferral lists, reasonable blocking semantics |
| * must be able to handle bursty write work-loads more gracefully, since such |
| * work-loads could easily cause e_g wrap-around if modulo-3 arithmetic is |
| * used, which leads to easy-to-trigger live-lock situations. The work-around |
| * is to not apply modulo arithmetic to e_g itself but only to deferral list |
| * indexing. |
| */ |
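| |
| /* |
| * For illustration only: a minimal usage sketch of how the interface built on |
| * this scheme is typically driven. The object type (struct node), its entry |
| * member, node_destroy() and node_container() are hypothetical; ck_epoch_begin, |
| * ck_epoch_end, ck_epoch_call and CK_EPOCH_CONTAINER are assumed to match |
| * their declarations in ck_epoch.h. |
| * |
| *   static ck_epoch_t epoch; |
| *   static ck_epoch_record_t record; |
| * |
| *   struct node { |
| *       int value; |
| *       ck_epoch_entry_t entry; |
| *   }; |
| *   CK_EPOCH_CONTAINER(struct node, entry, node_container) |
| * |
| *   static void |
| *   node_destroy(ck_epoch_entry_t *e) |
| *   { |
| *       free(node_container(e)); |
| *   } |
| * |
| * Setup (once globally, then once per participating thread): |
| *   ck_epoch_init(&epoch); |
| *   ck_epoch_register(&epoch, &record, NULL); |
| * |
| * Read-side critical section: |
| *   ck_epoch_begin(&record, NULL); |
| *   ... dereference shared objects ... |
| *   ck_epoch_end(&record, NULL); |
| * |
| * Write-side deferral after logically deleting a node n: |
| *   ck_epoch_call(&record, &n->entry, node_destroy); |
| *   ck_epoch_poll(&record); |
| */ |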
| #define CK_EPOCH_GRACE 3U |
| |
| enum { |
| CK_EPOCH_STATE_USED = 0, |
| CK_EPOCH_STATE_FREE = 1 |
| }; |
| |
| CK_STACK_CONTAINER(struct ck_epoch_record, record_next, |
| ck_epoch_record_container) |
| CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, |
| ck_epoch_entry_container) |
| |
| #define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1) |
| |
| bool |
| _ck_epoch_delref(struct ck_epoch_record *record, |
| struct ck_epoch_section *section) |
| { |
| struct ck_epoch_ref *current, *other; |
| unsigned int i = section->bucket; |
| |
| current = &record->local.bucket[i]; |
| current->count--; |
| |
| if (current->count > 0) |
| return false; |
| |
| /* |
| * If the current bucket no longer has any references, then |
| * determine whether we have already transitioned into a newer |
| * epoch. If so, then make sure to update our shared snapshot |
| * to allow for forward progress. |
| * |
| * If no other active bucket exists, then the record will go |
| * inactive in order to allow for forward progress. |
| */ |
| other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK]; |
| if (other->count > 0 && |
| ((int)(current->epoch - other->epoch) < 0)) { |
| /* |
| * The other epoch value is actually the newest, |
| * transition to it. |
| */ |
| ck_pr_store_uint(&record->epoch, other->epoch); |
| } |
| |
| return true; |
| } |
| |
| void |
| _ck_epoch_addref(struct ck_epoch_record *record, |
| struct ck_epoch_section *section) |
| { |
| struct ck_epoch *global = record->global; |
| struct ck_epoch_ref *ref; |
| unsigned int epoch, i; |
| |
| epoch = ck_pr_load_uint(&global->epoch); |
| i = epoch & CK_EPOCH_SENSE_MASK; |
| ref = &record->local.bucket[i]; |
| |
| if (ref->count++ == 0) { |
| #ifndef CK_MD_TSO |
| struct ck_epoch_ref *previous; |
| |
| /* |
| * The system has already ticked. If another non-zero bucket |
| * exists, make sure to order our observations with respect |
| * to it. Otherwise, it is possible to acquire a reference |
| * from the previous epoch generation. |
| * |
| * On TSO architectures, the monotonicity of the global counter |
| * and load-{store, load} ordering are sufficient to guarantee |
| * this ordering. |
| */ |
| previous = &record->local.bucket[(i + 1) & |
| CK_EPOCH_SENSE_MASK]; |
| if (previous->count > 0) |
| ck_pr_fence_acqrel(); |
| #endif /* !CK_MD_TSO */ |
| |
| /* |
| * If this is a new reference into the current |
| * bucket, then cache the associated epoch value. |
| */ |
| ref->epoch = epoch; |
| } |
| |
| section->bucket = i; |
| return; |
| } |
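| |
| /* |
| * Illustration only, assuming CK_EPOCH_SENSE is 2: the two sense buckets let a |
| * record keep holding references across an epoch tick when sections overlap. |
| * |
| *   ck_epoch_section_t a, b; |
| * |
| *   ck_epoch_begin(&record, &a);  e_g = 2: bucket 0 caches epoch 2 |
| *   ... e_g ticks to 3 ... |
| *   ck_epoch_begin(&record, &b);  bucket 1 caches epoch 3 |
| *   ck_epoch_end(&record, &a);    bucket 0 drains; record->epoch becomes 3 |
| *   ck_epoch_end(&record, &b);    no bucket is held; the record may go idle |
| */ |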
| |
| void |
| ck_epoch_init(struct ck_epoch *global) |
| { |
| |
| ck_stack_init(&global->records); |
| global->epoch = 1; |
| global->n_free = 0; |
| ck_pr_fence_store(); |
| return; |
| } |
| |
| struct ck_epoch_record * |
| ck_epoch_recycle(struct ck_epoch *global, void *ct) |
| { |
| struct ck_epoch_record *record; |
| ck_stack_entry_t *cursor; |
| unsigned int state; |
| |
| if (ck_pr_load_uint(&global->n_free) == 0) |
| return NULL; |
| |
| CK_STACK_FOREACH(&global->records, cursor) { |
| record = ck_epoch_record_container(cursor); |
| |
| if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) { |
| /* Serialize with respect to deferral list clean-up. */ |
| ck_pr_fence_load(); |
| state = ck_pr_fas_uint(&record->state, |
| CK_EPOCH_STATE_USED); |
| if (state == CK_EPOCH_STATE_FREE) { |
| ck_pr_dec_uint(&global->n_free); |
| ck_pr_store_ptr(&record->ct, ct); |
| |
| /* |
| * The context pointer is ordered by a |
| * subsequent protected section. |
| */ |
| return record; |
| } |
| } |
| } |
| |
| return NULL; |
| } |
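| |
| /* |
| * A common caller-side pattern (illustrative only; the allocation strategy is |
| * the caller's choice) is to attempt recycling a free record before allocating |
| * and registering a fresh one: |
| * |
| *   record = ck_epoch_recycle(&epoch, NULL); |
| *   if (record == NULL) { |
| *       record = malloc(sizeof *record); |
| *       if (record == NULL) |
| *           abort(); |
| * |
| *       ck_epoch_register(&epoch, record, NULL); |
| *   } |
| */ |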
| |
| void |
| ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record, |
| void *ct) |
| { |
| size_t i; |
| |
| record->global = global; |
| record->state = CK_EPOCH_STATE_USED; |
| record->active = 0; |
| record->epoch = 0; |
| record->n_dispatch = 0; |
| record->n_peak = 0; |
| record->n_pending = 0; |
| record->ct = ct; |
| memset(&record->local, 0, sizeof record->local); |
| |
| for (i = 0; i < CK_EPOCH_LENGTH; i++) |
| ck_stack_init(&record->pending[i]); |
| |
| ck_pr_fence_store(); |
| ck_stack_push_upmc(&global->records, &record->record_next); |
| return; |
| } |
| |
| void |
| ck_epoch_unregister(struct ck_epoch_record *record) |
| { |
| struct ck_epoch *global = record->global; |
| size_t i; |
| |
| record->active = 0; |
| record->epoch = 0; |
| record->n_dispatch = 0; |
| record->n_peak = 0; |
| record->n_pending = 0; |
| memset(&record->local, 0, sizeof record->local); |
| |
| for (i = 0; i < CK_EPOCH_LENGTH; i++) |
| ck_stack_init(&record->pending[i]); |
| |
| ck_pr_store_ptr(&record->ct, NULL); |
| ck_pr_fence_store(); |
| ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE); |
| ck_pr_inc_uint(&global->n_free); |
| return; |
| } |
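| |
| /* |
| * Illustrative teardown sequence (not mandated by this file): unregistration |
| * resets the deferral lists without dispatching them, so a caller with pending |
| * callbacks will usually want to flush them first, for example: |
| * |
| *   ck_epoch_barrier(&record);     wait for a grace period and dispatch |
| *   ck_epoch_unregister(&record);  the record becomes available for recycling |
| */ |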
| |
| static struct ck_epoch_record * |
| ck_epoch_scan(struct ck_epoch *global, |
| struct ck_epoch_record *cr, |
| unsigned int epoch, |
| bool *af) |
| { |
| ck_stack_entry_t *cursor; |
| |
| if (cr == NULL) { |
| cursor = CK_STACK_FIRST(&global->records); |
| *af = false; |
| } else { |
| cursor = &cr->record_next; |
| *af = true; |
| } |
| |
| while (cursor != NULL) { |
| unsigned int state, active; |
| |
| cr = ck_epoch_record_container(cursor); |
| |
| state = ck_pr_load_uint(&cr->state); |
| if (state & CK_EPOCH_STATE_FREE) { |
| cursor = CK_STACK_NEXT(cursor); |
| continue; |
| } |
| |
| active = ck_pr_load_uint(&cr->active); |
| *af |= active; |
| |
| if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch) |
| return cr; |
| |
| cursor = CK_STACK_NEXT(cursor); |
| } |
| |
| return NULL; |
| } |
| |
| static void |
| ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e) |
| { |
| unsigned int epoch = e & (CK_EPOCH_LENGTH - 1); |
| ck_stack_entry_t *head, *next, *cursor; |
| unsigned int n_pending, n_peak; |
| unsigned int i = 0; |
| |
| head = ck_stack_batch_pop_upmc(&record->pending[epoch]); |
| for (cursor = head; cursor != NULL; cursor = next) { |
| struct ck_epoch_entry *entry = |
| ck_epoch_entry_container(cursor); |
| |
| next = CK_STACK_NEXT(cursor); |
| entry->function(entry); |
| i++; |
| } |
| |
| n_peak = ck_pr_load_uint(&record->n_peak); |
| n_pending = ck_pr_load_uint(&record->n_pending); |
| |
| /* We don't require accuracy around peak calculation. */ |
| if (n_pending > n_peak) |
| ck_pr_store_uint(&record->n_peak, n_pending); |
| |
| if (i > 0) { |
| ck_pr_add_uint(&record->n_dispatch, i); |
| ck_pr_sub_uint(&record->n_pending, i); |
| } |
| |
| return; |
| } |
| |
| /* |
| * Reclaim all objects associated with a record. |
| */ |
| void |
| ck_epoch_reclaim(struct ck_epoch_record *record) |
| { |
| unsigned int epoch; |
| |
| for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++) |
| ck_epoch_dispatch(record, epoch); |
| |
| return; |
| } |
| |
| CK_CC_FORCE_INLINE static void |
| epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr, |
| ck_epoch_wait_cb_t *cb, void *ct) |
| { |
| |
| if (cb != NULL) |
| cb(global, cr, ct); |
| |
| return; |
| } |
| |
| /* |
| * This function must not be called from within a read-side section. |
| */ |
| void |
| ck_epoch_synchronize_wait(struct ck_epoch *global, |
| ck_epoch_wait_cb_t *cb, void *ct) |
| { |
| struct ck_epoch_record *cr; |
| unsigned int delta, epoch, goal, i; |
| bool active; |
| |
| ck_pr_fence_memory(); |
| |
| /* |
| * The observation of the global epoch must be ordered with respect to |
| * all prior operations. The re-ordering of loads is permitted given |
| * monotonicity of the global epoch counter. |
| * |
| * If UINT_MAX concurrent mutations were to occur then it is possible |
| * to encounter an ABA issue. If this is a concern, consider tuning |
| * write-side concurrency. |
| */ |
| delta = epoch = ck_pr_load_uint(&global->epoch); |
| goal = epoch + CK_EPOCH_GRACE; |
| |
| for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) { |
| bool r; |
| |
| /* |
| * Determine whether all threads have observed the current |
| * epoch with respect to the updates on invocation. |
| */ |
| while (cr = ck_epoch_scan(global, cr, delta, &active), |
| cr != NULL) { |
| unsigned int e_d; |
| |
| ck_pr_stall(); |
| |
| /* |
| * Another writer may have already observed a grace |
| * period. |
| */ |
| e_d = ck_pr_load_uint(&global->epoch); |
| if (e_d == delta) { |
| epoch_block(global, cr, cb, ct); |
| continue; |
| } |
| |
| /* |
| * If the epoch has been updated, we may have already |
| * met our goal. |
| */ |
| delta = e_d; |
| if ((goal > epoch) & (delta >= goal)) |
| goto leave; |
| |
| epoch_block(global, cr, cb, ct); |
| |
| /* |
| * If the epoch has been updated, then a grace period |
| * requires that all threads are observed idle at the |
| * same epoch. |
| */ |
| cr = NULL; |
| } |
| |
| /* |
| * If we have observed all threads as inactive, then we assume |
| * we are at a grace period. |
| */ |
| if (active == false) |
| break; |
| |
| /* |
| * Increment current epoch. CAS semantics are used to eliminate |
| * increment operations for synchronization that occurs for the |
| * same global epoch value snapshot. |
| * |
| * If we can guarantee there will only be one active barrier or |
| * epoch tick at a given time, then it is sufficient to use an |
| * increment operation. In a multi-barrier workload, however, |
| * it is possible to overflow the epoch value if we apply |
| * modulo-3 arithmetic. |
| */ |
| r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1, |
| &delta); |
| |
| /* Order subsequent thread active checks. */ |
| ck_pr_fence_atomic_load(); |
| |
| /* |
| * If the CAS has succeeded, then set delta to the latest snapshot. |
| * Otherwise, we have just acquired the latest snapshot. |
| */ |
| delta = delta + r; |
| } |
| |
| /* |
| * A majority of use-cases will not require full barrier semantics. |
| * However, if non-temporal instructions are used, full barrier |
| * semantics are necessary. |
| */ |
| leave: |
| ck_pr_fence_memory(); |
| return; |
| } |
| |
| void |
| ck_epoch_synchronize(struct ck_epoch_record *record) |
| { |
| |
| ck_epoch_synchronize_wait(record->global, NULL, NULL); |
| return; |
| } |
| |
| void |
| ck_epoch_barrier(struct ck_epoch_record *record) |
| { |
| |
| ck_epoch_synchronize(record); |
| ck_epoch_reclaim(record); |
| return; |
| } |
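| |
| /* |
| * Illustration of the distinction between the two blocking primitives: |
| * ck_epoch_synchronize() only waits for a grace period, so it pairs with |
| * destroying an unlinked object directly, while ck_epoch_barrier() additionally |
| * dispatches this record's deferral lists. The container_remove() helper below |
| * is hypothetical. |
| * |
| *   container_remove(&container, node);  logical deletion |
| *   ck_epoch_synchronize(&record);       wait for readers to drain |
| *   free(node);                          now safe to destroy directly |
| */ |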
| |
| void |
| ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb, |
| void *ct) |
| { |
| |
| ck_epoch_synchronize_wait(record->global, cb, ct); |
| ck_epoch_reclaim(record); |
| return; |
| } |
| |
| /* |
| * It may be worth applying these deferral semantics to an epoch |
| * that was observed at ck_epoch_call time. The problem is that the latter |
| * would require a full fence. |
| * |
| * ck_epoch_call will dispatch to the latest epoch snapshot that was observed. |
| * There are cases where it will fail to reclaim as early as it could. If this |
| * becomes a problem, we could actually use a heap for epoch buckets but that |
| * is far from ideal too. |
| */ |
| bool |
| ck_epoch_poll(struct ck_epoch_record *record) |
| { |
| bool active; |
| unsigned int epoch; |
| struct ck_epoch_record *cr = NULL; |
| struct ck_epoch *global = record->global; |
| |
| epoch = ck_pr_load_uint(&global->epoch); |
| |
| /* Serialize epoch snapshots with respect to global epoch. */ |
| ck_pr_fence_memory(); |
| cr = ck_epoch_scan(global, cr, epoch, &active); |
| if (cr != NULL) { |
| record->epoch = epoch; |
| return false; |
| } |
| |
| /* We are at a grace period if all threads are inactive. */ |
| if (active == false) { |
| record->epoch = epoch; |
| for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++) |
| ck_epoch_dispatch(record, epoch); |
| |
| return true; |
| } |
| |
| /* If an active thread exists, rely on epoch observation. */ |
| (void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1); |
| |
| ck_epoch_dispatch(record, epoch + 1); |
| return true; |
| } |
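| |
| /* |
| * Illustration only: a write-mostly thread can amortize reclamation by |
| * deferring destruction and polling opportunistically rather than blocking on |
| * a grace period. node_destroy() is the hypothetical callback from the usage |
| * sketch near the top of this file. |
| * |
| *   ck_epoch_call(&record, &node->entry, node_destroy); |
| *   if (ck_epoch_poll(&record) == false) { |
| *       A record is still active in an older epoch; the deferral stays |
| *       pending and will be dispatched by a later poll or barrier. |
| *   } |
| */ |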