xref: /freebsd/sys/contrib/ck/include/ck_ec.h (revision 4f5890a0fb086324a657f3cd7ba1abc57274e0db)
1 /*
2  * Copyright 2018 Paul Khuong, Google LLC.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28  * Overview
29  * ========
30  *
31  * ck_ec implements 32- and 64- bit event counts. Event counts let us
32  * easily integrate OS-level blocking (e.g., futexes) in lock-free
33  * protocols. Waiters block conditionally, if the event count's value
34  * is still equal to some old value.
35  *
36  * Event counts come in four variants: 32 and 64 bit (with one bit
37  * stolen for internal signaling, so 31 and 63 bit counters), and
38  * single or multiple producers (wakers). Waiters are always multiple
39  * consumers. The 32 bit variants are smaller, and more efficient,
40  * especially in single producer mode. The 64 bit variants are larger,
41  * but practically invulnerable to ABA.
42  *
43  * The 32 bit variant is always available. The 64 bit variant is only
44  * available if CK supports 64-bit atomic operations. Currently,
45  * specialization for single producer is only implemented for x86 and
46  * x86-64, on compilers that support GCC extended inline assembly;
47  * other platforms fall back to the multiple producer code path.
48  *
49  * A typical usage pattern is:
50  *
51  *  1. On the producer side:
52  *
53  *    - Make changes to some shared data structure, without involving
54  *	the event count at all.
55  *    - After each change, call ck_ec_inc on the event count. The call
56  *	acts as a write-write barrier, and wakes up any consumer blocked
57  *	on the event count (waiting for new changes).
58  *
59  *  2. On the consumer side:
60  *
61  *    - Snapshot ck_ec_value of the event count. The call acts as a
62  *	read barrier.
63  *    - Read and process the shared data structure.
64  *    - Wait for new changes by calling ck_ec_wait with the snapshot value.
65  *
66  * Some data structures may opt for tighter integration with their
67  * event count. For example, an SPMC ring buffer or disruptor might
68  * use the event count's value as the write pointer. If the buffer is
69  * regularly full, it might also make sense to store the read pointer
70  * in an MP event count.
71  *
72  * This event count implementation supports tighter integration in two
73  * ways.
74  *
75  * Producers may opt to increment by an arbitrary value (less than
76  * INT32_MAX / INT64_MAX), in order to encode, e.g., byte
77  * offsets. Larger increment values make wraparound more likely, so
78  * the increments should still be relatively small.
79  *
80  * Consumers may pass a predicate to ck_ec_wait_pred. This predicate
81  * can make `ck_ec_wait_pred` return early, before the event count's
82  * value changes, and can override the deadline passed to futex_wait.
83  * This lets consumer block on one eventcount, while optimistically
84  * looking at other waking conditions.
85  *
86  * API Reference
87  * =============
88  *
89  * When compiled as C11 or later, this header defines type-generic
90  * macros for ck_ec32 and ck_ec64; the reference describes this
91  * type-generic API.
92  *
93  * ck_ec needs additional OS primitives to determine the current time,
94  * to wait on an address, and to wake all threads waiting on a given
95  * address. These are defined with fields in a struct ck_ec_ops.  Each
96  * ck_ec_ops may additionally define the number of spin loop
97  * iterations in the slow path, as well as the initial wait time in
98  * the internal exponential backoff, the exponential scale factor, and
99  * the right shift count (< 32).
100  *
101  * The ops, in addition to the single/multiple producer flag, are
102  * encapsulated in a struct ck_ec_mode, passed to most ck_ec
103  * operations.
104  *
105  * ec is a struct ck_ec32 *, or a struct ck_ec64 *.
106  *
107  * value is an uint32_t for ck_ec32, and an uint64_t for ck_ec64. It
108  * never exceeds INT32_MAX and INT64_MAX respectively.
109  *
110  * mode is a struct ck_ec_mode *.
111  *
112  * deadline is either NULL, or a `const struct timespec *` that will
113  * be treated as an absolute deadline.
114  *
115  * `void ck_ec_init(ec, value)`: initializes the event count to value.
116  *
117  * `value ck_ec_value(ec)`: returns the current value of the event
118  *  counter.  This read acts as a read (acquire) barrier.
119  *
120  * `bool ck_ec_has_waiters(ec)`: returns whether some thread has
121  *  marked the event count as requiring an OS wakeup.
122  *
123  * `void ck_ec_inc(ec, mode)`: increments the value of the event
124  *  counter by one. This writes acts as a write barrier. Wakes up
125  *  any waiting thread.
126  *
127  * `value ck_ec_add(ec, mode, value)`: increments the event counter by
128  *  `value`, and returns the event counter's previous value. This
129  *  write acts as a write barrier. Wakes up any waiting thread.
130  *
131  * `int ck_ec_deadline(struct timespec *new_deadline,
132  *		       mode,
133  *		       const struct timespec *timeout)`:
134  *  computes a deadline `timeout` away from the current time. If
135  *  timeout is NULL, computes a deadline in the infinite future. The
136  *  resulting deadline is written to `new_deadline`. Returns 0 on
137  *  success, and -1 if ops->gettime failed (without touching errno).
138  *
139  * `int ck_ec_wait(ec, mode, value, deadline)`: waits until the event
140  *  counter's value differs from `value`, or, if `deadline` is
141  *  provided and non-NULL, until the current time is after that
142  *  deadline. Use a deadline with tv_sec = 0 for a non-blocking
143  *  execution. Returns 0 if the event counter has changed, and -1 on
144  *  timeout. This function acts as a read (acquire) barrier.
145  *
146  * `int ck_ec_wait_pred(ec, mode, value, pred, data, deadline)`: waits
147  * until the event counter's value differs from `value`, or until
148  * `pred` returns non-zero, or, if `deadline` is provided and
149  * non-NULL, until the current time is after that deadline. Use a
150  * deadline with tv_sec = 0 for a non-blocking execution. Returns 0 if
151  * the event counter has changed, `pred`'s return value if non-zero,
152  * and -1 on timeout. This function acts as a read (acquire) barrier.
153  *
154  * `pred` is always called as `pred(data, iteration_deadline, now)`,
155  * where `iteration_deadline` is a timespec of the deadline for this
156  * exponential backoff iteration, and `now` is the current time. If
157  * `pred` returns a non-zero value, that value is immediately returned
158  * to the waiter. Otherwise, `pred` is free to modify
159  * `iteration_deadline` (moving it further in the future is a bad
160  * idea).
161  *
162  * Implementation notes
163  * ====================
164  *
165  * The multiple producer implementation is a regular locked event
166  * count, with a single flag bit to denote the need to wake up waiting
167  * threads.
168  *
169  * The single producer specialization is heavily tied to
170  * [x86-TSO](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf), and
171  * to non-atomic read-modify-write instructions (e.g., `inc mem`);
172  * these non-atomic RMW let us write to the same memory locations with
173  * atomic and non-atomic instructions, without suffering from process
174  * scheduling stalls.
175  *
176  * The reason we can mix atomic and non-atomic writes to the `counter`
177  * word is that every non-atomic write obviates the need for the
178  * atomically flipped flag bit: we only use non-atomic writes to
179  * update the event count, and the atomic flag only informs the
180  * producer that we would like a futex_wake, because of the update.
181  * We only require the non-atomic RMW counter update to prevent
182  * preemption from introducing arbitrarily long worst case delays.
183  *
184  * Correctness does not rely on the usual ordering argument: in the
185  * absence of fences, there is no strict ordering between atomic and
186  * non-atomic writes. The key is instead x86-TSO's guarantee that a
187  * read is satisfied from the most recent buffered write in the local
188  * store queue if there is one, or from memory if there is no write to
189  * that address in the store queue.
190  *
191  * x86-TSO's constraint on reads suffices to guarantee that the
192  * producer will never forget about a counter update. If the last
193  * update is still queued, the new update will be based on the queued
194  * value. Otherwise, the new update will be based on the value in
195  * memory, which may or may not have had its flag flipped. In either
196  * case, the value of the counter (modulo flag) is correct.
197  *
198  * When the producer forwards the counter's value from its store
199  * queue, the new update might not preserve a flag flip. Any waiter
200  * thus has to check from time to time to determine if it wasn't
201  * woken up because the flag bit was silently cleared.
202  *
203  * In reality, the store queue in x86-TSO stands for in-flight
204  * instructions in the chip's out-of-order backend. In the vast
205  * majority of cases, instructions will only remain in flight for a
206  * few hundred or thousand of cycles. That's why ck_ec_wait spins on
207  * the `counter` word for ~100 iterations after flipping its flag bit:
208  * if the counter hasn't changed after that many iterations, it is
209  * very likely that the producer's next counter update will observe
210  * the flag flip.
211  *
212  * That's still not a hard guarantee of correctness. Conservatively,
213  * we can expect that no instruction will remain in flight for more
214  * than 1 second... if only because some interrupt will have forced
215  * the chip to store its architectural state in memory, at which point
216  * an instruction is either fully retired or rolled back. Interrupts,
217  * particularly the pre-emption timer, are why single-producer updates
218  * must happen in a single non-atomic read-modify-write instruction.
219  * Having a single instruction as the critical section means we only
220  * have to consider the worst-case execution time for that
221  * instruction. That's easier than doing the same for a pair of
222  * instructions, which an unlucky pre-emption could delay for
223  * arbitrarily long.
224  *
225  * Thus, after a short spin loop, ck_ec_wait enters an exponential
226  * backoff loop, where each "sleep" is instead a futex_wait.  The
227  * backoff is only necessary to handle rare cases where the flag flip
228  * was overwritten after the spin loop. Eventually, more than one
229  * second will have elapsed since the flag flip, and the sleep timeout
230  * becomes infinite: since the flag bit has been set for much longer
231  * than the time for which an instruction may remain in flight, the
232  * flag will definitely be observed at the next counter update.
233  *
234  * The 64 bit ck_ec_wait pulls another trick: futexes only handle 32
235  * bit ints, so we must treat the 64 bit counter's low 32 bits as an
236  * int in futex_wait. That's a bit dodgy, but fine in practice, given
237  * that the OS's futex code will always read whatever value is
238  * currently in memory: even if the producer thread were to wait on
239  * its own event count, the syscall and ring transition would empty
240  * the store queue (the out-of-order execution backend).
241  *
242  * Finally, what happens when the producer is migrated to another core
243  * or otherwise pre-empted? Migration must already incur a barrier, so
244  * that thread always sees its own writes, so that's safe. As for
245  * pre-emption, that requires storing the architectural state, which
246  * means every instruction must either be executed fully or not at
247  * all when pre-emption happens.
248  */
249 
250 #ifndef CK_EC_H
251 #define CK_EC_H
252 #include <ck_cc.h>
253 #include <ck_pr.h>
254 #include <ck_stdbool.h>
255 #include <ck_stdint.h>
256 #include <ck_stddef.h>
257 #include <sys/time.h>
258 
259 /*
260  * If we have ck_pr_faa_64 (and, presumably, ck_pr_load_64), we
261  * support 63 bit counters.
262  */
263 #ifdef CK_F_PR_FAA_64
264 #define CK_F_EC64
265 #endif /* CK_F_PR_FAA_64 */
266 
267 /*
268  * GCC inline assembly lets us exploit non-atomic read-modify-write
269  * instructions on x86/x86_64 for a fast single-producer mode.
270  *
271  * If we CK_F_EC_SP is not defined, CK_EC always uses the slower
272  * multiple producer code.
273  */
274 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
275 #define CK_F_EC_SP
276 #endif /* GNUC && (__i386__ || __x86_64__) */
277 
278 struct ck_ec_ops;
279 
280 struct ck_ec_wait_state {
281 	struct timespec start;	/* Time when we entered ck_ec_wait. */
282 	struct timespec now;  /* Time now. */
283 	const struct ck_ec_ops *ops;
284 	void *data;  /* Opaque pointer for the predicate's internal state. */
285 
286 };
287 
288 /*
289  * ck_ec_ops define system-specific functions to get the current time,
290  * atomically wait on an address if it still has some expected value,
291  * and to wake all threads waiting on an address.
292  *
293  * Each platform is expected to have few (one) opaque pointer to a
294  * const ops struct, and reuse it for all ck_ec_mode structs.
295  */
296 struct ck_ec_ops {
297 	/* Populates out with the current time. Returns non-zero on failure. */
298 	int (*gettime)(const struct ck_ec_ops *, struct timespec *out);
299 
300 	/*
301 	 * Waits on address if its value is still `expected`.  If
302 	 * deadline is non-NULL, stops waiting once that deadline is
303 	 * reached. May return early for any reason.
304 	 */
305 	void (*wait32)(const struct ck_ec_wait_state *, const uint32_t *,
306 		       uint32_t expected, const struct timespec *deadline);
307 
308 	/*
309 	 * Same as wait32, but for a 64 bit counter. Only used if
310 	 * CK_F_EC64 is defined.
311 	 *
312 	 * If underlying blocking primitive only supports 32 bit
313 	 * control words, it should be safe to block on the least
314 	 * significant half of the 64 bit address.
315 	 */
316 	void (*wait64)(const struct ck_ec_wait_state *, const uint64_t *,
317 		       uint64_t expected, const struct timespec *deadline);
318 
319 	/* Wakes up all threads waiting on address. */
320 	void (*wake32)(const struct ck_ec_ops *, const uint32_t *address);
321 
322 	/*
323 	 * Same as wake32, but for a 64 bit counter. Only used if
324 	 * CK_F_EC64 is defined.
325 	 *
326 	 * When wait64 truncates the control word at address to `only`
327 	 * consider its least significant half, wake64 should perform
328 	 * any necessary fixup (e.g., on big endian platforms).
329 	 */
330 	void (*wake64)(const struct ck_ec_ops *, const uint64_t *address);
331 
332 	/*
333 	 * Number of iterations for the initial busy wait. 0 defaults
334 	 * to 100 (not ABI stable).
335 	 */
336 	uint32_t busy_loop_iter;
337 
338 	/*
339 	 * Delay in nanoseconds for the first iteration of the
340 	 * exponential backoff. 0 defaults to 2 ms (not ABI stable).
341 	 */
342 	uint32_t initial_wait_ns;
343 
344 	/*
345 	 * Scale factor for the exponential backoff. 0 defaults to 8x
346 	 * (not ABI stable).
347 	 */
348 	uint32_t wait_scale_factor;
349 
350 	/*
351 	 * Right shift count for the exponential backoff. The update
352 	 * after each iteration is
353 	 *     wait_ns = (wait_ns * wait_scale_factor) >> wait_shift_count,
354 	 * until one second has elapsed. After that, the deadline goes
355 	 * to infinity.
356 	 */
357 	uint32_t wait_shift_count;
358 };
359 
360 /*
361  * ck_ec_mode wraps the ops table, and informs the fast path whether
362  * it should attempt to specialize for single producer mode.
363  *
364  * mode structs are expected to be exposed by value, e.g.,
365  *
366  *    extern const struct ck_ec_ops system_ec_ops;
367  *
368  *    static const struct ck_ec_mode ec_sp = {
369  *	  .ops = &system_ec_ops,
370  *	  .single_producer = true
371  *    };
372  *
373  *    static const struct ck_ec_mode ec_mp = {
374  *	  .ops = &system_ec_ops,
375  *	  .single_producer = false
376  *    };
377  *
378  * ck_ec_mode structs are only passed to inline functions defined in
379  * this header, and never escape to their slow paths, so they should
380  * not result in any object file size increase.
381  */
382 struct ck_ec_mode {
383 	const struct ck_ec_ops *ops;
384 	/*
385 	 * If single_producer is true, the event count has a unique
386 	 * incrementer. The implementation will specialize ck_ec_inc
387 	 * and ck_ec_add if possible (if CK_F_EC_SP is defined).
388 	 */
389 	bool single_producer;
390 };
391 
392 struct ck_ec32 {
393 	/* Flag is "sign" bit, value in bits 0:30. */
394 	uint32_t counter;
395 };
396 
397 typedef struct ck_ec32 ck_ec32_t;
398 
399 #ifdef CK_F_EC64
400 struct ck_ec64 {
401 	/*
402 	 * Flag is bottom bit, value in bits 1:63. Eventcount only
403 	 * works on x86-64 (i.e., little endian), so the futex int
404 	 * lies in the first 4 (bottom) bytes.
405 	 */
406 	uint64_t counter;
407 };
408 
409 typedef struct ck_ec64 ck_ec64_t;
410 #endif /* CK_F_EC64 */
411 
412 #define CK_EC_INITIALIZER { .counter = 0 }
413 
414 /*
415  * Initializes the event count to `value`. The value must not
416  * exceed INT32_MAX.
417  */
418 static void ck_ec32_init(struct ck_ec32 *ec, uint32_t value);
419 
420 #ifndef CK_F_EC64
421 #define ck_ec_init ck_ec32_init
422 #else
423 /*
424  * Initializes the event count to `value`. The value must not
425  * exceed INT64_MAX.
426  */
427 static void ck_ec64_init(struct ck_ec64 *ec, uint64_t value);
428 
429 #if __STDC_VERSION__ >= 201112L
430 #define ck_ec_init(EC, VALUE)				\
431 	(_Generic(*(EC),				\
432 		  struct ck_ec32 : ck_ec32_init,	\
433 		  struct ck_ec64 : ck_ec64_init)((EC), (VALUE)))
434 #endif /* __STDC_VERSION__ */
435 #endif /* CK_F_EC64 */
436 
437 /*
438  * Returns the counter value in the event count. The value is at most
439  * INT32_MAX.
440  */
441 static uint32_t ck_ec32_value(const struct ck_ec32* ec);
442 
443 #ifndef CK_F_EC64
444 #define ck_ec_value ck_ec32_value
445 #else
446 /*
447  * Returns the counter value in the event count. The value is at most
448  * INT64_MAX.
449  */
450 static uint64_t ck_ec64_value(const struct ck_ec64* ec);
451 
452 #if __STDC_VERSION__ >= 201112L
453 #define ck_ec_value(EC)					\
454 	(_Generic(*(EC),				\
455 		  struct ck_ec32 : ck_ec32_value,	\
456 		struct ck_ec64 : ck_ec64_value)((EC)))
457 #endif /* __STDC_VERSION__ */
458 #endif /* CK_F_EC64 */
459 
460 /*
461  * Returns whether there may be slow pathed waiters that need an
462  * explicit OS wakeup for this event count.
463  */
464 static bool ck_ec32_has_waiters(const struct ck_ec32 *ec);
465 
466 #ifndef CK_F_EC64
467 #define ck_ec_has_waiters ck_ec32_has_waiters
468 #else
469 static bool ck_ec64_has_waiters(const struct ck_ec64 *ec);
470 
471 #if __STDC_VERSION__ >= 201112L
472 #define ck_ec_has_waiters(EC)				      \
473 	(_Generic(*(EC),				      \
474 		  struct ck_ec32 : ck_ec32_has_waiters,	      \
475 		  struct ck_ec64 : ck_ec64_has_waiters)((EC)))
476 #endif /* __STDC_VERSION__ */
477 #endif /* CK_F_EC64 */
478 
479 /*
480  * Increments the counter value in the event count by one, and wakes
481  * up any waiter.
482  */
483 static void ck_ec32_inc(struct ck_ec32 *ec, const struct ck_ec_mode *mode);
484 
485 #ifndef CK_F_EC64
486 #define ck_ec_inc ck_ec32_inc
487 #else
488 static void ck_ec64_inc(struct ck_ec64 *ec, const struct ck_ec_mode *mode);
489 
490 #if __STDC_VERSION__ >= 201112L
491 #define ck_ec_inc(EC, MODE)					\
492 	(_Generic(*(EC),					\
493 		  struct ck_ec32 : ck_ec32_inc,			\
494 		  struct ck_ec64 : ck_ec64_inc)((EC), (MODE)))
495 #endif /* __STDC_VERSION__ */
496 #endif /* CK_F_EC64 */
497 
498 /*
499  * Increments the counter value in the event count by delta, wakes
500  * up any waiter, and returns the previous counter value.
501  */
502 static uint32_t ck_ec32_add(struct ck_ec32 *ec,
503 			    const struct ck_ec_mode *mode,
504 			    uint32_t delta);
505 
506 #ifndef CK_F_EC64
507 #define ck_ec_add ck_ec32_add
508 #else
509 static uint64_t ck_ec64_add(struct ck_ec64 *ec,
510 			    const struct ck_ec_mode *mode,
511 			    uint64_t delta);
512 
513 #if __STDC_VERSION__ >= 201112L
514 #define ck_ec_add(EC, MODE, DELTA)					\
515 	(_Generic(*(EC),						\
516 		  struct ck_ec32 : ck_ec32_add,				\
517 		  struct ck_ec64 : ck_ec64_add)((EC), (MODE), (DELTA)))
518 #endif /* __STDC_VERSION__ */
519 #endif /* CK_F_EC64 */
520 
521 /*
522  * Populates `new_deadline` with a deadline `timeout` in the future.
523  * Returns 0 on success, and -1 if clock_gettime failed, in which
524  * case errno is left as is.
525  */
526 static int ck_ec_deadline(struct timespec *new_deadline,
527 			  const struct ck_ec_mode *mode,
528 			  const struct timespec *timeout);
529 
530 /*
531  * Waits until the counter value in the event count differs from
532  * old_value, or, if deadline is non-NULL, until CLOCK_MONOTONIC is
533  * past the deadline.
534  *
535  * Returns 0 on success, and -1 on timeout.
536  */
537 static int ck_ec32_wait(struct ck_ec32 *ec,
538 			const struct ck_ec_mode *mode,
539 			uint32_t old_value,
540 			const struct timespec *deadline);
541 
542 #ifndef CK_F_EC64
543 #define ck_ec_wait ck_ec32_wait
544 #else
545 static int ck_ec64_wait(struct ck_ec64 *ec,
546 			const struct ck_ec_mode *mode,
547 			uint64_t old_value,
548 			const struct timespec *deadline);
549 
550 #if __STDC_VERSION__ >= 201112L
551 #define ck_ec_wait(EC, MODE, OLD_VALUE, DEADLINE)			\
552 	(_Generic(*(EC),						\
553 		  struct ck_ec32 : ck_ec32_wait,			\
554 		  struct ck_ec64 : ck_ec64_wait)((EC), (MODE),		\
555 						 (OLD_VALUE), (DEADLINE)))
556 
557 #endif /* __STDC_VERSION__ */
558 #endif /* CK_F_EC64 */
559 
560 /*
561  * Waits until the counter value in the event count differs from
562  * old_value, pred returns non-zero, or, if deadline is non-NULL,
563  * until CLOCK_MONOTONIC is past the deadline.
564  *
565  * Returns 0 on success, -1 on timeout, and the return value of pred
566  * if it returns non-zero.
567  *
568  * A NULL pred represents a function that always returns 0.
569  */
570 static int ck_ec32_wait_pred(struct ck_ec32 *ec,
571 			     const struct ck_ec_mode *mode,
572 			     uint32_t old_value,
573 			     int (*pred)(const struct ck_ec_wait_state *,
574 					 struct timespec *deadline),
575 			     void *data,
576 			     const struct timespec *deadline);
577 
578 #ifndef CK_F_EC64
579 #define ck_ec_wait_pred ck_ec32_wait_pred
580 #else
581 static int ck_ec64_wait_pred(struct ck_ec64 *ec,
582 			     const struct ck_ec_mode *mode,
583 			     uint64_t old_value,
584 			     int (*pred)(const struct ck_ec_wait_state *,
585 					 struct timespec *deadline),
586 			     void *data,
587 			     const struct timespec *deadline);
588 
589 #if __STDC_VERSION__ >= 201112L
590 #define ck_ec_wait_pred(EC, MODE, OLD_VALUE, PRED, DATA, DEADLINE)	\
591 	(_Generic(*(EC),						\
592 		  struct ck_ec32 : ck_ec32_wait_pred,			\
593 		  struct ck_ec64 : ck_ec64_wait_pred)			\
594 	 ((EC), (MODE), (OLD_VALUE), (PRED), (DATA), (DEADLINE)))
595 #endif /* __STDC_VERSION__ */
596 #endif /* CK_F_EC64 */
597 
598 /*
599  * Inline implementation details. 32 bit first, then 64 bit
600  * conditionally.
601  */
602 CK_CC_FORCE_INLINE void ck_ec32_init(struct ck_ec32 *ec, uint32_t value)
603 {
604 	ec->counter = value & ~(1UL << 31);
605 	return;
606 }
607 
608 CK_CC_FORCE_INLINE uint32_t ck_ec32_value(const struct ck_ec32 *ec)
609 {
610 	uint32_t ret = ck_pr_load_32(&ec->counter) & ~(1UL << 31);
611 
612 	ck_pr_fence_acquire();
613 	return ret;
614 }
615 
616 CK_CC_FORCE_INLINE bool ck_ec32_has_waiters(const struct ck_ec32 *ec)
617 {
618 	return ck_pr_load_32(&ec->counter) & (1UL << 31);
619 }
620 
621 /* Slow path for ck_ec{32,64}_{inc,add} */
622 void ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops);
623 
624 CK_CC_FORCE_INLINE void ck_ec32_inc(struct ck_ec32 *ec,
625 				    const struct ck_ec_mode *mode)
626 {
627 #if !defined(CK_F_EC_SP)
628 	/* Nothing to specialize if we don't have EC_SP. */
629 	ck_ec32_add(ec, mode, 1);
630 	return;
631 #else
632 	char flagged;
633 
634 #if __GNUC__ >= 6
635 	/*
636 	 * We don't want to wake if the sign bit is 0. We do want to
637 	 * wake if the sign bit just flipped from 1 to 0. We don't
638 	 * care what happens when our increment caused the sign bit to
639 	 * flip from 0 to 1 (that's once per 2^31 increment).
640 	 *
641 	 * This leaves us with four cases:
642 	 *
643 	 *  old sign bit | new sign bit | SF | OF | ZF
644 	 *  -------------------------------------------
645 	 *	       0 |	      0 |  0 |	0 | ?
646 	 *	       0 |	      1 |  1 |	0 | ?
647 	 *	       1 |	      1 |  1 |	0 | ?
648 	 *	       1 |	      0 |  0 |	0 | 1
649 	 *
650 	 * In the first case, we don't want to hit ck_ec32_wake. In
651 	 * the last two cases, we do want to call ck_ec32_wake. In the
652 	 * second case, we don't care, so we arbitrarily choose to
653 	 * call ck_ec32_wake.
654 	 *
655 	 * The "le" condition checks if SF != OF, or ZF == 1, which
656 	 * meets our requirements.
657 	 */
658 #define CK_EC32_INC_ASM(PREFIX)					\
659 	__asm__ volatile(PREFIX " incl %0"		    \
660 			 : "+m"(ec->counter), "=@ccle"(flagged)	 \
661 			 :: "cc", "memory")
662 #else
663 #define CK_EC32_INC_ASM(PREFIX)						\
664 	__asm__ volatile(PREFIX " incl %0; setle %1"			\
665 			 : "+m"(ec->counter), "=r"(flagged)		\
666 			 :: "cc", "memory")
667 #endif /* __GNUC__ */
668 
669 	if (mode->single_producer == true) {
670 		ck_pr_fence_store();
671 		CK_EC32_INC_ASM("");
672 	} else {
673 		ck_pr_fence_store_atomic();
674 		CK_EC32_INC_ASM("lock");
675 	}
676 #undef CK_EC32_INC_ASM
677 
678 	if (CK_CC_UNLIKELY(flagged)) {
679 		ck_ec32_wake(ec, mode->ops);
680 	}
681 
682 	return;
683 #endif /* CK_F_EC_SP */
684 }
685 
686 CK_CC_FORCE_INLINE uint32_t ck_ec32_add_epilogue(struct ck_ec32 *ec,
687 						 const struct ck_ec_mode *mode,
688 						 uint32_t old)
689 {
690 	const uint32_t flag_mask = 1U << 31;
691 	uint32_t ret;
692 
693 	ret = old & ~flag_mask;
694 	/* These two only differ if the flag bit is set. */
695 	if (CK_CC_UNLIKELY(old != ret)) {
696 		ck_ec32_wake(ec, mode->ops);
697 	}
698 
699 	return ret;
700 }
701 
702 static CK_CC_INLINE uint32_t ck_ec32_add_mp(struct ck_ec32 *ec,
703 					    const struct ck_ec_mode *mode,
704 					    uint32_t delta)
705 {
706 	uint32_t old;
707 
708 	ck_pr_fence_store_atomic();
709 	old = ck_pr_faa_32(&ec->counter, delta);
710 	return ck_ec32_add_epilogue(ec, mode, old);
711 }
712 
713 #ifdef CK_F_EC_SP
714 static CK_CC_INLINE uint32_t ck_ec32_add_sp(struct ck_ec32 *ec,
715 					    const struct ck_ec_mode *mode,
716 					    uint32_t delta)
717 {
718 	uint32_t old;
719 
720 	/*
721 	 * Correctness of this racy write depends on actually
722 	 * having an update to write. Exit here if the update
723 	 * is a no-op.
724 	 */
725 	if (CK_CC_UNLIKELY(delta == 0)) {
726 		return ck_ec32_value(ec);
727 	}
728 
729 	ck_pr_fence_store();
730 	old = delta;
731 	__asm__ volatile("xaddl %1, %0"
732 			 : "+m"(ec->counter), "+r"(old)
733 			 :: "cc", "memory");
734 	return ck_ec32_add_epilogue(ec, mode, old);
735 }
736 #endif /* CK_F_EC_SP */
737 
738 CK_CC_FORCE_INLINE uint32_t ck_ec32_add(struct ck_ec32 *ec,
739 					const struct ck_ec_mode *mode,
740 					uint32_t delta)
741 {
742 #ifdef CK_F_EC_SP
743 	if (mode->single_producer == true) {
744 		return ck_ec32_add_sp(ec, mode, delta);
745 	}
746 #endif
747 
748 	return ck_ec32_add_mp(ec, mode, delta);
749 }
750 
751 int ck_ec_deadline_impl(struct timespec *new_deadline,
752 			const struct ck_ec_ops *ops,
753 			const struct timespec *timeout);
754 
755 CK_CC_FORCE_INLINE int ck_ec_deadline(struct timespec *new_deadline,
756 				      const struct ck_ec_mode *mode,
757 				      const struct timespec *timeout)
758 {
759 	return ck_ec_deadline_impl(new_deadline, mode->ops, timeout);
760 }
761 
762 
763 int ck_ec32_wait_slow(struct ck_ec32 *ec,
764 		      const struct ck_ec_ops *ops,
765 		      uint32_t old_value,
766 		      const struct timespec *deadline);
767 
768 CK_CC_FORCE_INLINE int ck_ec32_wait(struct ck_ec32 *ec,
769 				    const struct ck_ec_mode *mode,
770 				    uint32_t old_value,
771 				    const struct timespec *deadline)
772 {
773 	if (ck_ec32_value(ec) != old_value) {
774 		return 0;
775 	}
776 
777 	return ck_ec32_wait_slow(ec, mode->ops, old_value, deadline);
778 }
779 
780 int ck_ec32_wait_pred_slow(struct ck_ec32 *ec,
781 			   const struct ck_ec_ops *ops,
782 			   uint32_t old_value,
783 			   int (*pred)(const struct ck_ec_wait_state *state,
784 				       struct timespec *deadline),
785 			   void *data,
786 			   const struct timespec *deadline);
787 
788 CK_CC_FORCE_INLINE int
789 ck_ec32_wait_pred(struct ck_ec32 *ec,
790 		  const struct ck_ec_mode *mode,
791 		  uint32_t old_value,
792 		  int (*pred)(const struct ck_ec_wait_state *state,
793 			      struct timespec *deadline),
794 		  void *data,
795 		  const struct timespec *deadline)
796 {
797 	if (ck_ec32_value(ec) != old_value) {
798 		return 0;
799 	}
800 
801 	return ck_ec32_wait_pred_slow(ec, mode->ops, old_value,
802 				      pred, data, deadline);
803 }
804 
805 #ifdef CK_F_EC64
806 CK_CC_FORCE_INLINE void ck_ec64_init(struct ck_ec64 *ec, uint64_t value)
807 {
808 	ec->counter = value << 1;
809 	return;
810 }
811 
812 CK_CC_FORCE_INLINE uint64_t ck_ec64_value(const struct ck_ec64 *ec)
813 {
814 	uint64_t ret = ck_pr_load_64(&ec->counter) >> 1;
815 
816 	ck_pr_fence_acquire();
817 	return ret;
818 }
819 
820 CK_CC_FORCE_INLINE bool ck_ec64_has_waiters(const struct ck_ec64 *ec)
821 {
822 	return ck_pr_load_64(&ec->counter) & 1;
823 }
824 
825 void ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops);
826 
827 CK_CC_FORCE_INLINE void ck_ec64_inc(struct ck_ec64 *ec,
828 				    const struct ck_ec_mode *mode)
829 {
830 	/* We always xadd, so there's no special optimization here. */
831 	(void)ck_ec64_add(ec, mode, 1);
832 	return;
833 }
834 
835 CK_CC_FORCE_INLINE uint64_t ck_ec_add64_epilogue(struct ck_ec64 *ec,
836 					       const struct ck_ec_mode *mode,
837 					       uint64_t old)
838 {
839 	uint64_t ret = old >> 1;
840 
841 	if (CK_CC_UNLIKELY(old & 1)) {
842 		ck_ec64_wake(ec, mode->ops);
843 	}
844 
845 	return ret;
846 }
847 
848 static CK_CC_INLINE uint64_t ck_ec64_add_mp(struct ck_ec64 *ec,
849 					    const struct ck_ec_mode *mode,
850 					    uint64_t delta)
851 {
852 	uint64_t inc = 2 * delta;  /* The low bit is the flag bit. */
853 
854 	ck_pr_fence_store_atomic();
855 	return ck_ec_add64_epilogue(ec, mode, ck_pr_faa_64(&ec->counter, inc));
856 }
857 
858 #ifdef CK_F_EC_SP
859 /* Single-producer specialisation. */
860 static CK_CC_INLINE uint64_t ck_ec64_add_sp(struct ck_ec64 *ec,
861 					    const struct ck_ec_mode *mode,
862 					    uint64_t delta)
863 {
864 	uint64_t old;
865 
866 	/*
867 	 * Correctness of this racy write depends on actually
868 	 * having an update to write. Exit here if the update
869 	 * is a no-op.
870 	 */
871 	if (CK_CC_UNLIKELY(delta == 0)) {
872 		return ck_ec64_value(ec);
873 	}
874 
875 	ck_pr_fence_store();
876 	old = 2 * delta;  /* The low bit is the flag bit. */
877 	__asm__ volatile("xaddq %1, %0"
878 			 : "+m"(ec->counter), "+r"(old)
879 			 :: "cc", "memory");
880 	return ck_ec_add64_epilogue(ec, mode, old);
881 }
882 #endif /* CK_F_EC_SP */
883 
884 /*
885  * Dispatch on mode->single_producer in this FORCE_INLINE function:
886  * the end result is always small, but not all compilers have enough
887  * foresight to inline and get the reduction.
888  */
889 CK_CC_FORCE_INLINE uint64_t ck_ec64_add(struct ck_ec64 *ec,
890 					const struct ck_ec_mode *mode,
891 					uint64_t delta)
892 {
893 #ifdef CK_F_EC_SP
894 	if (mode->single_producer == true) {
895 		return ck_ec64_add_sp(ec, mode, delta);
896 	}
897 #endif
898 
899 	return ck_ec64_add_mp(ec, mode, delta);
900 }
901 
902 int ck_ec64_wait_slow(struct ck_ec64 *ec,
903 		      const struct ck_ec_ops *ops,
904 		      uint64_t old_value,
905 		      const struct timespec *deadline);
906 
907 CK_CC_FORCE_INLINE int ck_ec64_wait(struct ck_ec64 *ec,
908 				    const struct ck_ec_mode *mode,
909 				    uint64_t old_value,
910 				    const struct timespec *deadline)
911 {
912 	if (ck_ec64_value(ec) != old_value) {
913 		return 0;
914 	}
915 
916 	return ck_ec64_wait_slow(ec, mode->ops, old_value, deadline);
917 }
918 
919 int ck_ec64_wait_pred_slow(struct ck_ec64 *ec,
920 			   const struct ck_ec_ops *ops,
921 			   uint64_t old_value,
922 			   int (*pred)(const struct ck_ec_wait_state *state,
923 				       struct timespec *deadline),
924 			   void *data,
925 			   const struct timespec *deadline);
926 
927 
928 CK_CC_FORCE_INLINE int
929 ck_ec64_wait_pred(struct ck_ec64 *ec,
930 		  const struct ck_ec_mode *mode,
931 		  uint64_t old_value,
932 		  int (*pred)(const struct ck_ec_wait_state *state,
933 			      struct timespec *deadline),
934 		  void *data,
935 		  const struct timespec *deadline)
936 {
937 	if (ck_ec64_value(ec) != old_value) {
938 		return 0;
939 	}
940 
941 	return ck_ec64_wait_pred_slow(ec, mode->ops, old_value,
942 				      pred, data, deadline);
943 }
944 #endif /* CK_F_EC64 */
945 #endif /* !CK_EC_H */
946