1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy * CDDL HEADER START
3eda14cbcSMatt Macy *
4eda14cbcSMatt Macy * This file and its contents are supplied under the terms of the
5eda14cbcSMatt Macy * Common Development and Distribution License ("CDDL"), version 1.0.
6eda14cbcSMatt Macy * You may only use this file in accordance with the terms of version
7eda14cbcSMatt Macy * 1.0 of the CDDL.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * A full copy of the text of the CDDL should have accompanied this
10eda14cbcSMatt Macy * source. A copy of the CDDL is also available via the Internet at
11eda14cbcSMatt Macy * http://www.illumos.org/license/CDDL.
12eda14cbcSMatt Macy *
13eda14cbcSMatt Macy * CDDL HEADER END
14eda14cbcSMatt Macy */
15eda14cbcSMatt Macy /*
16eda14cbcSMatt Macy * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
17eda14cbcSMatt Macy */
18eda14cbcSMatt Macy
19eda14cbcSMatt Macy #include <sys/zfs_context.h>
20eda14cbcSMatt Macy #include <sys/aggsum.h>
21eda14cbcSMatt Macy
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Aggregate-sum counters are a form of fanned-out counter, used when atomic
24eda14cbcSMatt Macy * instructions on a single field cause enough CPU cache line contention to
25eda14cbcSMatt Macy * slow system performance. Due to their increased overhead and the expense
26eda14cbcSMatt Macy * involved with precisely reading from them, they should only be used in cases
27eda14cbcSMatt Macy * where the write rate (increment/decrement) is much higher than the read rate
28eda14cbcSMatt Macy * (get value).
29eda14cbcSMatt Macy *
30eda14cbcSMatt Macy * Aggregate sum counters are comprised of two basic parts, the core and the
31eda14cbcSMatt Macy * buckets. The core counter contains a lock for the entire counter, as well
32eda14cbcSMatt Macy * as the current upper and lower bounds on the value of the counter. The
33eda14cbcSMatt Macy * aggsum_bucket structure contains a per-bucket lock to protect the contents of
34eda14cbcSMatt Macy * the bucket, the current amount that this bucket has changed from the global
35eda14cbcSMatt Macy * counter (called the delta), and the amount of increment and decrement we have
36eda14cbcSMatt Macy * "borrowed" from the core counter.
37eda14cbcSMatt Macy *
38eda14cbcSMatt Macy * The basic operation of an aggsum is simple. Threads that wish to modify the
39eda14cbcSMatt Macy * counter will modify one bucket's counter (determined by their current CPU, to
40eda14cbcSMatt Macy * help minimize lock and cache contention). If the bucket already has
41eda14cbcSMatt Macy * sufficient capacity borrowed from the core structure to handle their request,
42eda14cbcSMatt Macy * they simply modify the delta and return. If the bucket does not, we clear
43eda14cbcSMatt Macy * the bucket's current state (to prevent the borrowed amounts from getting too
44eda14cbcSMatt Macy * large), and borrow more from the core counter. Borrowing is done by adding to
45eda14cbcSMatt Macy * the upper bound (or subtracting from the lower bound) of the core counter,
46eda14cbcSMatt Macy * and setting the borrow value for the bucket to the amount added (or
47eda14cbcSMatt Macy * subtracted). Clearing the bucket is the opposite; we add the current delta
48eda14cbcSMatt Macy * to both the lower and upper bounds of the core counter, subtract the borrowed
 * increment from the upper bound, and add the borrowed decrement to the
50eda14cbcSMatt Macy * lower bound. Note that only borrowing and clearing require access to the
51eda14cbcSMatt Macy * core counter; since all other operations access CPU-local resources,
52eda14cbcSMatt Macy * performance can be much higher than a traditional counter.
53eda14cbcSMatt Macy *
54eda14cbcSMatt Macy * Threads that wish to read from the counter have a slightly more challenging
 * task. It is fast to determine the upper and lower bounds of the aggsum; this
56eda14cbcSMatt Macy * does not require grabbing any locks. This suffices for cases where an
57eda14cbcSMatt Macy * approximation of the aggsum's value is acceptable. However, if one needs to
58eda14cbcSMatt Macy * know whether some specific value is above or below the current value in the
59eda14cbcSMatt Macy * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
60eda14cbcSMatt Macy * comparing the target value to the upper and lower bounds of the aggsum, and
61eda14cbcSMatt Macy * then clearing a bucket. This proceeds until the target is outside of the
62eda14cbcSMatt Macy * upper and lower bounds and we return a response, or the last bucket has been
63eda14cbcSMatt Macy * cleared and we know that the target is equal to the aggsum's value. Finally,
64eda14cbcSMatt Macy * the most expensive operation is determining the precise value of the aggsum.
65eda14cbcSMatt Macy * To do this, we clear every bucket and then return the upper bound (which must
66eda14cbcSMatt Macy * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
67eda14cbcSMatt Macy * expensive is clearing buckets. This involves grabbing the global lock
68eda14cbcSMatt Macy * (serializing against themselves and borrow operations), grabbing a bucket's
69eda14cbcSMatt Macy * lock (preventing threads on those CPUs from modifying their delta), and
70eda14cbcSMatt Macy * zeroing out the borrowed value (forcing that thread to borrow on its next
71eda14cbcSMatt Macy * request, which will also be expensive). This is what makes aggsums well
72eda14cbcSMatt Macy * suited for write-many read-rarely operations.
737877fdebSMatt Macy *
747877fdebSMatt Macy * Note that the aggsums do not expand if more CPUs are hot-added. In that
757877fdebSMatt Macy * case, we will have less fanout than boot_ncpus, but we don't want to always
767877fdebSMatt Macy * reserve the RAM necessary to create the extra slots for additional CPUs up
777877fdebSMatt Macy * front, and dynamically adding them is a complex task.
78eda14cbcSMatt Macy */
79eda14cbcSMatt Macy
80eda14cbcSMatt Macy /*
8116038816SMartin Matuska * We will borrow 2^aggsum_borrow_shift times the current request, so we will
8216038816SMartin Matuska * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
8316038816SMartin Matuska * aggsum_add().
84eda14cbcSMatt Macy */
8516038816SMartin Matuska static uint_t aggsum_borrow_shift = 4;
86eda14cbcSMatt Macy
87eda14cbcSMatt Macy void
aggsum_init(aggsum_t * as,uint64_t value)88eda14cbcSMatt Macy aggsum_init(aggsum_t *as, uint64_t value)
89eda14cbcSMatt Macy {
90*da5137abSMartin Matuska memset(as, 0, sizeof (*as));
91eda14cbcSMatt Macy as->as_lower_bound = as->as_upper_bound = value;
92eda14cbcSMatt Macy mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
9316038816SMartin Matuska /*
9416038816SMartin Matuska * Too many buckets may hurt read performance without improving
9516038816SMartin Matuska * write. From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc.
9616038816SMartin Matuska */
9716038816SMartin Matuska as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
9816038816SMartin Matuska as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
9916038816SMartin Matuska as->as_buckets = kmem_zalloc(as->as_numbuckets *
10016038816SMartin Matuska sizeof (aggsum_bucket_t), KM_SLEEP);
101eda14cbcSMatt Macy for (int i = 0; i < as->as_numbuckets; i++) {
102eda14cbcSMatt Macy mutex_init(&as->as_buckets[i].asc_lock,
103eda14cbcSMatt Macy NULL, MUTEX_DEFAULT, NULL);
104eda14cbcSMatt Macy }
105eda14cbcSMatt Macy }
106eda14cbcSMatt Macy
107eda14cbcSMatt Macy void
aggsum_fini(aggsum_t * as)108eda14cbcSMatt Macy aggsum_fini(aggsum_t *as)
109eda14cbcSMatt Macy {
110eda14cbcSMatt Macy for (int i = 0; i < as->as_numbuckets; i++)
111eda14cbcSMatt Macy mutex_destroy(&as->as_buckets[i].asc_lock);
112eda14cbcSMatt Macy kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
113eda14cbcSMatt Macy mutex_destroy(&as->as_lock);
114eda14cbcSMatt Macy }
115eda14cbcSMatt Macy
116eda14cbcSMatt Macy int64_t
aggsum_lower_bound(aggsum_t * as)117eda14cbcSMatt Macy aggsum_lower_bound(aggsum_t *as)
118eda14cbcSMatt Macy {
11916038816SMartin Matuska return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
120eda14cbcSMatt Macy }
121eda14cbcSMatt Macy
12216038816SMartin Matuska uint64_t
aggsum_upper_bound(aggsum_t * as)123eda14cbcSMatt Macy aggsum_upper_bound(aggsum_t *as)
124eda14cbcSMatt Macy {
12516038816SMartin Matuska return (atomic_load_64(&as->as_upper_bound));
126eda14cbcSMatt Macy }
127eda14cbcSMatt Macy
/*
 * Return the precise value of the aggsum.  This is the expensive read path:
 * it takes the global lock and folds every bucket's outstanding delta back
 * into the core bounds, which converge to the exact value.
 */
uint64_t
aggsum_value(aggsum_t *as)
{
	int64_t lb;
	uint64_t ub;

	mutex_enter(&as->as_lock);
	lb = as->as_lower_bound;
	ub = as->as_upper_bound;
	if (lb == ub) {
		/*
		 * Bounds already agree, so every bucket must be clear and
		 * the bounds are the exact value; nothing to fold in.
		 */
		for (int i = 0; i < as->as_numbuckets; i++) {
			ASSERT0(as->as_buckets[i].asc_delta);
			ASSERT0(as->as_buckets[i].asc_borrowed);
		}
		mutex_exit(&as->as_lock);
		return (lb);
	}
	for (int i = 0; i < as->as_numbuckets; i++) {
		struct aggsum_bucket *asb = &as->as_buckets[i];
		/* A bucket with nothing borrowed has no pending delta. */
		if (asb->asc_borrowed == 0)
			continue;
		mutex_enter(&asb->asc_lock);
		/*
		 * Clear the bucket: apply its delta to both bounds and
		 * return the borrowed slack (added to ub, subtracted from
		 * lb when borrowing) back to the core counter.
		 */
		lb += asb->asc_delta + asb->asc_borrowed;
		ub += asb->asc_delta - asb->asc_borrowed;
		asb->asc_delta = 0;
		asb->asc_borrowed = 0;
		mutex_exit(&asb->asc_lock);
	}
	/* With every bucket cleared the bounds must have converged. */
	ASSERT3U(lb, ==, ub);
	/* Atomic stores pair with the lock-free bound readers. */
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
	atomic_store_64(&as->as_upper_bound, lb);
	mutex_exit(&as->as_lock);

	return (lb);
}
163eda14cbcSMatt Macy
/*
 * Add delta (which may be negative) to the aggsum's value.  The common
 * case touches only the current CPU's bucket; the global as_lock is taken
 * only when the bucket must borrow more slack from the core counter.
 */
void
aggsum_add(aggsum_t *as, int64_t delta)
{
	struct aggsum_bucket *asb;
	int64_t borrow;

	/* Select a bucket by current CPU to minimize lock contention. */
	asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
	    as->as_numbuckets];

	/* Try fast path if we already borrowed enough before. */
	mutex_enter(&asb->asc_lock);
	if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
	    asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
		asb->asc_delta += delta;
		mutex_exit(&asb->asc_lock);
		return;
	}
	mutex_exit(&asb->asc_lock);

	/*
	 * We haven't borrowed enough. Take the global lock and borrow
	 * considering what is requested now and what we borrowed before.
	 */
	borrow = (delta < 0 ? -delta : delta);
	borrow <<= aggsum_borrow_shift + as->as_bucketshift;
	mutex_enter(&as->as_lock);
	if (borrow >= asb->asc_borrowed)
		borrow -= asb->asc_borrowed;
	else
		/*
		 * Previously borrowed more than this request needs; shrink
		 * the borrowed amount gradually (by a quarter of the excess)
		 * rather than all at once.
		 */
		borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
	mutex_enter(&asb->asc_lock);
	/* Fold the bucket's accumulated delta into this update. */
	delta += asb->asc_delta;
	asb->asc_delta = 0;
	asb->asc_borrowed += borrow;
	mutex_exit(&asb->asc_lock);
	/*
	 * Widen the bounds by the borrowed slack while applying the delta.
	 * Plain reads of the bounds are safe here (we hold as_lock); the
	 * atomic stores pair with the lock-free bound readers.
	 */
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
	    as->as_lower_bound + delta - borrow);
	atomic_store_64(&as->as_upper_bound,
	    as->as_upper_bound + delta + borrow);
	mutex_exit(&as->as_lock);
}
205eda14cbcSMatt Macy
/*
 * Compare the aggsum value to target efficiently. Returns -1 if the value
 * represented by the aggsum is less than target, 1 if it's greater, and 0 if
 * they are equal.
 */
int
aggsum_compare(aggsum_t *as, uint64_t target)
{
	int64_t lb;
	uint64_t ub;
	int i;

	/*
	 * Lock-free fast path: if the target already lies outside the
	 * current bounds, the answer is known without clearing anything.
	 */
	if (atomic_load_64(&as->as_upper_bound) < target)
		return (-1);
	lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound);
	if (lb > 0 && (uint64_t)lb > target)
		return (1);
	mutex_enter(&as->as_lock);
	lb = as->as_lower_bound;
	ub = as->as_upper_bound;
	/*
	 * Clear buckets one at a time, tightening the bounds, until the
	 * target falls outside them or all buckets are cleared (in which
	 * case the bounds have converged to the exact value).
	 */
	for (i = 0; i < as->as_numbuckets; i++) {
		struct aggsum_bucket *asb = &as->as_buckets[i];
		/* A bucket with nothing borrowed has no pending delta. */
		if (asb->asc_borrowed == 0)
			continue;
		mutex_enter(&asb->asc_lock);
		lb += asb->asc_delta + asb->asc_borrowed;
		ub += asb->asc_delta - asb->asc_borrowed;
		asb->asc_delta = 0;
		asb->asc_borrowed = 0;
		mutex_exit(&asb->asc_lock);
		if (ub < target || (lb > 0 && (uint64_t)lb > target))
			break;
	}
	if (i >= as->as_numbuckets)
		ASSERT3U(lb, ==, ub);
	/* Publish the tightened bounds for the lock-free readers. */
	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
	atomic_store_64(&as->as_upper_bound, ub);
	mutex_exit(&as->as_lock);
	return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
}
246