1*61145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy
23eda14cbcSMatt Macy /*
24eda14cbcSMatt Macy * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
25eda14cbcSMatt Macy */
26eda14cbcSMatt Macy
27eda14cbcSMatt Macy #include <sys/dmu_objset.h>
28eda14cbcSMatt Macy #include <sys/metaslab.h>
29eda14cbcSMatt Macy #include <sys/metaslab_impl.h>
30eda14cbcSMatt Macy #include <sys/spa.h>
31eda14cbcSMatt Macy #include <sys/spa_impl.h>
32eda14cbcSMatt Macy #include <sys/spa_log_spacemap.h>
33eda14cbcSMatt Macy #include <sys/vdev_impl.h>
34eda14cbcSMatt Macy #include <sys/zap.h>
35eda14cbcSMatt Macy
36eda14cbcSMatt Macy /*
37eda14cbcSMatt Macy * Log Space Maps
38eda14cbcSMatt Macy *
39eda14cbcSMatt Macy * Log space maps are an optimization in ZFS metadata allocations for pools
40eda14cbcSMatt Macy * whose workloads are primarily random-writes. Random-write workloads are also
41eda14cbcSMatt Macy * typically random-free, meaning that they are freeing from locations scattered
42eda14cbcSMatt Macy * throughout the pool. This means that each TXG we will have to append some
43eda14cbcSMatt Macy * FREE records to almost every metaslab. With log space maps, we hold their
44eda14cbcSMatt Macy * changes in memory and log them altogether in one pool-wide space map on-disk
45eda14cbcSMatt Macy * for persistence. As more blocks are accumulated in the log space maps and
46eda14cbcSMatt Macy * more unflushed changes are accounted in memory, we flush a selected group
47eda14cbcSMatt Macy * of metaslabs every TXG to relieve memory pressure and potential overheads
48eda14cbcSMatt Macy * when loading the pool. Flushing a metaslab to disk relieves memory as we
49eda14cbcSMatt Macy * flush any unflushed changes from memory to disk (i.e. the metaslab's space
50eda14cbcSMatt Macy * map) and saves import time by making old log space maps obsolete and
51eda14cbcSMatt Macy * eventually destroying them. [A log space map is said to be obsolete when all
52eda14cbcSMatt Macy * its entries have made it to their corresponding metaslab space maps].
53eda14cbcSMatt Macy *
54eda14cbcSMatt Macy * == On disk data structures used ==
55eda14cbcSMatt Macy *
56eda14cbcSMatt Macy * - The pool has a new feature flag and a new entry in the MOS. The feature
57eda14cbcSMatt Macy * is activated when we create the first log space map and remains active
58eda14cbcSMatt Macy * for the lifetime of the pool. The new entry in the MOS Directory [refer
59eda14cbcSMatt Macy * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
60eda14cbcSMatt Macy * pairs are of the form <key: txg, value: log space map object for that txg>.
61eda14cbcSMatt Macy * This entry is our on-disk reference of the log space maps that exist in
62eda14cbcSMatt Macy * the pool for each TXG and it is used during import to load all the
63eda14cbcSMatt Macy * metaslab unflushed changes in memory. To see how this structure is first
64eda14cbcSMatt Macy * created and later populated refer to spa_generate_syncing_log_sm(). To see
65eda14cbcSMatt Macy * how it is used during import time refer to spa_ld_log_sm_metadata().
66eda14cbcSMatt Macy *
67eda14cbcSMatt Macy * - Each vdev has a new entry in its vdev_top_zap (see field
68eda14cbcSMatt Macy * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
69eda14cbcSMatt Macy * each metaslab in this vdev. This field is the on-disk counterpart of the
70eda14cbcSMatt Macy * in-memory field ms_unflushed_txg which tells us from which TXG and onwards
 *   the metaslab hasn't had its changes flushed. During import, we use this
72eda14cbcSMatt Macy * to ignore any entries in the space map log that are for this metaslab but
73eda14cbcSMatt Macy * from a TXG before msp_unflushed_txg. At that point, we also populate its
74eda14cbcSMatt Macy * in-memory counterpart and from there both fields are updated every time
75eda14cbcSMatt Macy * we flush that metaslab.
76eda14cbcSMatt Macy *
77eda14cbcSMatt Macy * - A space map is created every TXG and, during that TXG, it is used to log
78eda14cbcSMatt Macy * all incoming changes (the log space map). When created, the log space map
79eda14cbcSMatt Macy * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
80eda14cbcSMatt Macy * to the space map ZAP mentioned above. The log space map is closed at the
81eda14cbcSMatt Macy * end of the TXG and will be destroyed when it becomes fully obsolete. We
82eda14cbcSMatt Macy * know when a log space map has become obsolete by looking at the oldest
83eda14cbcSMatt Macy * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
84eda14cbcSMatt Macy * than the log space map's TXG, then it means that there is no metaslab who
85eda14cbcSMatt Macy * doesn't have the changes from that log and we can therefore destroy it.
86eda14cbcSMatt Macy * [see spa_cleanup_old_sm_logs()].
87eda14cbcSMatt Macy *
88eda14cbcSMatt Macy * == Important in-memory structures ==
89eda14cbcSMatt Macy *
90eda14cbcSMatt Macy * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
91eda14cbcSMatt Macy * the pool by their ms_unflushed_txg field. It is primarily used for three
92eda14cbcSMatt Macy * reasons. First of all, it is used during flushing where we try to flush
93eda14cbcSMatt Macy * metaslabs in-order from the oldest-flushed to the most recently flushed
94eda14cbcSMatt Macy * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
95eda14cbcSMatt Macy * oldest flushed metaslab to distinguish which log space maps have become
96eda14cbcSMatt Macy * obsolete and which ones are still relevant. Finally it tells us which
97eda14cbcSMatt Macy * metaslabs have unflushed changes in a pool where this feature was just
98eda14cbcSMatt Macy * enabled, as we don't immediately add all of the pool's metaslabs but we
99eda14cbcSMatt Macy * add them over time as they go through metaslab_sync(). The reason that
100eda14cbcSMatt Macy * we do that is to ease these pools into the behavior of the flushing
101eda14cbcSMatt Macy * algorithm (described later on).
102eda14cbcSMatt Macy *
103eda14cbcSMatt Macy * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
104eda14cbcSMatt Macy * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
105eda14cbcSMatt Macy * nodes represent the log space maps in the pool. This in-memory
106eda14cbcSMatt Macy * representation of log space maps in the pool sorts the log space maps by
107eda14cbcSMatt Macy * the TXG that they were created (which is also the TXG of their unflushed
108eda14cbcSMatt Macy * changes). It also contains the following extra information for each
109eda14cbcSMatt Macy * space map:
110eda14cbcSMatt Macy * [1] The number of metaslabs that were last flushed on that TXG. This is
111eda14cbcSMatt Macy * important because if that counter is zero and this is the oldest
112eda14cbcSMatt Macy * log then it means that it is also obsolete.
113eda14cbcSMatt Macy * [2] The number of blocks of that space map. This field is used by the
114eda14cbcSMatt Macy * block heuristic of our flushing algorithm (described later on).
115eda14cbcSMatt Macy * It represents how many blocks of metadata changes ZFS had to write
116eda14cbcSMatt Macy * to disk for that TXG.
117eda14cbcSMatt Macy *
118eda14cbcSMatt Macy * - The per-spa field spa_log_summary is a list of entries that summarizes
119eda14cbcSMatt Macy * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
120eda14cbcSMatt Macy * AVL tree mentioned above. The reason this exists is that our flushing
121eda14cbcSMatt Macy * algorithm (described later) tries to estimate how many metaslabs to flush
122eda14cbcSMatt Macy * in each TXG by iterating over all the log space maps and looking at their
 *   block counts. Summarizing that information means that we don't have to
124eda14cbcSMatt Macy * iterate through each space map, minimizing the runtime overhead of the
125eda14cbcSMatt Macy * flushing algorithm which would be induced in syncing context. In terms of
126eda14cbcSMatt Macy * implementation the log summary is used as a queue:
127eda14cbcSMatt Macy * * we modify or pop entries from its head when we flush metaslabs
128eda14cbcSMatt Macy * * we modify or append entries to its tail when we sync changes.
129eda14cbcSMatt Macy *
130eda14cbcSMatt Macy * - Each metaslab has two new range trees that hold its unflushed changes,
131eda14cbcSMatt Macy * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
132eda14cbcSMatt Macy *
133eda14cbcSMatt Macy * == Flushing algorithm ==
134eda14cbcSMatt Macy *
 * The decision of how many metaslabs to flush on a given TXG is guided by
136eda14cbcSMatt Macy * two heuristics:
137eda14cbcSMatt Macy *
138eda14cbcSMatt Macy * [1] The memory heuristic -
139eda14cbcSMatt Macy * We keep track of the memory used by the unflushed trees from all the
140eda14cbcSMatt Macy * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
141eda14cbcSMatt Macy * stays below a certain threshold which is determined by an arbitrary hard
142eda14cbcSMatt Macy * limit and an arbitrary percentage of the system's memory [see
143eda14cbcSMatt Macy * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
144eda14cbcSMatt Macy * unflushed changes are passing that threshold, we flush metaslabs, which
145eda14cbcSMatt Macy * empties their unflushed range trees, reducing the memory used.
146eda14cbcSMatt Macy *
147eda14cbcSMatt Macy * [2] The block heuristic -
148eda14cbcSMatt Macy * We try to keep the total number of blocks in the log space maps in check
149eda14cbcSMatt Macy * so the log doesn't grow indefinitely and we don't induce a lot of overhead
150eda14cbcSMatt Macy * when loading the pool. At the same time we don't want to flush a lot of
151eda14cbcSMatt Macy * metaslabs too often as this would defeat the purpose of the log space map.
152eda14cbcSMatt Macy * As a result we set a limit in the amount of blocks that we think it's
153eda14cbcSMatt Macy * acceptable for the log space maps to have and try not to cross it.
154eda14cbcSMatt Macy * [see sus_blocklimit from spa_unflushed_stats].
155eda14cbcSMatt Macy *
156eda14cbcSMatt Macy * In order to stay below the block limit every TXG we have to estimate how
157eda14cbcSMatt Macy * many metaslabs we need to flush based on the current rate of incoming blocks
158eda14cbcSMatt Macy * and our history of log space map blocks. The main idea here is to answer
159eda14cbcSMatt Macy * the question of how many metaslabs do we need to flush in order to get rid
 * of at least an X amount of log space map blocks. We can answer this question
161eda14cbcSMatt Macy * by iterating backwards from the oldest log space map to the newest one
162eda14cbcSMatt Macy * and looking at their metaslab and block counts. At this point the log summary
163eda14cbcSMatt Macy * mentioned above comes handy as it reduces the amount of things that we have
164eda14cbcSMatt Macy * to iterate (even though it may reduce the preciseness of our estimates due
165eda14cbcSMatt Macy * to its aggregation of data). So with that in mind, we project the incoming
166eda14cbcSMatt Macy * rate of the current TXG into the future and attempt to approximate how many
167eda14cbcSMatt Macy * metaslabs would we need to flush from now in order to avoid exceeding our
168eda14cbcSMatt Macy * block limit in different points in the future (granted that we would keep
169eda14cbcSMatt Macy * flushing the same number of metaslabs for every TXG). Then we take the
170eda14cbcSMatt Macy * maximum number from all these estimates to be on the safe side. For the
 * exact implementation details of the algorithm refer to
172eda14cbcSMatt Macy * spa_estimate_metaslabs_to_flush.
173eda14cbcSMatt Macy */
174eda14cbcSMatt Macy
175eda14cbcSMatt Macy /*
176eda14cbcSMatt Macy * This is used as the block size for the space maps used for the
177eda14cbcSMatt Macy * log space map feature. These space maps benefit from a bigger
178eda14cbcSMatt Macy * block size as we expect to be writing a lot of data to them at
179eda14cbcSMatt Macy * once.
180eda14cbcSMatt Macy */
static const unsigned long zfs_log_sm_blksz = (1ULL << 17);
182eda14cbcSMatt Macy
183eda14cbcSMatt Macy /*
184eda14cbcSMatt Macy * Percentage of the overall system's memory that ZFS allows to be
185eda14cbcSMatt Macy * used for unflushed changes (e.g. the sum of size of all the nodes
186eda14cbcSMatt Macy * in the unflushed trees).
187eda14cbcSMatt Macy *
188eda14cbcSMatt Macy * Note that this value is calculated over 1000000 for finer granularity
189eda14cbcSMatt Macy * (thus the _ppm suffix; reads as "parts per million"). As an example,
190eda14cbcSMatt Macy * the default of 1000 allows 0.1% of memory to be used.
191eda14cbcSMatt Macy */
static uint64_t zfs_unflushed_max_mem_ppm = 1000;	/* 0.1% of memory */
193eda14cbcSMatt Macy
194eda14cbcSMatt Macy /*
195eda14cbcSMatt Macy * Specific hard-limit in memory that ZFS allows to be used for
196eda14cbcSMatt Macy * unflushed changes.
197eda14cbcSMatt Macy */
static uint64_t zfs_unflushed_max_mem_amt = (1ULL << 30);
199eda14cbcSMatt Macy
200eda14cbcSMatt Macy /*
201eda14cbcSMatt Macy * The following tunable determines the number of blocks that can be used for
202eda14cbcSMatt Macy * the log space maps. It is expressed as a percentage of the total number of
203eda14cbcSMatt Macy * metaslabs in the pool (i.e. the default of 400 means that the number of log
204eda14cbcSMatt Macy * blocks is capped at 4 times the number of metaslabs).
205eda14cbcSMatt Macy *
206eda14cbcSMatt Macy * This value exists to tune our flushing algorithm, with higher values
207eda14cbcSMatt Macy * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
208eda14cbcSMatt Macy * flushing metaslabs more aggressively with the upside of saving overheads
209eda14cbcSMatt Macy * when loading the pool. Another factor in this tradeoff is that flushing
210eda14cbcSMatt Macy * less often can potentially lead to better utilization of the metaslab space
211eda14cbcSMatt Macy * map's block size as we accumulate more changes per flush.
212eda14cbcSMatt Macy *
 * This tunable indirectly controls the flush rate (metaslabs flushed per txg),
 * which is why expressing it as a percentage of the number of metaslabs in the
 * pool makes sense here.
216eda14cbcSMatt Macy *
217eda14cbcSMatt Macy * As a rule of thumb we default this tunable to 400% based on the following:
218eda14cbcSMatt Macy *
219eda14cbcSMatt Macy * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
220eda14cbcSMatt Macy * it is reasonable to expect that the amount of obsolete entries changes
221eda14cbcSMatt Macy * linearly from txg to txg (e.g. the oldest log should have the most
222eda14cbcSMatt Macy * obsolete entries, and the most recent one the least). With this we could
223eda14cbcSMatt Macy * say that, at any given time, about half of the entries in the whole space
224eda14cbcSMatt Macy * map log are obsolete. Thus for every two entries for a metaslab in the
225eda14cbcSMatt Macy * log space map, only one of them is valid and actually makes it to the
226eda14cbcSMatt Macy * metaslab's space map.
227eda14cbcSMatt Macy * [factor of 2]
228eda14cbcSMatt Macy * 2] Each entry in the log space map is guaranteed to be two words while
229eda14cbcSMatt Macy * entries in metaslab space maps are generally single-word.
230eda14cbcSMatt Macy * [an extra factor of 2 - 400% overall]
231eda14cbcSMatt Macy * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
232eda14cbcSMatt Macy * account any consolidation of segments from the log space map to the
233eda14cbcSMatt Macy * unflushed range trees nor their history (e.g. a segment being allocated,
234eda14cbcSMatt Macy * then freed, then allocated again means 3 log space map entries but 0
235eda14cbcSMatt Macy * metaslab space map entries). Depending on the workload, we've seen ~1.8
236eda14cbcSMatt Macy * non-obsolete log space map entries per metaslab entry, for a total of
237eda14cbcSMatt Macy * ~600%. Since most of these estimates though are workload dependent, we
238eda14cbcSMatt Macy * default on 400% to be conservative.
239eda14cbcSMatt Macy *
240eda14cbcSMatt Macy * Thus we could say that even in the worst
241eda14cbcSMatt Macy * case of [1] and [2], the factor should end up being 4.
242eda14cbcSMatt Macy *
243eda14cbcSMatt Macy * That said, regardless of the number of metaslabs in the pool we need to
244eda14cbcSMatt Macy * provide upper and lower bounds for the log block limit.
245eda14cbcSMatt Macy * [see zfs_unflushed_log_block_{min,max}]
246eda14cbcSMatt Macy */
247dbd5678dSMartin Matuska static uint_t zfs_unflushed_log_block_pct = 400;
248eda14cbcSMatt Macy
249eda14cbcSMatt Macy /*
250eda14cbcSMatt Macy * If the number of metaslabs is small and our incoming rate is high, we could
251eda14cbcSMatt Macy * get into a situation that we are flushing all our metaslabs every TXG. Thus
252eda14cbcSMatt Macy * we always allow at least this many log blocks.
253eda14cbcSMatt Macy */
static uint64_t zfs_unflushed_log_block_min = 1000;
255eda14cbcSMatt Macy
256eda14cbcSMatt Macy /*
257eda14cbcSMatt Macy * If the log becomes too big, the import time of the pool can take a hit in
258eda14cbcSMatt Macy * terms of performance. Thus we have a hard limit in the size of the log in
259eda14cbcSMatt Macy * terms of blocks.
260eda14cbcSMatt Macy */
static uint64_t zfs_unflushed_log_block_max = 1ULL << 17;
262716fd348SMartin Matuska
263716fd348SMartin Matuska /*
264716fd348SMartin Matuska * Also we have a hard limit in the size of the log in terms of dirty TXGs.
265716fd348SMartin Matuska */
static uint64_t zfs_unflushed_log_txg_max = 1000;
267eda14cbcSMatt Macy
268eda14cbcSMatt Macy /*
269eda14cbcSMatt Macy * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
270eda14cbcSMatt Macy * stability of the flushing algorithm (longer summary) vs its runtime overhead
271eda14cbcSMatt Macy * (smaller summary is faster to traverse).
272eda14cbcSMatt Macy */
static uint64_t zfs_max_logsm_summary_length = 10;
274eda14cbcSMatt Macy
275eda14cbcSMatt Macy /*
276eda14cbcSMatt Macy * Tunable that sets the lower bound on the metaslabs to flush every TXG.
277eda14cbcSMatt Macy *
278eda14cbcSMatt Macy * Setting this to 0 has no effect since if the pool is idle we won't even be
279eda14cbcSMatt Macy * creating log space maps and therefore we won't be flushing. On the other
280eda14cbcSMatt Macy * hand if the pool has any incoming workload our block heuristic will start
281eda14cbcSMatt Macy * flushing metaslabs anyway.
282eda14cbcSMatt Macy *
283eda14cbcSMatt Macy * The point of this tunable is to be used in extreme cases where we really
284eda14cbcSMatt Macy * want to flush more metaslabs than our adaptable heuristic plans to flush.
285eda14cbcSMatt Macy */
static uint64_t zfs_min_metaslabs_to_flush = 1;
287eda14cbcSMatt Macy
288eda14cbcSMatt Macy /*
289eda14cbcSMatt Macy * Tunable that specifies how far in the past do we want to look when trying to
290eda14cbcSMatt Macy * estimate the incoming log blocks for the current TXG.
291eda14cbcSMatt Macy *
292eda14cbcSMatt Macy * Setting this too high may not only increase runtime but also minimize the
293eda14cbcSMatt Macy * effect of the incoming rates from the most recent TXGs as we take the
294eda14cbcSMatt Macy * average over all the blocks that we walk
295eda14cbcSMatt Macy * [see spa_estimate_incoming_log_blocks].
296eda14cbcSMatt Macy */
static uint64_t zfs_max_log_walking = 5;
298eda14cbcSMatt Macy
299eda14cbcSMatt Macy /*
300eda14cbcSMatt Macy * This tunable exists solely for testing purposes. It ensures that the log
301eda14cbcSMatt Macy * spacemaps are not flushed and destroyed during export in order for the
302eda14cbcSMatt Macy * relevant log spacemap import code paths to be tested (effectively simulating
303eda14cbcSMatt Macy * a crash).
304eda14cbcSMatt Macy */
int zfs_keep_log_spacemaps_at_export = 0;	/* testing only; off by default */
306eda14cbcSMatt Macy
307eda14cbcSMatt Macy static uint64_t
spa_estimate_incoming_log_blocks(spa_t * spa)308eda14cbcSMatt Macy spa_estimate_incoming_log_blocks(spa_t *spa)
309eda14cbcSMatt Macy {
310eda14cbcSMatt Macy ASSERT3U(spa_sync_pass(spa), ==, 1);
311eda14cbcSMatt Macy uint64_t steps = 0, sum = 0;
312eda14cbcSMatt Macy for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
313eda14cbcSMatt Macy sls != NULL && steps < zfs_max_log_walking;
314eda14cbcSMatt Macy sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
315eda14cbcSMatt Macy if (sls->sls_txg == spa_syncing_txg(spa)) {
316eda14cbcSMatt Macy /*
317eda14cbcSMatt Macy * skip the log created in this TXG as this would
318eda14cbcSMatt Macy * make our estimations inaccurate.
319eda14cbcSMatt Macy */
320eda14cbcSMatt Macy continue;
321eda14cbcSMatt Macy }
322eda14cbcSMatt Macy sum += sls->sls_nblocks;
323eda14cbcSMatt Macy steps++;
324eda14cbcSMatt Macy }
325eda14cbcSMatt Macy return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
326eda14cbcSMatt Macy }
327eda14cbcSMatt Macy
328eda14cbcSMatt Macy uint64_t
spa_log_sm_blocklimit(spa_t * spa)329eda14cbcSMatt Macy spa_log_sm_blocklimit(spa_t *spa)
330eda14cbcSMatt Macy {
331eda14cbcSMatt Macy return (spa->spa_unflushed_stats.sus_blocklimit);
332eda14cbcSMatt Macy }
333eda14cbcSMatt Macy
334eda14cbcSMatt Macy void
spa_log_sm_set_blocklimit(spa_t * spa)335eda14cbcSMatt Macy spa_log_sm_set_blocklimit(spa_t *spa)
336eda14cbcSMatt Macy {
337eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
338eda14cbcSMatt Macy ASSERT0(spa_log_sm_blocklimit(spa));
339eda14cbcSMatt Macy return;
340eda14cbcSMatt Macy }
341eda14cbcSMatt Macy
342716fd348SMartin Matuska uint64_t msdcount = 0;
343716fd348SMartin Matuska for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
344716fd348SMartin Matuska e; e = list_next(&spa->spa_log_summary, e))
345716fd348SMartin Matuska msdcount += e->lse_msdcount;
346716fd348SMartin Matuska
347716fd348SMartin Matuska uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
348716fd348SMartin Matuska spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
349eda14cbcSMatt Macy zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
350eda14cbcSMatt Macy }
351eda14cbcSMatt Macy
352eda14cbcSMatt Macy uint64_t
spa_log_sm_nblocks(spa_t * spa)353eda14cbcSMatt Macy spa_log_sm_nblocks(spa_t *spa)
354eda14cbcSMatt Macy {
355eda14cbcSMatt Macy return (spa->spa_unflushed_stats.sus_nblocks);
356eda14cbcSMatt Macy }
357eda14cbcSMatt Macy
358eda14cbcSMatt Macy /*
359eda14cbcSMatt Macy * Ensure that the in-memory log space map structures and the summary
360eda14cbcSMatt Macy * have the same block and metaslab counts.
361eda14cbcSMatt Macy */
362eda14cbcSMatt Macy static void
spa_log_summary_verify_counts(spa_t * spa)363eda14cbcSMatt Macy spa_log_summary_verify_counts(spa_t *spa)
364eda14cbcSMatt Macy {
365eda14cbcSMatt Macy ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
366eda14cbcSMatt Macy
367eda14cbcSMatt Macy if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
368eda14cbcSMatt Macy return;
369eda14cbcSMatt Macy
370eda14cbcSMatt Macy uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
371eda14cbcSMatt Macy
372eda14cbcSMatt Macy uint64_t ms_in_summary = 0, blk_in_summary = 0;
373eda14cbcSMatt Macy for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
374eda14cbcSMatt Macy e; e = list_next(&spa->spa_log_summary, e)) {
375eda14cbcSMatt Macy ms_in_summary += e->lse_mscount;
376eda14cbcSMatt Macy blk_in_summary += e->lse_blkcount;
377eda14cbcSMatt Macy }
378eda14cbcSMatt Macy
379eda14cbcSMatt Macy uint64_t ms_in_logs = 0, blk_in_logs = 0;
380eda14cbcSMatt Macy for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
381eda14cbcSMatt Macy sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
382eda14cbcSMatt Macy ms_in_logs += sls->sls_mscount;
383eda14cbcSMatt Macy blk_in_logs += sls->sls_nblocks;
384eda14cbcSMatt Macy }
385eda14cbcSMatt Macy
386eda14cbcSMatt Macy VERIFY3U(ms_in_logs, ==, ms_in_summary);
387eda14cbcSMatt Macy VERIFY3U(ms_in_logs, ==, ms_in_avl);
388eda14cbcSMatt Macy VERIFY3U(blk_in_logs, ==, blk_in_summary);
389eda14cbcSMatt Macy VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
390eda14cbcSMatt Macy }
391eda14cbcSMatt Macy
392eda14cbcSMatt Macy static boolean_t
summary_entry_is_full(spa_t * spa,log_summary_entry_t * e,uint64_t txg)393716fd348SMartin Matuska summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
394eda14cbcSMatt Macy {
395716fd348SMartin Matuska if (e->lse_end == txg)
396716fd348SMartin Matuska return (0);
397716fd348SMartin Matuska if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
398716fd348SMartin Matuska zfs_max_logsm_summary_length))
399716fd348SMartin Matuska return (1);
400eda14cbcSMatt Macy uint64_t blocks_per_row = MAX(1,
401eda14cbcSMatt Macy DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
402eda14cbcSMatt Macy zfs_max_logsm_summary_length));
403eda14cbcSMatt Macy return (blocks_per_row <= e->lse_blkcount);
404eda14cbcSMatt Macy }
405eda14cbcSMatt Macy
406eda14cbcSMatt Macy /*
407eda14cbcSMatt Macy * Update the log summary information to reflect the fact that a metaslab
408eda14cbcSMatt Macy * was flushed or destroyed (e.g due to device removal or pool export/destroy).
409eda14cbcSMatt Macy *
410eda14cbcSMatt Macy * We typically flush the oldest flushed metaslab so the first (and oldest)
411eda14cbcSMatt Macy * entry of the summary is updated. However if that metaslab is getting loaded
412eda14cbcSMatt Macy * we may flush the second oldest one which may be part of an entry later in
413eda14cbcSMatt Macy * the summary. Moreover, if we call into this function from metaslab_fini()
414eda14cbcSMatt Macy * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
415eda14cbcSMatt Macy * for a txg as an argument so we can locate the appropriate summary entry for
416eda14cbcSMatt Macy * the metaslab.
417eda14cbcSMatt Macy */
418eda14cbcSMatt Macy void
spa_log_summary_decrement_mscount(spa_t * spa,uint64_t txg,boolean_t dirty)419716fd348SMartin Matuska spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
420eda14cbcSMatt Macy {
421eda14cbcSMatt Macy /*
422eda14cbcSMatt Macy * We don't track summary data for read-only pools and this function
423eda14cbcSMatt Macy * can be called from metaslab_fini(). In that case return immediately.
424eda14cbcSMatt Macy */
425eda14cbcSMatt Macy if (!spa_writeable(spa))
426eda14cbcSMatt Macy return;
427eda14cbcSMatt Macy
428eda14cbcSMatt Macy log_summary_entry_t *target = NULL;
429eda14cbcSMatt Macy for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
430eda14cbcSMatt Macy e != NULL; e = list_next(&spa->spa_log_summary, e)) {
431eda14cbcSMatt Macy if (e->lse_start > txg)
432eda14cbcSMatt Macy break;
433eda14cbcSMatt Macy target = e;
434eda14cbcSMatt Macy }
435eda14cbcSMatt Macy
436eda14cbcSMatt Macy if (target == NULL || target->lse_mscount == 0) {
437eda14cbcSMatt Macy /*
438eda14cbcSMatt Macy * We didn't find a summary entry for this metaslab. We must be
439eda14cbcSMatt Macy * at the teardown of a spa_load() attempt that got an error
440eda14cbcSMatt Macy * while reading the log space maps.
441eda14cbcSMatt Macy */
442eda14cbcSMatt Macy VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
443eda14cbcSMatt Macy return;
444eda14cbcSMatt Macy }
445eda14cbcSMatt Macy
446eda14cbcSMatt Macy target->lse_mscount--;
447716fd348SMartin Matuska if (dirty)
448716fd348SMartin Matuska target->lse_msdcount--;
449eda14cbcSMatt Macy }
450eda14cbcSMatt Macy
451eda14cbcSMatt Macy /*
452eda14cbcSMatt Macy * Update the log summary information to reflect the fact that we destroyed
453eda14cbcSMatt Macy * old log space maps. Since we can only destroy the oldest log space maps,
454eda14cbcSMatt Macy * we decrement the block count of the oldest summary entry and potentially
455eda14cbcSMatt Macy * destroy it when that count hits 0.
456eda14cbcSMatt Macy *
457eda14cbcSMatt Macy * This function is called after a metaslab is flushed and typically that
458eda14cbcSMatt Macy * metaslab is the oldest flushed, which means that this function will
459eda14cbcSMatt Macy * typically decrement the block count of the first entry of the summary and
460eda14cbcSMatt Macy * potentially free it if the block count gets to zero (its metaslab count
461eda14cbcSMatt Macy * should be zero too at that point).
462eda14cbcSMatt Macy *
463eda14cbcSMatt Macy * There are certain scenarios though that don't work exactly like that so we
464eda14cbcSMatt Macy * need to account for them:
465eda14cbcSMatt Macy *
466eda14cbcSMatt Macy * Scenario [1]: It is possible that after we flushed the oldest flushed
467eda14cbcSMatt Macy * metaslab and we destroyed the oldest log space map, more recent logs had 0
468eda14cbcSMatt Macy * metaslabs pointing to them so we got rid of them too. This can happen due
469eda14cbcSMatt Macy * to metaslabs being destroyed through device removal, or because the oldest
470eda14cbcSMatt Macy * flushed metaslab was loading but we kept flushing more recently flushed
471eda14cbcSMatt Macy * metaslabs due to the memory pressure of unflushed changes. Because of that,
472eda14cbcSMatt Macy * we always iterate from the beginning of the summary and if blocks_gone is
473eda14cbcSMatt Macy * bigger than the block_count of the current entry we free that entry (we
474eda14cbcSMatt Macy * expect its metaslab count to be zero), we decrement blocks_gone and on to
475eda14cbcSMatt Macy * the next entry repeating this procedure until blocks_gone gets decremented
476eda14cbcSMatt Macy * to 0. Doing this also works for the typical case mentioned above.
477eda14cbcSMatt Macy *
478eda14cbcSMatt Macy * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
479eda14cbcSMatt Macy * the first (and oldest) entry in the summary. If the first few entries of
480eda14cbcSMatt Macy * the summary were only accounting metaslabs from a device that was just
481eda14cbcSMatt Macy * removed, then the current oldest flushed metaslab could be accounted by an
482eda14cbcSMatt Macy * entry somewhere in the middle of the summary. Moreover flushing that
483eda14cbcSMatt Macy * metaslab will destroy all the log space maps older than its ms_unflushed_txg
484eda14cbcSMatt Macy * because they became obsolete after the removal. Thus, iterating as we did
485eda14cbcSMatt Macy * for scenario [1] works out for this case too.
486eda14cbcSMatt Macy *
487eda14cbcSMatt Macy * Scenario [3]: At times we decide to flush all the metaslabs in the pool
488eda14cbcSMatt Macy * in one TXG (either because we are exporting the pool or because our flushing
489eda14cbcSMatt Macy * heuristics decided to do so). When that happens all the log space maps get
490eda14cbcSMatt Macy * destroyed except the one created for the current TXG which doesn't have
491eda14cbcSMatt Macy * any log blocks yet. As log space maps get destroyed with every metaslab that
492eda14cbcSMatt Macy * we flush, entries in the summary are also destroyed. This brings a weird
493eda14cbcSMatt Macy * corner-case when we flush the last metaslab and the log space map of the
494eda14cbcSMatt Macy * current TXG is in the same summary entry with other log space maps that
495eda14cbcSMatt Macy * are older. When that happens we are eventually left with this one last
496eda14cbcSMatt Macy * summary entry whose blocks are gone (blocks_gone equals the entry's block
497eda14cbcSMatt Macy * count) but its metaslab count is non-zero (because it accounts all the
498eda14cbcSMatt Macy * metaslabs in the pool as they all got flushed). Under this scenario we can't
499eda14cbcSMatt Macy * free this last summary entry as it's referencing all the metaslabs in the
500eda14cbcSMatt Macy * pool and its block count will get incremented at the end of this sync (when
501eda14cbcSMatt Macy * we close the syncing log space map). Thus we just decrement its current
502eda14cbcSMatt Macy * block count and leave it alone. In the case that the pool gets exported,
503eda14cbcSMatt Macy * its metaslab count will be decremented over time as we call metaslab_fini()
504eda14cbcSMatt Macy * for all the metaslabs in the pool and the entry will be freed at
505eda14cbcSMatt Macy * spa_unload_log_sm_metadata().
506eda14cbcSMatt Macy */
507eda14cbcSMatt Macy void
spa_log_summary_decrement_blkcount(spa_t * spa,uint64_t blocks_gone)508eda14cbcSMatt Macy spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
509eda14cbcSMatt Macy {
510716fd348SMartin Matuska log_summary_entry_t *e = list_head(&spa->spa_log_summary);
511dbd5678dSMartin Matuska ASSERT3P(e, !=, NULL);
512716fd348SMartin Matuska if (e->lse_txgcount > 0)
513716fd348SMartin Matuska e->lse_txgcount--;
514716fd348SMartin Matuska for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
515eda14cbcSMatt Macy if (e->lse_blkcount > blocks_gone) {
516eda14cbcSMatt Macy e->lse_blkcount -= blocks_gone;
517eda14cbcSMatt Macy blocks_gone = 0;
518eda14cbcSMatt Macy break;
519eda14cbcSMatt Macy } else if (e->lse_mscount == 0) {
520eda14cbcSMatt Macy /* remove obsolete entry */
521eda14cbcSMatt Macy blocks_gone -= e->lse_blkcount;
522eda14cbcSMatt Macy list_remove(&spa->spa_log_summary, e);
523eda14cbcSMatt Macy kmem_free(e, sizeof (log_summary_entry_t));
524eda14cbcSMatt Macy } else {
525eda14cbcSMatt Macy /* Verify that this is scenario [3] mentioned above. */
526eda14cbcSMatt Macy VERIFY3U(blocks_gone, ==, e->lse_blkcount);
527eda14cbcSMatt Macy
528eda14cbcSMatt Macy /*
529eda14cbcSMatt Macy * Assert that this is scenario [3] further by ensuring
530eda14cbcSMatt Macy * that this is the only entry in the summary.
531eda14cbcSMatt Macy */
532eda14cbcSMatt Macy VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
533eda14cbcSMatt Macy ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
534eda14cbcSMatt Macy
535eda14cbcSMatt Macy blocks_gone = e->lse_blkcount = 0;
536eda14cbcSMatt Macy break;
537eda14cbcSMatt Macy }
538eda14cbcSMatt Macy }
539eda14cbcSMatt Macy
540eda14cbcSMatt Macy /*
541eda14cbcSMatt Macy * Ensure that there is no way we are trying to remove more blocks
542eda14cbcSMatt Macy * than the # of blocks in the summary.
543eda14cbcSMatt Macy */
544eda14cbcSMatt Macy ASSERT0(blocks_gone);
545eda14cbcSMatt Macy }
546eda14cbcSMatt Macy
547eda14cbcSMatt Macy void
spa_log_sm_decrement_mscount(spa_t * spa,uint64_t txg)548eda14cbcSMatt Macy spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
549eda14cbcSMatt Macy {
550eda14cbcSMatt Macy spa_log_sm_t target = { .sls_txg = txg };
551eda14cbcSMatt Macy spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
552eda14cbcSMatt Macy &target, NULL);
553eda14cbcSMatt Macy
554eda14cbcSMatt Macy if (sls == NULL) {
555eda14cbcSMatt Macy /*
556eda14cbcSMatt Macy * We must be at the teardown of a spa_load() attempt that
557eda14cbcSMatt Macy * got an error while reading the log space maps.
558eda14cbcSMatt Macy */
559eda14cbcSMatt Macy VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
560eda14cbcSMatt Macy return;
561eda14cbcSMatt Macy }
562eda14cbcSMatt Macy
563eda14cbcSMatt Macy ASSERT(sls->sls_mscount > 0);
564eda14cbcSMatt Macy sls->sls_mscount--;
565eda14cbcSMatt Macy }
566eda14cbcSMatt Macy
567eda14cbcSMatt Macy void
spa_log_sm_increment_current_mscount(spa_t * spa)568eda14cbcSMatt Macy spa_log_sm_increment_current_mscount(spa_t *spa)
569eda14cbcSMatt Macy {
570eda14cbcSMatt Macy spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
571eda14cbcSMatt Macy ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
572eda14cbcSMatt Macy last_sls->sls_mscount++;
573eda14cbcSMatt Macy }
574eda14cbcSMatt Macy
575eda14cbcSMatt Macy static void
summary_add_data(spa_t * spa,uint64_t txg,uint64_t metaslabs_flushed,uint64_t metaslabs_dirty,uint64_t nblocks)576eda14cbcSMatt Macy summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
577716fd348SMartin Matuska uint64_t metaslabs_dirty, uint64_t nblocks)
578eda14cbcSMatt Macy {
579eda14cbcSMatt Macy log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
580eda14cbcSMatt Macy
581716fd348SMartin Matuska if (e == NULL || summary_entry_is_full(spa, e, txg)) {
582eda14cbcSMatt Macy e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
583716fd348SMartin Matuska e->lse_start = e->lse_end = txg;
584716fd348SMartin Matuska e->lse_txgcount = 1;
585eda14cbcSMatt Macy list_insert_tail(&spa->spa_log_summary, e);
586eda14cbcSMatt Macy }
587eda14cbcSMatt Macy
588eda14cbcSMatt Macy ASSERT3U(e->lse_start, <=, txg);
589716fd348SMartin Matuska if (e->lse_end < txg) {
590716fd348SMartin Matuska e->lse_end = txg;
591716fd348SMartin Matuska e->lse_txgcount++;
592716fd348SMartin Matuska }
593eda14cbcSMatt Macy e->lse_mscount += metaslabs_flushed;
594716fd348SMartin Matuska e->lse_msdcount += metaslabs_dirty;
595eda14cbcSMatt Macy e->lse_blkcount += nblocks;
596eda14cbcSMatt Macy }
597eda14cbcSMatt Macy
598eda14cbcSMatt Macy static void
spa_log_summary_add_incoming_blocks(spa_t * spa,uint64_t nblocks)599eda14cbcSMatt Macy spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
600eda14cbcSMatt Macy {
601716fd348SMartin Matuska summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
602eda14cbcSMatt Macy }
603eda14cbcSMatt Macy
604eda14cbcSMatt Macy void
spa_log_summary_add_flushed_metaslab(spa_t * spa,boolean_t dirty)605716fd348SMartin Matuska spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
606eda14cbcSMatt Macy {
607716fd348SMartin Matuska summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
608716fd348SMartin Matuska }
609716fd348SMartin Matuska
610716fd348SMartin Matuska void
spa_log_summary_dirty_flushed_metaslab(spa_t * spa,uint64_t txg)611716fd348SMartin Matuska spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
612716fd348SMartin Matuska {
613716fd348SMartin Matuska log_summary_entry_t *target = NULL;
614716fd348SMartin Matuska for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
615716fd348SMartin Matuska e != NULL; e = list_next(&spa->spa_log_summary, e)) {
616716fd348SMartin Matuska if (e->lse_start > txg)
617716fd348SMartin Matuska break;
618716fd348SMartin Matuska target = e;
619716fd348SMartin Matuska }
620716fd348SMartin Matuska ASSERT3P(target, !=, NULL);
621716fd348SMartin Matuska ASSERT3U(target->lse_mscount, !=, 0);
622716fd348SMartin Matuska target->lse_msdcount++;
623eda14cbcSMatt Macy }
624eda14cbcSMatt Macy
625eda14cbcSMatt Macy /*
626eda14cbcSMatt Macy * This function attempts to estimate how many metaslabs should
627eda14cbcSMatt Macy * we flush to satisfy our block heuristic for the log spacemap
628eda14cbcSMatt Macy * for the upcoming TXGs.
629eda14cbcSMatt Macy *
630eda14cbcSMatt Macy * Specifically, it first tries to estimate the number of incoming
631eda14cbcSMatt Macy * blocks in this TXG. Then by projecting that incoming rate to
632eda14cbcSMatt Macy * future TXGs and using the log summary, it figures out how many
633eda14cbcSMatt Macy * flushes we would need to do for future TXGs individually to
634eda14cbcSMatt Macy * stay below our block limit and returns the maximum number of
635eda14cbcSMatt Macy * flushes from those estimates.
636eda14cbcSMatt Macy */
637eda14cbcSMatt Macy static uint64_t
spa_estimate_metaslabs_to_flush(spa_t * spa)638eda14cbcSMatt Macy spa_estimate_metaslabs_to_flush(spa_t *spa)
639eda14cbcSMatt Macy {
640eda14cbcSMatt Macy ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
641eda14cbcSMatt Macy ASSERT3U(spa_sync_pass(spa), ==, 1);
642eda14cbcSMatt Macy ASSERT(spa_log_sm_blocklimit(spa) != 0);
643eda14cbcSMatt Macy
644eda14cbcSMatt Macy /*
645eda14cbcSMatt Macy * This variable contains the incoming rate that will be projected
646eda14cbcSMatt Macy * and used for our flushing estimates in the future.
647eda14cbcSMatt Macy */
648eda14cbcSMatt Macy uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
649eda14cbcSMatt Macy
650eda14cbcSMatt Macy /*
651eda14cbcSMatt Macy * At any point in time this variable tells us how many
652eda14cbcSMatt Macy * TXGs in the future we are so we can make our estimations.
653eda14cbcSMatt Macy */
654eda14cbcSMatt Macy uint64_t txgs_in_future = 1;
655eda14cbcSMatt Macy
656eda14cbcSMatt Macy /*
657eda14cbcSMatt Macy * This variable tells us how much room do we have until we hit
658eda14cbcSMatt Macy * our limit. When it goes negative, it means that we've exceeded
659eda14cbcSMatt Macy * our limit and we need to flush.
660eda14cbcSMatt Macy *
661eda14cbcSMatt Macy * Note that since we start at the first TXG in the future (i.e.
662eda14cbcSMatt Macy * txgs_in_future starts from 1) we already decrement this
663eda14cbcSMatt Macy * variable by the incoming rate.
664eda14cbcSMatt Macy */
665eda14cbcSMatt Macy int64_t available_blocks =
666eda14cbcSMatt Macy spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
667eda14cbcSMatt Macy
668716fd348SMartin Matuska int64_t available_txgs = zfs_unflushed_log_txg_max;
669716fd348SMartin Matuska for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
670716fd348SMartin Matuska e; e = list_next(&spa->spa_log_summary, e))
671716fd348SMartin Matuska available_txgs -= e->lse_txgcount;
672716fd348SMartin Matuska
673eda14cbcSMatt Macy /*
674eda14cbcSMatt Macy * This variable tells us the total number of flushes needed to
675eda14cbcSMatt Macy * keep the log size within the limit when we reach txgs_in_future.
676eda14cbcSMatt Macy */
677eda14cbcSMatt Macy uint64_t total_flushes = 0;
678eda14cbcSMatt Macy
679eda14cbcSMatt Macy /* Holds the current maximum of our estimates so far. */
680716fd348SMartin Matuska uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
681eda14cbcSMatt Macy
682eda14cbcSMatt Macy /*
683eda14cbcSMatt Macy * For our estimations we only look as far in the future
684eda14cbcSMatt Macy * as the summary allows us.
685eda14cbcSMatt Macy */
686eda14cbcSMatt Macy for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
687eda14cbcSMatt Macy e; e = list_next(&spa->spa_log_summary, e)) {
688eda14cbcSMatt Macy
689eda14cbcSMatt Macy /*
690eda14cbcSMatt Macy * If there is still room before we exceed our limit
691eda14cbcSMatt Macy * then keep skipping TXGs accumulating more blocks
692eda14cbcSMatt Macy * based on the incoming rate until we exceed it.
693eda14cbcSMatt Macy */
694716fd348SMartin Matuska if (available_blocks >= 0 && available_txgs >= 0) {
695dbd5678dSMartin Matuska uint64_t skip_txgs = (incoming == 0) ?
696dbd5678dSMartin Matuska available_txgs + 1 : MIN(available_txgs + 1,
697716fd348SMartin Matuska (available_blocks / incoming) + 1);
698eda14cbcSMatt Macy available_blocks -= (skip_txgs * incoming);
699716fd348SMartin Matuska available_txgs -= skip_txgs;
700eda14cbcSMatt Macy txgs_in_future += skip_txgs;
701eda14cbcSMatt Macy ASSERT3S(available_blocks, >=, -incoming);
702716fd348SMartin Matuska ASSERT3S(available_txgs, >=, -1);
703eda14cbcSMatt Macy }
704eda14cbcSMatt Macy
705eda14cbcSMatt Macy /*
706eda14cbcSMatt Macy * At this point we're far enough into the future where
707eda14cbcSMatt Macy * the limit was just exceeded and we flush metaslabs
708eda14cbcSMatt Macy * based on the current entry in the summary, updating
709eda14cbcSMatt Macy * our available_blocks.
710eda14cbcSMatt Macy */
711716fd348SMartin Matuska ASSERT(available_blocks < 0 || available_txgs < 0);
712eda14cbcSMatt Macy available_blocks += e->lse_blkcount;
713716fd348SMartin Matuska available_txgs += e->lse_txgcount;
714716fd348SMartin Matuska total_flushes += e->lse_msdcount;
715eda14cbcSMatt Macy
716eda14cbcSMatt Macy /*
717eda14cbcSMatt Macy * Keep the running maximum of the total_flushes that
718eda14cbcSMatt Macy * we've done so far over the number of TXGs in the
719eda14cbcSMatt Macy * future that we are. The idea here is to estimate
720eda14cbcSMatt Macy * the average number of flushes that we should do
721eda14cbcSMatt Macy * every TXG so that when we are that many TXGs in the
722eda14cbcSMatt Macy * future we stay under the limit.
723eda14cbcSMatt Macy */
724eda14cbcSMatt Macy max_flushes_pertxg = MAX(max_flushes_pertxg,
725eda14cbcSMatt Macy DIV_ROUND_UP(total_flushes, txgs_in_future));
726eda14cbcSMatt Macy }
727eda14cbcSMatt Macy return (max_flushes_pertxg);
728eda14cbcSMatt Macy }
729eda14cbcSMatt Macy
730eda14cbcSMatt Macy uint64_t
spa_log_sm_memused(spa_t * spa)731eda14cbcSMatt Macy spa_log_sm_memused(spa_t *spa)
732eda14cbcSMatt Macy {
733eda14cbcSMatt Macy return (spa->spa_unflushed_stats.sus_memused);
734eda14cbcSMatt Macy }
735eda14cbcSMatt Macy
736eda14cbcSMatt Macy static boolean_t
spa_log_exceeds_memlimit(spa_t * spa)737eda14cbcSMatt Macy spa_log_exceeds_memlimit(spa_t *spa)
738eda14cbcSMatt Macy {
739eda14cbcSMatt Macy if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
740eda14cbcSMatt Macy return (B_TRUE);
741eda14cbcSMatt Macy
742eda14cbcSMatt Macy uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
743eda14cbcSMatt Macy zfs_unflushed_max_mem_ppm) / 1000000;
744eda14cbcSMatt Macy if (spa_log_sm_memused(spa) > system_mem_allowed)
745eda14cbcSMatt Macy return (B_TRUE);
746eda14cbcSMatt Macy
747eda14cbcSMatt Macy return (B_FALSE);
748eda14cbcSMatt Macy }
749eda14cbcSMatt Macy
750eda14cbcSMatt Macy boolean_t
spa_flush_all_logs_requested(spa_t * spa)751eda14cbcSMatt Macy spa_flush_all_logs_requested(spa_t *spa)
752eda14cbcSMatt Macy {
753eda14cbcSMatt Macy return (spa->spa_log_flushall_txg != 0);
754eda14cbcSMatt Macy }
755eda14cbcSMatt Macy
756eda14cbcSMatt Macy void
spa_flush_metaslabs(spa_t * spa,dmu_tx_t * tx)757eda14cbcSMatt Macy spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
758eda14cbcSMatt Macy {
759eda14cbcSMatt Macy uint64_t txg = dmu_tx_get_txg(tx);
760eda14cbcSMatt Macy
761eda14cbcSMatt Macy if (spa_sync_pass(spa) != 1)
762eda14cbcSMatt Macy return;
763eda14cbcSMatt Macy
764eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
765eda14cbcSMatt Macy return;
766eda14cbcSMatt Macy
767eda14cbcSMatt Macy /*
768eda14cbcSMatt Macy * If we don't have any metaslabs with unflushed changes
769eda14cbcSMatt Macy * return immediately.
770eda14cbcSMatt Macy */
771eda14cbcSMatt Macy if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
772eda14cbcSMatt Macy return;
773eda14cbcSMatt Macy
774eda14cbcSMatt Macy /*
775eda14cbcSMatt Macy * During SPA export we leave a few empty TXGs to go by [see
776eda14cbcSMatt Macy * spa_final_dirty_txg() to understand why]. For this specific
777eda14cbcSMatt Macy * case, it is important to not flush any metaslabs as that
778eda14cbcSMatt Macy * would dirty this TXG.
779eda14cbcSMatt Macy *
780eda14cbcSMatt Macy * That said, during one of these dirty TXGs that is less or
781eda14cbcSMatt Macy * equal to spa_final_dirty(), spa_unload() will request that
782eda14cbcSMatt Macy * we try to flush all the metaslabs for that TXG before
783eda14cbcSMatt Macy * exporting the pool, thus we ensure that we didn't get a
784eda14cbcSMatt Macy * request of flushing everything before we attempt to return
785eda14cbcSMatt Macy * immediately.
786eda14cbcSMatt Macy */
787783d3ff6SMartin Matuska if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
788eda14cbcSMatt Macy !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
789eda14cbcSMatt Macy !spa_flush_all_logs_requested(spa))
790eda14cbcSMatt Macy return;
791eda14cbcSMatt Macy
792eda14cbcSMatt Macy /*
793eda14cbcSMatt Macy * We need to generate a log space map before flushing because this
794eda14cbcSMatt Macy * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
795eda14cbcSMatt Macy * for this TXG's flushed metaslab count (aka sls_mscount which is
796eda14cbcSMatt Macy * manipulated in many ways down the metaslab_flush() codepath).
797eda14cbcSMatt Macy *
798eda14cbcSMatt Macy * That is not to say that we may generate a log space map when we
799eda14cbcSMatt Macy * don't need it. If we are flushing metaslabs, that means that we
800eda14cbcSMatt Macy * were going to write changes to disk anyway, so even if we were
801eda14cbcSMatt Macy * not flushing, a log space map would have been created anyway in
802eda14cbcSMatt Macy * metaslab_sync().
803eda14cbcSMatt Macy */
804eda14cbcSMatt Macy spa_generate_syncing_log_sm(spa, tx);
805eda14cbcSMatt Macy
806eda14cbcSMatt Macy /*
807eda14cbcSMatt Macy * This variable tells us how many metaslabs we want to flush based
808eda14cbcSMatt Macy * on the block-heuristic of our flushing algorithm (see block comment
809eda14cbcSMatt Macy * of log space map feature). We also decrement this as we flush
810eda14cbcSMatt Macy * metaslabs and attempt to destroy old log space maps.
811eda14cbcSMatt Macy */
812eda14cbcSMatt Macy uint64_t want_to_flush;
813eda14cbcSMatt Macy if (spa_flush_all_logs_requested(spa)) {
814eda14cbcSMatt Macy ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
815716fd348SMartin Matuska want_to_flush = UINT64_MAX;
816eda14cbcSMatt Macy } else {
817eda14cbcSMatt Macy want_to_flush = spa_estimate_metaslabs_to_flush(spa);
818eda14cbcSMatt Macy }
819eda14cbcSMatt Macy
820eda14cbcSMatt Macy /* Used purely for verification purposes */
821eda14cbcSMatt Macy uint64_t visited = 0;
822eda14cbcSMatt Macy
823eda14cbcSMatt Macy /*
824eda14cbcSMatt Macy * Ideally we would only iterate through spa_metaslabs_by_flushed
825eda14cbcSMatt Macy * using only one variable (curr). We can't do that because
826eda14cbcSMatt Macy * metaslab_flush() mutates position of curr in the AVL when
827eda14cbcSMatt Macy * it flushes that metaslab by moving it to the end of the tree.
828eda14cbcSMatt Macy * Thus we always keep track of the original next node of the
829eda14cbcSMatt Macy * current node (curr) in another variable (next).
830eda14cbcSMatt Macy */
831eda14cbcSMatt Macy metaslab_t *next = NULL;
832eda14cbcSMatt Macy for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
833eda14cbcSMatt Macy curr != NULL; curr = next) {
834eda14cbcSMatt Macy next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
835eda14cbcSMatt Macy
836eda14cbcSMatt Macy /*
837eda14cbcSMatt Macy * If this metaslab has been flushed this txg then we've done
838eda14cbcSMatt Macy * a full circle over the metaslabs.
839eda14cbcSMatt Macy */
840eda14cbcSMatt Macy if (metaslab_unflushed_txg(curr) == txg)
841eda14cbcSMatt Macy break;
842eda14cbcSMatt Macy
843eda14cbcSMatt Macy /*
844eda14cbcSMatt Macy * If we are done flushing for the block heuristic and the
845eda14cbcSMatt Macy * unflushed changes don't exceed the memory limit just stop.
846eda14cbcSMatt Macy */
847eda14cbcSMatt Macy if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
848eda14cbcSMatt Macy break;
849eda14cbcSMatt Macy
850716fd348SMartin Matuska if (metaslab_unflushed_dirty(curr)) {
851eda14cbcSMatt Macy mutex_enter(&curr->ms_sync_lock);
852eda14cbcSMatt Macy mutex_enter(&curr->ms_lock);
853716fd348SMartin Matuska metaslab_flush(curr, tx);
854eda14cbcSMatt Macy mutex_exit(&curr->ms_lock);
855eda14cbcSMatt Macy mutex_exit(&curr->ms_sync_lock);
856716fd348SMartin Matuska if (want_to_flush > 0)
857eda14cbcSMatt Macy want_to_flush--;
858716fd348SMartin Matuska } else
859716fd348SMartin Matuska metaslab_unflushed_bump(curr, tx, B_FALSE);
860eda14cbcSMatt Macy
861eda14cbcSMatt Macy visited++;
862eda14cbcSMatt Macy }
863eda14cbcSMatt Macy ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
864716fd348SMartin Matuska
865716fd348SMartin Matuska spa_log_sm_set_blocklimit(spa);
866eda14cbcSMatt Macy }
867eda14cbcSMatt Macy
868eda14cbcSMatt Macy /*
869eda14cbcSMatt Macy * Close the log space map for this TXG and update the block counts
870eda14cbcSMatt Macy * for the log's in-memory structure and the summary.
871eda14cbcSMatt Macy */
872eda14cbcSMatt Macy void
spa_sync_close_syncing_log_sm(spa_t * spa)873eda14cbcSMatt Macy spa_sync_close_syncing_log_sm(spa_t *spa)
874eda14cbcSMatt Macy {
875eda14cbcSMatt Macy if (spa_syncing_log_sm(spa) == NULL)
876eda14cbcSMatt Macy return;
877eda14cbcSMatt Macy ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
878eda14cbcSMatt Macy
879eda14cbcSMatt Macy spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
880eda14cbcSMatt Macy ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
881eda14cbcSMatt Macy
882eda14cbcSMatt Macy sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
883eda14cbcSMatt Macy spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
884eda14cbcSMatt Macy
885eda14cbcSMatt Macy /*
886eda14cbcSMatt Macy * Note that we can't assert that sls_mscount is not 0,
887eda14cbcSMatt Macy * because there is the case where the first metaslab
888eda14cbcSMatt Macy * in spa_metaslabs_by_flushed is loading and we were
889eda14cbcSMatt Macy * not able to flush any metaslabs the current TXG.
890eda14cbcSMatt Macy */
891eda14cbcSMatt Macy ASSERT(sls->sls_nblocks != 0);
892eda14cbcSMatt Macy
893eda14cbcSMatt Macy spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
894eda14cbcSMatt Macy spa_log_summary_verify_counts(spa);
895eda14cbcSMatt Macy
896eda14cbcSMatt Macy space_map_close(spa->spa_syncing_log_sm);
897eda14cbcSMatt Macy spa->spa_syncing_log_sm = NULL;
898eda14cbcSMatt Macy
899eda14cbcSMatt Macy /*
900eda14cbcSMatt Macy * At this point we tried to flush as many metaslabs as we
901eda14cbcSMatt Macy * can as the pool is getting exported. Reset the "flush all"
902eda14cbcSMatt Macy * so the last few TXGs before closing the pool can be empty
903eda14cbcSMatt Macy * (e.g. not dirty).
904eda14cbcSMatt Macy */
905eda14cbcSMatt Macy if (spa_flush_all_logs_requested(spa)) {
906eda14cbcSMatt Macy ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
907eda14cbcSMatt Macy spa->spa_log_flushall_txg = 0;
908eda14cbcSMatt Macy }
909eda14cbcSMatt Macy }
910eda14cbcSMatt Macy
911eda14cbcSMatt Macy void
spa_cleanup_old_sm_logs(spa_t * spa,dmu_tx_t * tx)912eda14cbcSMatt Macy spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
913eda14cbcSMatt Macy {
914eda14cbcSMatt Macy objset_t *mos = spa_meta_objset(spa);
915eda14cbcSMatt Macy
916eda14cbcSMatt Macy uint64_t spacemap_zap;
917eda14cbcSMatt Macy int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
918eda14cbcSMatt Macy DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
919eda14cbcSMatt Macy if (error == ENOENT) {
920eda14cbcSMatt Macy ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
921eda14cbcSMatt Macy return;
922eda14cbcSMatt Macy }
923eda14cbcSMatt Macy VERIFY0(error);
924eda14cbcSMatt Macy
925eda14cbcSMatt Macy metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
926eda14cbcSMatt Macy uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
927eda14cbcSMatt Macy
928eda14cbcSMatt Macy /* Free all log space maps older than the oldest_flushed_txg. */
929eda14cbcSMatt Macy for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
930eda14cbcSMatt Macy sls && sls->sls_txg < oldest_flushed_txg;
931eda14cbcSMatt Macy sls = avl_first(&spa->spa_sm_logs_by_txg)) {
932eda14cbcSMatt Macy ASSERT0(sls->sls_mscount);
933eda14cbcSMatt Macy avl_remove(&spa->spa_sm_logs_by_txg, sls);
934eda14cbcSMatt Macy space_map_free_obj(mos, sls->sls_sm_obj, tx);
935eda14cbcSMatt Macy VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
936716fd348SMartin Matuska spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
937eda14cbcSMatt Macy spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
938eda14cbcSMatt Macy kmem_free(sls, sizeof (spa_log_sm_t));
939eda14cbcSMatt Macy }
940eda14cbcSMatt Macy }
941eda14cbcSMatt Macy
942eda14cbcSMatt Macy static spa_log_sm_t *
spa_log_sm_alloc(uint64_t sm_obj,uint64_t txg)943eda14cbcSMatt Macy spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
944eda14cbcSMatt Macy {
945eda14cbcSMatt Macy spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
946eda14cbcSMatt Macy sls->sls_sm_obj = sm_obj;
947eda14cbcSMatt Macy sls->sls_txg = txg;
948eda14cbcSMatt Macy return (sls);
949eda14cbcSMatt Macy }
950eda14cbcSMatt Macy
951eda14cbcSMatt Macy void
spa_generate_syncing_log_sm(spa_t * spa,dmu_tx_t * tx)952eda14cbcSMatt Macy spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
953eda14cbcSMatt Macy {
954eda14cbcSMatt Macy uint64_t txg = dmu_tx_get_txg(tx);
955eda14cbcSMatt Macy objset_t *mos = spa_meta_objset(spa);
956eda14cbcSMatt Macy
957eda14cbcSMatt Macy if (spa_syncing_log_sm(spa) != NULL)
958eda14cbcSMatt Macy return;
959eda14cbcSMatt Macy
960eda14cbcSMatt Macy if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
961eda14cbcSMatt Macy return;
962eda14cbcSMatt Macy
963eda14cbcSMatt Macy uint64_t spacemap_zap;
964eda14cbcSMatt Macy int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
965eda14cbcSMatt Macy DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
966eda14cbcSMatt Macy if (error == ENOENT) {
967eda14cbcSMatt Macy ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
968eda14cbcSMatt Macy
969eda14cbcSMatt Macy error = 0;
970eda14cbcSMatt Macy spacemap_zap = zap_create(mos,
971eda14cbcSMatt Macy DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
972eda14cbcSMatt Macy VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
973eda14cbcSMatt Macy DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
974eda14cbcSMatt Macy &spacemap_zap, tx));
975eda14cbcSMatt Macy spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
976eda14cbcSMatt Macy }
977eda14cbcSMatt Macy VERIFY0(error);
978eda14cbcSMatt Macy
979eda14cbcSMatt Macy uint64_t sm_obj;
980eda14cbcSMatt Macy ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
981eda14cbcSMatt Macy ==, ENOENT);
982eda14cbcSMatt Macy sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
983eda14cbcSMatt Macy VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
984eda14cbcSMatt Macy avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
985eda14cbcSMatt Macy
986eda14cbcSMatt Macy /*
987eda14cbcSMatt Macy * We pass UINT64_MAX as the space map's representation size
988eda14cbcSMatt Macy * and SPA_MINBLOCKSHIFT as the shift, to make the space map
989eda14cbcSMatt Macy * accept any sorts of segments since there's no real advantage
990eda14cbcSMatt Macy * to being more restrictive (given that we're already going
991eda14cbcSMatt Macy * to be using 2-word entries).
992eda14cbcSMatt Macy */
993eda14cbcSMatt Macy VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
994eda14cbcSMatt Macy 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
995eda14cbcSMatt Macy
996eda14cbcSMatt Macy spa_log_sm_set_blocklimit(spa);
997eda14cbcSMatt Macy }
998eda14cbcSMatt Macy
999eda14cbcSMatt Macy /*
1000eda14cbcSMatt Macy * Find all the log space maps stored in the space map ZAP and sort
1001eda14cbcSMatt Macy * them by their TXG in spa_sm_logs_by_txg.
1002eda14cbcSMatt Macy */
1003eda14cbcSMatt Macy static int
spa_ld_log_sm_metadata(spa_t * spa)1004eda14cbcSMatt Macy spa_ld_log_sm_metadata(spa_t *spa)
1005eda14cbcSMatt Macy {
1006eda14cbcSMatt Macy int error;
1007eda14cbcSMatt Macy uint64_t spacemap_zap;
1008eda14cbcSMatt Macy
1009eda14cbcSMatt Macy ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
1010eda14cbcSMatt Macy
1011eda14cbcSMatt Macy error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
1012eda14cbcSMatt Macy DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
1013eda14cbcSMatt Macy if (error == ENOENT) {
1014eda14cbcSMatt Macy /* the space map ZAP doesn't exist yet */
1015eda14cbcSMatt Macy return (0);
1016eda14cbcSMatt Macy } else if (error != 0) {
1017eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1018eda14cbcSMatt Macy "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
1019eda14cbcSMatt Macy error);
1020eda14cbcSMatt Macy return (error);
1021eda14cbcSMatt Macy }
1022eda14cbcSMatt Macy
1023eda14cbcSMatt Macy zap_cursor_t zc;
10247a7741afSMartin Matuska zap_attribute_t *za = zap_attribute_alloc();
1025eda14cbcSMatt Macy for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
10267a7741afSMartin Matuska (error = zap_cursor_retrieve(&zc, za)) == 0;
1027eda14cbcSMatt Macy zap_cursor_advance(&zc)) {
10287a7741afSMartin Matuska uint64_t log_txg = zfs_strtonum(za->za_name, NULL);
1029eda14cbcSMatt Macy spa_log_sm_t *sls =
10307a7741afSMartin Matuska spa_log_sm_alloc(za->za_first_integer, log_txg);
1031eda14cbcSMatt Macy avl_add(&spa->spa_sm_logs_by_txg, sls);
1032eda14cbcSMatt Macy }
1033eda14cbcSMatt Macy zap_cursor_fini(&zc);
10347a7741afSMartin Matuska zap_attribute_free(za);
1035eda14cbcSMatt Macy if (error != ENOENT) {
1036eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1037eda14cbcSMatt Macy "zap_cursor_retrieve(spacemap_zap) [error %d]",
1038eda14cbcSMatt Macy error);
1039eda14cbcSMatt Macy return (error);
1040eda14cbcSMatt Macy }
1041eda14cbcSMatt Macy
1042eda14cbcSMatt Macy for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1043eda14cbcSMatt Macy m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1044eda14cbcSMatt Macy spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1045eda14cbcSMatt Macy spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1046eda14cbcSMatt Macy &target, NULL);
1047eda14cbcSMatt Macy
1048eda14cbcSMatt Macy /*
1049eda14cbcSMatt Macy * At this point if sls is zero it means that a bug occurred
1050eda14cbcSMatt Macy * in ZFS the last time the pool was open or earlier in the
1051eda14cbcSMatt Macy * import code path. In general, we would have placed a
1052eda14cbcSMatt Macy * VERIFY() here or in this case just let the kernel panic
1053eda14cbcSMatt Macy * with NULL pointer dereference when incrementing sls_mscount,
1054eda14cbcSMatt Macy * but since this is the import code path we can be a bit more
1055eda14cbcSMatt Macy * lenient. Thus, for DEBUG bits we always cause a panic, while
1056eda14cbcSMatt Macy * in production we log the error and just fail the import.
1057eda14cbcSMatt Macy */
1058eda14cbcSMatt Macy ASSERT(sls != NULL);
1059eda14cbcSMatt Macy if (sls == NULL) {
1060eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1061eda14cbcSMatt Macy "encountered: could not find log spacemap for "
10621f88aa09SMartin Matuska "TXG %llu [error %d]",
10631f88aa09SMartin Matuska (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
1064eda14cbcSMatt Macy return (ENOENT);
1065eda14cbcSMatt Macy }
1066eda14cbcSMatt Macy sls->sls_mscount++;
1067eda14cbcSMatt Macy }
1068eda14cbcSMatt Macy
1069eda14cbcSMatt Macy return (0);
1070eda14cbcSMatt Macy }
1071eda14cbcSMatt Macy
1072eda14cbcSMatt Macy typedef struct spa_ld_log_sm_arg {
1073eda14cbcSMatt Macy spa_t *slls_spa;
1074eda14cbcSMatt Macy uint64_t slls_txg;
1075eda14cbcSMatt Macy } spa_ld_log_sm_arg_t;
1076eda14cbcSMatt Macy
1077eda14cbcSMatt Macy static int
spa_ld_log_sm_cb(space_map_entry_t * sme,void * arg)1078eda14cbcSMatt Macy spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1079eda14cbcSMatt Macy {
1080eda14cbcSMatt Macy uint64_t offset = sme->sme_offset;
1081eda14cbcSMatt Macy uint64_t size = sme->sme_run;
1082eda14cbcSMatt Macy uint32_t vdev_id = sme->sme_vdev;
1083eda14cbcSMatt Macy
1084eda14cbcSMatt Macy spa_ld_log_sm_arg_t *slls = arg;
1085eda14cbcSMatt Macy spa_t *spa = slls->slls_spa;
1086eda14cbcSMatt Macy
1087eda14cbcSMatt Macy vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1088eda14cbcSMatt Macy
1089eda14cbcSMatt Macy /*
1090eda14cbcSMatt Macy * If the vdev has been removed (i.e. it is indirect or a hole)
1091eda14cbcSMatt Macy * skip this entry. The contents of this vdev have already moved
1092eda14cbcSMatt Macy * elsewhere.
1093eda14cbcSMatt Macy */
1094eda14cbcSMatt Macy if (!vdev_is_concrete(vd))
1095eda14cbcSMatt Macy return (0);
1096eda14cbcSMatt Macy
1097eda14cbcSMatt Macy metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1098eda14cbcSMatt Macy ASSERT(!ms->ms_loaded);
1099eda14cbcSMatt Macy
1100eda14cbcSMatt Macy /*
1101eda14cbcSMatt Macy * If we have already flushed entries for this TXG to this
1102eda14cbcSMatt Macy * metaslab's space map, then ignore it. Note that we flush
1103eda14cbcSMatt Macy * before processing any allocations/frees for that TXG, so
1104eda14cbcSMatt Macy * the metaslab's space map only has entries from *before*
1105eda14cbcSMatt Macy * the unflushed TXG.
1106eda14cbcSMatt Macy */
1107eda14cbcSMatt Macy if (slls->slls_txg < metaslab_unflushed_txg(ms))
1108eda14cbcSMatt Macy return (0);
1109eda14cbcSMatt Macy
1110eda14cbcSMatt Macy switch (sme->sme_type) {
1111eda14cbcSMatt Macy case SM_ALLOC:
1112b59a0cdeSMartin Matuska zfs_range_tree_remove_xor_add_segment(offset, offset + size,
1113eda14cbcSMatt Macy ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1114eda14cbcSMatt Macy break;
1115eda14cbcSMatt Macy case SM_FREE:
1116b59a0cdeSMartin Matuska zfs_range_tree_remove_xor_add_segment(offset, offset + size,
1117eda14cbcSMatt Macy ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1118eda14cbcSMatt Macy break;
1119eda14cbcSMatt Macy default:
1120eda14cbcSMatt Macy panic("invalid maptype_t");
1121eda14cbcSMatt Macy break;
1122eda14cbcSMatt Macy }
1123716fd348SMartin Matuska if (!metaslab_unflushed_dirty(ms)) {
1124716fd348SMartin Matuska metaslab_set_unflushed_dirty(ms, B_TRUE);
1125716fd348SMartin Matuska spa_log_summary_dirty_flushed_metaslab(spa,
1126716fd348SMartin Matuska metaslab_unflushed_txg(ms));
1127716fd348SMartin Matuska }
1128eda14cbcSMatt Macy return (0);
1129eda14cbcSMatt Macy }
1130eda14cbcSMatt Macy
1131eda14cbcSMatt Macy static int
spa_ld_log_sm_data(spa_t * spa)1132eda14cbcSMatt Macy spa_ld_log_sm_data(spa_t *spa)
1133eda14cbcSMatt Macy {
1134716fd348SMartin Matuska spa_log_sm_t *sls, *psls;
1135eda14cbcSMatt Macy int error = 0;
1136eda14cbcSMatt Macy
1137eda14cbcSMatt Macy /*
1138eda14cbcSMatt Macy * If we are not going to do any writes there is no need
1139eda14cbcSMatt Macy * to read the log space maps.
1140eda14cbcSMatt Macy */
1141eda14cbcSMatt Macy if (!spa_writeable(spa))
1142eda14cbcSMatt Macy return (0);
1143eda14cbcSMatt Macy
1144eda14cbcSMatt Macy ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1145eda14cbcSMatt Macy ASSERT0(spa->spa_unflushed_stats.sus_memused);
1146eda14cbcSMatt Macy
1147eda14cbcSMatt Macy hrtime_t read_logs_starttime = gethrtime();
1148716fd348SMartin Matuska
1149716fd348SMartin Matuska /* Prefetch log spacemaps dnodes. */
1150716fd348SMartin Matuska for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
1151716fd348SMartin Matuska sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1152315ee00fSMartin Matuska dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
1153315ee00fSMartin Matuska ZIO_PRIORITY_SYNC_READ);
1154716fd348SMartin Matuska }
1155716fd348SMartin Matuska
1156716fd348SMartin Matuska uint_t pn = 0;
1157716fd348SMartin Matuska uint64_t ps = 0;
11583494f7c0SMartin Matuska uint64_t nsm = 0;
1159716fd348SMartin Matuska psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
1160716fd348SMartin Matuska while (sls != NULL) {
1161716fd348SMartin Matuska /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
1162716fd348SMartin Matuska if (psls != NULL && pn < 16 &&
1163716fd348SMartin Matuska (pn < 2 || ps < 2 * dmu_prefetch_max)) {
1164716fd348SMartin Matuska error = space_map_open(&psls->sls_sm,
1165716fd348SMartin Matuska spa_meta_objset(spa), psls->sls_sm_obj, 0,
1166716fd348SMartin Matuska UINT64_MAX, SPA_MINBLOCKSHIFT);
1167eda14cbcSMatt Macy if (error != 0) {
1168716fd348SMartin Matuska spa_load_failed(spa, "spa_ld_log_sm_data(): "
1169716fd348SMartin Matuska "failed at space_map_open(obj=%llu) "
1170716fd348SMartin Matuska "[error %d]",
1171eda14cbcSMatt Macy (u_longlong_t)sls->sls_sm_obj, error);
1172eda14cbcSMatt Macy goto out;
1173eda14cbcSMatt Macy }
1174716fd348SMartin Matuska dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
1175716fd348SMartin Matuska 0, 0, space_map_length(psls->sls_sm),
1176716fd348SMartin Matuska ZIO_PRIORITY_ASYNC_READ);
1177716fd348SMartin Matuska pn++;
1178716fd348SMartin Matuska ps += space_map_length(psls->sls_sm);
1179716fd348SMartin Matuska psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
1180716fd348SMartin Matuska continue;
1181716fd348SMartin Matuska }
1182716fd348SMartin Matuska
1183716fd348SMartin Matuska /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
1184c7046f76SMartin Matuska kpreempt(KPREEMPT_SYNC);
1185716fd348SMartin Matuska ASSERT0(sls->sls_nblocks);
1186716fd348SMartin Matuska sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
1187716fd348SMartin Matuska spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1188716fd348SMartin Matuska summary_add_data(spa, sls->sls_txg,
1189716fd348SMartin Matuska sls->sls_mscount, 0, sls->sls_nblocks);
1190eda14cbcSMatt Macy
11913494f7c0SMartin Matuska spa_import_progress_set_notes_nolog(spa,
11923494f7c0SMartin Matuska "Read %llu of %lu log space maps", (u_longlong_t)nsm,
11933494f7c0SMartin Matuska avl_numnodes(&spa->spa_sm_logs_by_txg));
11943494f7c0SMartin Matuska
1195eda14cbcSMatt Macy struct spa_ld_log_sm_arg vla = {
1196eda14cbcSMatt Macy .slls_spa = spa,
1197eda14cbcSMatt Macy .slls_txg = sls->sls_txg
1198eda14cbcSMatt Macy };
1199716fd348SMartin Matuska error = space_map_iterate(sls->sls_sm,
1200716fd348SMartin Matuska space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
1201eda14cbcSMatt Macy if (error != 0) {
1202eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1203eda14cbcSMatt Macy "at space_map_iterate(obj=%llu) [error %d]",
1204eda14cbcSMatt Macy (u_longlong_t)sls->sls_sm_obj, error);
1205eda14cbcSMatt Macy goto out;
1206eda14cbcSMatt Macy }
1207eda14cbcSMatt Macy
1208716fd348SMartin Matuska pn--;
1209716fd348SMartin Matuska ps -= space_map_length(sls->sls_sm);
12103494f7c0SMartin Matuska nsm++;
1211716fd348SMartin Matuska space_map_close(sls->sls_sm);
1212716fd348SMartin Matuska sls->sls_sm = NULL;
1213716fd348SMartin Matuska sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
1214eda14cbcSMatt Macy
1215716fd348SMartin Matuska /* Update log block limits considering just loaded. */
1216716fd348SMartin Matuska spa_log_sm_set_blocklimit(spa);
1217eda14cbcSMatt Macy }
1218716fd348SMartin Matuska
1219eda14cbcSMatt Macy hrtime_t read_logs_endtime = gethrtime();
1220eda14cbcSMatt Macy spa_load_note(spa,
12213494f7c0SMartin Matuska "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
12223494f7c0SMartin Matuska "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
1223eda14cbcSMatt Macy (u_longlong_t)spa_log_sm_nblocks(spa),
1224eda14cbcSMatt Macy (u_longlong_t)zfs_log_sm_blksz,
12253494f7c0SMartin Matuska (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
1226eda14cbcSMatt Macy
1227eda14cbcSMatt Macy out:
1228716fd348SMartin Matuska if (error != 0) {
1229716fd348SMartin Matuska for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1230716fd348SMartin Matuska sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1231716fd348SMartin Matuska if (sls->sls_sm) {
1232716fd348SMartin Matuska space_map_close(sls->sls_sm);
1233716fd348SMartin Matuska sls->sls_sm = NULL;
1234716fd348SMartin Matuska }
1235716fd348SMartin Matuska }
1236716fd348SMartin Matuska } else {
1237716fd348SMartin Matuska ASSERT0(pn);
1238716fd348SMartin Matuska ASSERT0(ps);
1239716fd348SMartin Matuska }
1240eda14cbcSMatt Macy /*
1241eda14cbcSMatt Macy * Now that the metaslabs contain their unflushed changes:
1242eda14cbcSMatt Macy * [1] recalculate their actual allocated space
1243eda14cbcSMatt Macy * [2] recalculate their weights
1244eda14cbcSMatt Macy * [3] sum up the memory usage of their unflushed range trees
1245eda14cbcSMatt Macy * [4] optionally load them, if debug_load is set
1246eda14cbcSMatt Macy *
1247eda14cbcSMatt Macy * Note that even in the case where we get here because of an
1248eda14cbcSMatt Macy * error (e.g. error != 0), we still want to update the fields
1249eda14cbcSMatt Macy * below in order to have a proper teardown in spa_unload().
1250eda14cbcSMatt Macy */
1251eda14cbcSMatt Macy for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1252eda14cbcSMatt Macy m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1253eda14cbcSMatt Macy mutex_enter(&m->ms_lock);
1254eda14cbcSMatt Macy m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1255b59a0cdeSMartin Matuska zfs_range_tree_space(m->ms_unflushed_allocs) -
1256b59a0cdeSMartin Matuska zfs_range_tree_space(m->ms_unflushed_frees);
1257eda14cbcSMatt Macy
1258eda14cbcSMatt Macy vdev_t *vd = m->ms_group->mg_vd;
1259eda14cbcSMatt Macy metaslab_space_update(vd, m->ms_group->mg_class,
1260b59a0cdeSMartin Matuska zfs_range_tree_space(m->ms_unflushed_allocs), 0, 0);
1261eda14cbcSMatt Macy metaslab_space_update(vd, m->ms_group->mg_class,
1262b59a0cdeSMartin Matuska -zfs_range_tree_space(m->ms_unflushed_frees), 0, 0);
1263eda14cbcSMatt Macy
1264eda14cbcSMatt Macy ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1265eda14cbcSMatt Macy metaslab_recalculate_weight_and_sort(m);
1266eda14cbcSMatt Macy
1267eda14cbcSMatt Macy spa->spa_unflushed_stats.sus_memused +=
1268eda14cbcSMatt Macy metaslab_unflushed_changes_memused(m);
1269eda14cbcSMatt Macy
1270eda14cbcSMatt Macy if (metaslab_debug_load && m->ms_sm != NULL) {
1271eda14cbcSMatt Macy VERIFY0(metaslab_load(m));
1272eda14cbcSMatt Macy metaslab_set_selected_txg(m, 0);
1273eda14cbcSMatt Macy }
1274eda14cbcSMatt Macy mutex_exit(&m->ms_lock);
1275eda14cbcSMatt Macy }
1276eda14cbcSMatt Macy
1277eda14cbcSMatt Macy return (error);
1278eda14cbcSMatt Macy }
1279eda14cbcSMatt Macy
1280eda14cbcSMatt Macy static int
spa_ld_unflushed_txgs(vdev_t * vd)1281eda14cbcSMatt Macy spa_ld_unflushed_txgs(vdev_t *vd)
1282eda14cbcSMatt Macy {
1283eda14cbcSMatt Macy spa_t *spa = vd->vdev_spa;
1284eda14cbcSMatt Macy objset_t *mos = spa_meta_objset(spa);
1285eda14cbcSMatt Macy
1286eda14cbcSMatt Macy if (vd->vdev_top_zap == 0)
1287eda14cbcSMatt Macy return (0);
1288eda14cbcSMatt Macy
1289eda14cbcSMatt Macy uint64_t object = 0;
1290eda14cbcSMatt Macy int error = zap_lookup(mos, vd->vdev_top_zap,
1291eda14cbcSMatt Macy VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1292eda14cbcSMatt Macy sizeof (uint64_t), 1, &object);
1293eda14cbcSMatt Macy if (error == ENOENT)
1294eda14cbcSMatt Macy return (0);
1295eda14cbcSMatt Macy else if (error != 0) {
1296eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1297eda14cbcSMatt Macy "zap_lookup(vdev_top_zap=%llu) [error %d]",
1298eda14cbcSMatt Macy (u_longlong_t)vd->vdev_top_zap, error);
1299eda14cbcSMatt Macy return (error);
1300eda14cbcSMatt Macy }
1301eda14cbcSMatt Macy
1302eda14cbcSMatt Macy for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1303eda14cbcSMatt Macy metaslab_t *ms = vd->vdev_ms[m];
1304eda14cbcSMatt Macy ASSERT(ms != NULL);
1305eda14cbcSMatt Macy
1306eda14cbcSMatt Macy metaslab_unflushed_phys_t entry;
1307eda14cbcSMatt Macy uint64_t entry_size = sizeof (entry);
1308eda14cbcSMatt Macy uint64_t entry_offset = ms->ms_id * entry_size;
1309eda14cbcSMatt Macy
1310eda14cbcSMatt Macy error = dmu_read(mos, object,
1311eda14cbcSMatt Macy entry_offset, entry_size, &entry, 0);
1312eda14cbcSMatt Macy if (error != 0) {
1313eda14cbcSMatt Macy spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1314eda14cbcSMatt Macy "failed at dmu_read(obj=%llu) [error %d]",
1315eda14cbcSMatt Macy (u_longlong_t)object, error);
1316eda14cbcSMatt Macy return (error);
1317eda14cbcSMatt Macy }
1318eda14cbcSMatt Macy
1319eda14cbcSMatt Macy ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1320716fd348SMartin Matuska ms->ms_unflushed_dirty = B_FALSE;
1321b59a0cdeSMartin Matuska ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_allocs));
1322b59a0cdeSMartin Matuska ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_frees));
1323eda14cbcSMatt Macy if (ms->ms_unflushed_txg != 0) {
1324eda14cbcSMatt Macy mutex_enter(&spa->spa_flushed_ms_lock);
1325eda14cbcSMatt Macy avl_add(&spa->spa_metaslabs_by_flushed, ms);
1326eda14cbcSMatt Macy mutex_exit(&spa->spa_flushed_ms_lock);
1327eda14cbcSMatt Macy }
1328eda14cbcSMatt Macy }
1329eda14cbcSMatt Macy return (0);
1330eda14cbcSMatt Macy }
1331eda14cbcSMatt Macy
1332eda14cbcSMatt Macy /*
1333eda14cbcSMatt Macy * Read all the log space map entries into their respective
1334eda14cbcSMatt Macy * metaslab unflushed trees and keep them sorted by TXG in the
1335eda14cbcSMatt Macy * SPA's metadata. In addition, setup all the metadata for the
1336eda14cbcSMatt Macy * memory and the block heuristics.
1337eda14cbcSMatt Macy */
1338eda14cbcSMatt Macy int
spa_ld_log_spacemaps(spa_t * spa)1339eda14cbcSMatt Macy spa_ld_log_spacemaps(spa_t *spa)
1340eda14cbcSMatt Macy {
1341eda14cbcSMatt Macy int error;
1342eda14cbcSMatt Macy
1343eda14cbcSMatt Macy spa_log_sm_set_blocklimit(spa);
1344eda14cbcSMatt Macy
1345eda14cbcSMatt Macy for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1346eda14cbcSMatt Macy vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1347eda14cbcSMatt Macy error = spa_ld_unflushed_txgs(vd);
1348eda14cbcSMatt Macy if (error != 0)
1349eda14cbcSMatt Macy return (error);
1350eda14cbcSMatt Macy }
1351eda14cbcSMatt Macy
1352eda14cbcSMatt Macy error = spa_ld_log_sm_metadata(spa);
1353eda14cbcSMatt Macy if (error != 0)
1354eda14cbcSMatt Macy return (error);
1355eda14cbcSMatt Macy
1356eda14cbcSMatt Macy /*
1357eda14cbcSMatt Macy * Note: we don't actually expect anything to change at this point
1358eda14cbcSMatt Macy * but we grab the config lock so we don't fail any assertions
1359eda14cbcSMatt Macy * when using vdev_lookup_top().
1360eda14cbcSMatt Macy */
1361eda14cbcSMatt Macy spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1362eda14cbcSMatt Macy error = spa_ld_log_sm_data(spa);
1363eda14cbcSMatt Macy spa_config_exit(spa, SCL_CONFIG, FTAG);
1364eda14cbcSMatt Macy
1365eda14cbcSMatt Macy return (error);
1366eda14cbcSMatt Macy }
1367eda14cbcSMatt Macy
1368dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW,
1369eda14cbcSMatt Macy "Specific hard-limit in memory that ZFS allows to be used for "
1370eda14cbcSMatt Macy "unflushed changes");
1371eda14cbcSMatt Macy
1372dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW,
1373eda14cbcSMatt Macy "Percentage of the overall system memory that ZFS allows to be "
1374eda14cbcSMatt Macy "used for unflushed changes (value is calculated over 1000000 for "
137516038816SMartin Matuska "finer granularity)");
1376eda14cbcSMatt Macy
1377dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW,
1378eda14cbcSMatt Macy "Hard limit (upper-bound) in the size of the space map log "
1379eda14cbcSMatt Macy "in terms of blocks.");
1380eda14cbcSMatt Macy
1381dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW,
1382eda14cbcSMatt Macy "Lower-bound limit for the maximum amount of blocks allowed in "
1383eda14cbcSMatt Macy "log spacemap (see zfs_unflushed_log_block_max)");
1384eda14cbcSMatt Macy
1385dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW,
1386716fd348SMartin Matuska "Hard limit (upper-bound) in the size of the space map log "
1387716fd348SMartin Matuska "in terms of dirty TXGs.");
1388716fd348SMartin Matuska
1389dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW,
1390eda14cbcSMatt Macy "Tunable used to determine the number of blocks that can be used for "
1391eda14cbcSMatt Macy "the spacemap log, expressed as a percentage of the total number of "
1392eda14cbcSMatt Macy "metaslabs in the pool (e.g. 400 means the number of log blocks is "
1393eda14cbcSMatt Macy "capped at 4 times the number of metaslabs)");
1394eda14cbcSMatt Macy
1395dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
1396eda14cbcSMatt Macy "The number of past TXGs that the flushing algorithm of the log "
1397eda14cbcSMatt Macy "spacemap feature uses to estimate incoming log blocks");
1398eda14cbcSMatt Macy
1399c03c5b1cSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
1400c03c5b1cSMartin Matuska "Prevent the log spacemaps from being flushed and destroyed "
1401c03c5b1cSMartin Matuska "during pool export/destroy");
1402c03c5b1cSMartin Matuska
1403dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW,
1404eda14cbcSMatt Macy "Maximum number of rows allowed in the summary of the spacemap log");
1405eda14cbcSMatt Macy
1406dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW,
1407eda14cbcSMatt Macy "Minimum number of metaslabs to flush per dirty TXG");
1408