1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
25 */
26
27 #include <sys/dmu_objset.h>
28 #include <sys/metaslab.h>
29 #include <sys/metaslab_impl.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/spa_log_spacemap.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zap.h>
35
36 /*
37 * Log Space Maps
38 *
39 * Log space maps are an optimization in ZFS metadata allocations for pools
40 * whose workloads are primarily random-writes. Random-write workloads are also
41 * typically random-free, meaning that they are freeing from locations scattered
42 * throughout the pool. This means that each TXG we will have to append some
43 * FREE records to almost every metaslab. With log space maps, we hold their
44 * changes in memory and log them altogether in one pool-wide space map on-disk
45 * for persistence. As more blocks are accumulated in the log space maps and
46 * more unflushed changes are accounted in memory, we flush a selected group
47 * of metaslabs every TXG to relieve memory pressure and potential overheads
48 * when loading the pool. Flushing a metaslab to disk relieves memory as we
49 * flush any unflushed changes from memory to disk (i.e. the metaslab's space
50 * map) and saves import time by making old log space maps obsolete and
51 * eventually destroying them. [A log space map is said to be obsolete when all
52 * its entries have made it to their corresponding metaslab space maps].
53 *
54 * == On disk data structures used ==
55 *
56 * - The pool has a new feature flag and a new entry in the MOS. The feature
57 * is activated when we create the first log space map and remains active
58 * for the lifetime of the pool. The new entry in the MOS Directory [refer
59 * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
60 * pairs are of the form <key: txg, value: log space map object for that txg>.
61 * This entry is our on-disk reference of the log space maps that exist in
62 * the pool for each TXG and it is used during import to load all the
63 * metaslab unflushed changes in memory. To see how this structure is first
64 * created and later populated refer to spa_generate_syncing_log_sm(). To see
65 * how it is used during import time refer to spa_ld_log_sm_metadata().
66 *
67 * - Each vdev has a new entry in its vdev_top_zap (see field
68 * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
69 * each metaslab in this vdev. This field is the on-disk counterpart of the
70 * in-memory field ms_unflushed_txg which tells us from which TXG and onwards
71 * the metaslab hasn't had its changes flushed. During import, we use this
72 * to ignore any entries in the space map log that are for this metaslab but
73 * from a TXG before msp_unflushed_txg. At that point, we also populate its
74 * in-memory counterpart and from there both fields are updated every time
75 * we flush that metaslab.
76 *
77 * - A space map is created every TXG and, during that TXG, it is used to log
78 * all incoming changes (the log space map). When created, the log space map
79 * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
80 * to the space map ZAP mentioned above. The log space map is closed at the
81 * end of the TXG and will be destroyed when it becomes fully obsolete. We
82 * know when a log space map has become obsolete by looking at the oldest
83 * (and smallest) ms_unflushed_txg in the pool. If that value is bigger
84 * than the log space map's TXG, then no metaslab is missing the changes
85 * from that log and we can therefore destroy it.
86 * [see spa_cleanup_old_sm_logs()].
87 *
88 * == Important in-memory structures ==
89 *
90 * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
91 * the pool by their ms_unflushed_txg field. It is primarily used for three
92 * reasons. First of all, it is used during flushing where we try to flush
93 * metaslabs in-order from the oldest-flushed to the most recently flushed
94 * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
95 * oldest flushed metaslab to distinguish which log space maps have become
96 * obsolete and which ones are still relevant. Finally it tells us which
97 * metaslabs have unflushed changes in a pool where this feature was just
98 * enabled, as we don't immediately add all of the pool's metaslabs but we
99 * add them over time as they go through metaslab_sync(). The reason that
100 * we do that is to ease these pools into the behavior of the flushing
101 * algorithm (described later on).
102 *
103 * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
104 * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
105 * nodes represent the log space maps in the pool. This in-memory
106 * representation of log space maps in the pool sorts the log space maps by
107 * the TXG in which they were created (which is also the TXG of their unflushed
108 * changes). It also contains the following extra information for each
109 * space map:
110 * [1] The number of metaslabs that were last flushed on that TXG. This is
111 * important because if that counter is zero and this is the oldest
112 * log then it means that it is also obsolete.
113 * [2] The number of blocks of that space map. This field is used by the
114 * block heuristic of our flushing algorithm (described later on).
115 * It represents how many blocks of metadata changes ZFS had to write
116 * to disk for that TXG.
117 *
118 * - The per-spa field spa_log_summary is a list of entries that summarizes
119 * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
120 * AVL tree mentioned above. The reason this exists is that our flushing
121 * algorithm (described later) tries to estimate how many metaslabs to flush
122 * in each TXG by iterating over all the log space maps and looking at their
123 * block counts. Summarizing that information means that we don't have to
124 * iterate through each space map, minimizing the runtime overhead of the
125 * flushing algorithm which would be induced in syncing context. In terms of
126 * implementation the log summary is used as a queue:
127 * * we modify or pop entries from its head when we flush metaslabs
128 * * we modify or append entries to its tail when we sync changes.
129 *
130 * - Each metaslab has two new range trees that hold its unflushed changes,
131 * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
132 *
133 * == Flushing algorithm ==
134 *
135 * The decision of how many metaslabs to flush on a given TXG is guided by
136 * two heuristics:
137 *
138 * [1] The memory heuristic -
139 * We keep track of the memory used by the unflushed trees from all the
140 * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
141 * stays below a certain threshold which is determined by an arbitrary hard
142 * limit and an arbitrary percentage of the system's memory [see
143 * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
144 * unflushed changes is passing that threshold, we flush metaslabs, which
145 * empties their unflushed range trees, reducing the memory used.
146 *
147 * [2] The block heuristic -
148 * We try to keep the total number of blocks in the log space maps in check
149 * so the log doesn't grow indefinitely and we don't induce a lot of overhead
150 * when loading the pool. At the same time we don't want to flush a lot of
151 * metaslabs too often as this would defeat the purpose of the log space map.
152 * As a result we set a limit on the number of blocks that we think is
153 * acceptable for the log space maps to have and try not to cross it.
154 * [see sus_blocklimit from spa_unflushed_stats].
155 *
156 * In order to stay below the block limit every TXG we have to estimate how
157 * many metaslabs we need to flush based on the current rate of incoming blocks
158 * and our history of log space map blocks. The main idea here is to answer
159 * the question of how many metaslabs we need to flush in order to get rid of
160 * at least X log space map blocks. We can answer this question
161 * by iterating backwards from the oldest log space map to the newest one
162 * and looking at their metaslab and block counts. At this point the log summary
163 * mentioned above comes in handy as it reduces the amount of data that we have
164 * to iterate over (even though it may reduce the precision of our estimates due
165 * to its aggregation of data). So with that in mind, we project the incoming
166 * rate of the current TXG into the future and attempt to approximate how many
167 * metaslabs we would need to flush from now on in order to avoid exceeding our
168 * block limit at different points in the future (assuming that we keep
169 * flushing the same number of metaslabs every TXG). Then we take the
170 * maximum number from all these estimates to be on the safe side. For the
171 * exact implementation details of the algorithm refer to
172 * spa_estimate_metaslabs_to_flush().
173 */
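
/*
 * Illustrative sketch (all numbers hypothetical, not actual pool state):
 * the log summary described above can be pictured as a queue of entries,
 * each covering a range of log TXGs, e.g.:
 *
 *	head -> { txgs 100-119, 30 metaslabs flushed, 240 log blocks }
 *	        { txgs 120-139, 28 metaslabs flushed, 310 log blocks }
 *	tail -> { txgs 140-143,  5 metaslabs flushed,  70 log blocks }
 *
 * Flushing metaslabs shrinks or pops entries at the head (as their log
 * blocks become obsolete and the corresponding old log space maps get
 * destroyed), while syncing new changes grows or appends the entry at
 * the tail.
 */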
174
175 /*
176 * This is used as the block size for the space maps used for the
177 * log space map feature. These space maps benefit from a bigger
178 * block size as we expect to be writing a lot of data to them at
179 * once.
180 */
181 static const unsigned long zfs_log_sm_blksz = 1ULL << 17;
182
183 /*
184 * Percentage of the overall system's memory that ZFS allows to be
185 * used for unflushed changes (e.g. the sum of size of all the nodes
186 * in the unflushed trees).
187 *
188 * Note that this value is calculated over 1000000 for finer granularity
189 * (thus the _ppm suffix; reads as "parts per million"). As an example,
190 * the default of 1000 allows 0.1% of memory to be used.
191 */
192 static uint64_t zfs_unflushed_max_mem_ppm = 1000;
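
/*
 * Worked example (hypothetical system size): on a machine with 64 GiB of
 * memory, the default of 1000 ppm allows
 *
 *	(64ULL << 30) * 1000 / 1000000	(roughly 65 MiB)
 *
 * of unflushed changes before the memory heuristic starts flushing
 * metaslabs [see spa_log_exceeds_memlimit()].
 */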
193
194 /*
195 * Specific hard-limit in memory that ZFS allows to be used for
196 * unflushed changes.
197 */
198 static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30;
199
200 /*
201 * The following tunable determines the number of blocks that can be used for
202 * the log space maps. It is expressed as a percentage of the total number of
203 * metaslabs in the pool (i.e. the default of 400 means that the number of log
204 * blocks is capped at 4 times the number of metaslabs).
205 *
206 * This value exists to tune our flushing algorithm, with higher values
207 * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
208 * flushing metaslabs more aggressively with the upside of saving overheads
209 * when loading the pool. Another factor in this tradeoff is that flushing
210 * less often can potentially lead to better utilization of the metaslab space
211 * map's block size as we accumulate more changes per flush.
212 *
213 * Given that this tunable indirectly controls the flush rate (metaslabs
214 * flushed per txg), expressing it as a percentage of the number of
215 * metaslabs in the pool makes sense here.
216 *
217 * As a rule of thumb we default this tunable to 400% based on the following:
218 *
219 * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
220 * it is reasonable to expect that the amount of obsolete entries changes
221 * linearly from txg to txg (e.g. the oldest log should have the most
222 * obsolete entries, and the most recent one the least). With this we could
223 * say that, at any given time, about half of the entries in the whole space
224 * map log are obsolete. Thus for every two entries for a metaslab in the
225 * log space map, only one of them is valid and actually makes it to the
226 * metaslab's space map.
227 * [factor of 2]
228 * 2] Each entry in the log space map is guaranteed to be two words while
229 * entries in metaslab space maps are generally single-word.
230 * [an extra factor of 2 - 400% overall]
231 * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
232 * account any consolidation of segments from the log space map to the
233 * unflushed range trees nor their history (e.g. a segment being allocated,
234 * then freed, then allocated again means 3 log space map entries but 0
235 * metaslab space map entries). Depending on the workload, we've seen ~1.8
236 * non-obsolete log space map entries per metaslab entry, for a total of
237 * ~600%. Since most of these estimates though are workload dependent, we
238 * default on 400% to be conservative.
239 *
240 * Put differently, even in the worst case of [1] and [2] alone,
241 * the factor should end up being 4.
242 *
243 * That said, regardless of the number of metaslabs in the pool we need to
244 * provide upper and lower bounds for the log block limit.
245 * [see zfs_unflushed_log_block_{min,max}]
246 */
247 static uint_t zfs_unflushed_log_block_pct = 400;
248
249 /*
250 * If the number of metaslabs is small and our incoming rate is high, we could
251 * get into a situation that we are flushing all our metaslabs every TXG. Thus
252 * we always allow at least this many log blocks.
253 */
254 static uint64_t zfs_unflushed_log_block_min = 1000;
255
256 /*
257 * If the log becomes too big, the import time of the pool can take a hit in
258 * terms of performance. Thus we have a hard limit on the size of the log in
259 * terms of blocks.
260 */
261 static uint64_t zfs_unflushed_log_block_max = (1ULL << 17);
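
/*
 * Note on the default above: with zfs_log_sm_blksz at its default of
 * 1 << 17 bytes (128 KiB), a cap of 1 << 17 blocks bounds the log space
 * maps to roughly 16 GiB of on-disk space in the worst case.
 */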
262
263 /*
264 * We also have a hard limit on the size of the log in terms of dirty TXGs.
265 */
266 static uint64_t zfs_unflushed_log_txg_max = 1000;
267
268 /*
269 * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
270 * stability of the flushing algorithm (longer summary) vs its runtime overhead
271 * (smaller summary is faster to traverse).
272 */
273 static uint64_t zfs_max_logsm_summary_length = 10;
274
275 /*
276 * Tunable that sets the lower bound on the metaslabs to flush every TXG.
277 *
278 * Setting this to 0 has no effect since if the pool is idle we won't even be
279 * creating log space maps and therefore we won't be flushing. On the other
280 * hand if the pool has any incoming workload our block heuristic will start
281 * flushing metaslabs anyway.
282 *
283 * The point of this tunable is to be used in extreme cases where we really
284 * want to flush more metaslabs than our adaptable heuristic plans to flush.
285 */
286 static uint64_t zfs_min_metaslabs_to_flush = 1;
287
288 /*
289 * Tunable that specifies how far in the past we want to look when trying to
290 * estimate the incoming log blocks for the current TXG.
291 *
292 * Setting this too high may not only increase runtime but also dilute the
293 * effect of the incoming rates from the most recent TXGs, as we take the
294 * average over all the blocks that we walk
295 * [see spa_estimate_incoming_log_blocks].
296 */
297 static uint64_t zfs_max_log_walking = 5;
298
299 /*
300 * This tunable exists solely for testing purposes. It ensures that the log
301 * spacemaps are not flushed and destroyed during export in order for the
302 * relevant log spacemap import code paths to be tested (effectively simulating
303 * a crash).
304 */
305 int zfs_keep_log_spacemaps_at_export = 0;
306
307 static uint64_t
308 spa_estimate_incoming_log_blocks(spa_t *spa)
309 {
310 ASSERT3U(spa_sync_pass(spa), ==, 1);
311 uint64_t steps = 0, sum = 0;
312 for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
313 sls != NULL && steps < zfs_max_log_walking;
314 sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
315 if (sls->sls_txg == spa_syncing_txg(spa)) {
316 /*
317 * skip the log created in this TXG as this would
318 * make our estimations inaccurate.
319 */
320 continue;
321 }
322 sum += sls->sls_nblocks;
323 steps++;
324 }
325 return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
326 }
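
/*
 * Example of the estimate above (hypothetical block counts): with the
 * default zfs_max_log_walking of 5, if the five most recent closed logs
 * had 12, 9, 15, 10 and 11 blocks, the estimated incoming rate would be
 * DIV_ROUND_UP(12 + 9 + 15 + 10 + 11, 5) == 12 blocks per TXG. The log
 * being created in the current TXG is skipped because its block count
 * has not been finalized yet [see spa_sync_close_syncing_log_sm()].
 */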
327
328 uint64_t
329 spa_log_sm_blocklimit(spa_t *spa)
330 {
331 return (spa->spa_unflushed_stats.sus_blocklimit);
332 }
333
334 void
335 spa_log_sm_set_blocklimit(spa_t *spa)
336 {
337 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
338 ASSERT0(spa_log_sm_blocklimit(spa));
339 return;
340 }
341
342 uint64_t msdcount = 0;
343 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
344 e; e = list_next(&spa->spa_log_summary, e))
345 msdcount += e->lse_msdcount;
346
347 uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
348 spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
349 zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
350 }
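
/*
 * Worked example of the computation above (hypothetical counts): if the
 * summary currently accounts for 2000 metaslabs with dirty unflushed
 * changes and zfs_unflushed_log_block_pct is at its default of 400, the
 * raw limit is 2000 * 400 / 100 == 8000 blocks. The result is then
 * clamped to [zfs_unflushed_log_block_min, zfs_unflushed_log_block_max],
 * i.e. [1000, 131072] with the defaults, so sus_blocklimit ends up
 * being 8000.
 */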
351
352 uint64_t
353 spa_log_sm_nblocks(spa_t *spa)
354 {
355 return (spa->spa_unflushed_stats.sus_nblocks);
356 }
357
358 /*
359 * Ensure that the in-memory log space map structures and the summary
360 * have the same block and metaslab counts.
361 */
362 static void
363 spa_log_summary_verify_counts(spa_t *spa)
364 {
365 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
366
367 if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
368 return;
369
370 uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
371
372 uint64_t ms_in_summary = 0, blk_in_summary = 0;
373 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
374 e; e = list_next(&spa->spa_log_summary, e)) {
375 ms_in_summary += e->lse_mscount;
376 blk_in_summary += e->lse_blkcount;
377 }
378
379 uint64_t ms_in_logs = 0, blk_in_logs = 0;
380 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
381 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
382 ms_in_logs += sls->sls_mscount;
383 blk_in_logs += sls->sls_nblocks;
384 }
385
386 VERIFY3U(ms_in_logs, ==, ms_in_summary);
387 VERIFY3U(ms_in_logs, ==, ms_in_avl);
388 VERIFY3U(blk_in_logs, ==, blk_in_summary);
389 VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
390 }
391
392 static boolean_t
393 summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
394 {
395 if (e->lse_end == txg)
396 return (0);
397 if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
398 zfs_max_logsm_summary_length))
399 return (1);
400 uint64_t blocks_per_row = MAX(1,
401 DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
402 zfs_max_logsm_summary_length));
403 return (blocks_per_row <= e->lse_blkcount);
404 }
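
/*
 * For instance (default tunables, hypothetical block limit of 8000): an
 * entry is considered full once it either spans
 * DIV_ROUND_UP(zfs_unflushed_log_txg_max, zfs_max_logsm_summary_length)
 * == 100 TXGs or accounts for at least 8000 / 10 == 800 log blocks,
 * whichever happens first. An entry that already covers the current TXG
 * is never reported as full, so all data for a given TXG lands in the
 * same entry. This is what keeps the summary at roughly
 * zfs_max_logsm_summary_length rows.
 */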
405
406 /*
407 * Update the log summary information to reflect the fact that a metaslab
408 * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
409 *
410 * We typically flush the oldest flushed metaslab so the first (and oldest)
411 * entry of the summary is updated. However if that metaslab is getting loaded
412 * we may flush the second oldest one which may be part of an entry later in
413 * the summary. Moreover, if we call into this function from metaslab_fini()
414 * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
415 * for a txg as an argument so we can locate the appropriate summary entry for
416 * the metaslab.
417 */
418 void
419 spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
420 {
421 /*
422 * We don't track summary data for read-only pools and this function
423 * can be called from metaslab_fini(). In that case return immediately.
424 */
425 if (!spa_writeable(spa))
426 return;
427
428 log_summary_entry_t *target = NULL;
429 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
430 e != NULL; e = list_next(&spa->spa_log_summary, e)) {
431 if (e->lse_start > txg)
432 break;
433 target = e;
434 }
435
436 if (target == NULL || target->lse_mscount == 0) {
437 /*
438 * We didn't find a summary entry for this metaslab. We must be
439 * at the teardown of a spa_load() attempt that got an error
440 * while reading the log space maps.
441 */
442 VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
443 return;
444 }
445
446 target->lse_mscount--;
447 if (dirty)
448 target->lse_msdcount--;
449 }
450
451 /*
452 * Update the log summary information to reflect the fact that we destroyed
453 * old log space maps. Since we can only destroy the oldest log space maps,
454 * we decrement the block count of the oldest summary entry and potentially
455 * destroy it when that count hits 0.
456 *
457 * This function is called after a metaslab is flushed and typically that
458 * metaslab is the oldest flushed, which means that this function will
459 * typically decrement the block count of the first entry of the summary and
460 * potentially free it if the block count gets to zero (its metaslab count
461 * should be zero too at that point).
462 *
463 * There are certain scenarios though that don't work exactly like that so we
464 * need to account for them:
465 *
466 * Scenario [1]: It is possible that after we flushed the oldest flushed
467 * metaslab and we destroyed the oldest log space map, more recent logs had 0
468 * metaslabs pointing to them so we got rid of them too. This can happen due
469 * to metaslabs being destroyed through device removal, or because the oldest
470 * flushed metaslab was loading but we kept flushing more recently flushed
471 * metaslabs due to the memory pressure of unflushed changes. Because of that,
472 * we always iterate from the beginning of the summary and, if blocks_gone is
473 * bigger than the block count of the current entry, we free that entry (we
474 * expect its metaslab count to be zero), decrement blocks_gone, and move on to
475 * the next entry, repeating this procedure until blocks_gone gets decremented
476 * to 0. Doing this also works for the typical case mentioned above.
477 *
478 * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
479 * the first (and oldest) entry in the summary. If the first few entries of
480 * the summary were only accounting metaslabs from a device that was just
481 * removed, then the current oldest flushed metaslab could be accounted by an
482 * entry somewhere in the middle of the summary. Moreover flushing that
483 * metaslab will destroy all the log space maps older than its ms_unflushed_txg
484 * because they became obsolete after the removal. Thus, iterating as we did
485 * for scenario [1] works out for this case too.
486 *
487 * Scenario [3]: At times we decide to flush all the metaslabs in the pool
488 * in one TXG (either because we are exporting the pool or because our flushing
489 * heuristics decided to do so). When that happens all the log space maps get
490 * destroyed except the one created for the current TXG which doesn't have
491 * any log blocks yet. As log space maps get destroyed with every metaslab that
492 * we flush, entries in the summary are also destroyed. This brings a weird
493 * corner-case when we flush the last metaslab and the log space map of the
494 * current TXG is in the same summary entry with other log space maps that
495 * are older. When that happens we are eventually left with this one last
496 * summary entry whose blocks are gone (blocks_gone equals the entry's block
497 * count) but its metaslab count is non-zero (because it accounts all the
498 * metaslabs in the pool as they all got flushed). Under this scenario we can't
499 * free this last summary entry as it's referencing all the metaslabs in the
500 * pool and its block count will get incremented at the end of this sync (when
501 * we close the syncing log space map). Thus we just decrement its current
502 * block count and leave it alone. In the case that the pool gets exported,
503 * its metaslab count will be decremented over time as we call metaslab_fini()
504 * for all the metaslabs in the pool and the entry will be freed at
505 * spa_unload_log_sm_metadata().
506 */
507 void
508 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
509 {
510 log_summary_entry_t *e = list_head(&spa->spa_log_summary);
511 ASSERT3P(e, !=, NULL);
512 if (e->lse_txgcount > 0)
513 e->lse_txgcount--;
514 for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
515 if (e->lse_blkcount > blocks_gone) {
516 e->lse_blkcount -= blocks_gone;
517 blocks_gone = 0;
518 break;
519 } else if (e->lse_mscount == 0) {
520 /* remove obsolete entry */
521 blocks_gone -= e->lse_blkcount;
522 list_remove(&spa->spa_log_summary, e);
523 kmem_free(e, sizeof (log_summary_entry_t));
524 } else {
525 /* Verify that this is scenario [3] mentioned above. */
526 VERIFY3U(blocks_gone, ==, e->lse_blkcount);
527
528 /*
529 * Assert that this is scenario [3] further by ensuring
530 * that this is the only entry in the summary.
531 */
532 VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
533 ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
534
535 blocks_gone = e->lse_blkcount = 0;
536 break;
537 }
538 }
539
540 /*
541 * Ensure that there is no way we are trying to remove more blocks
542 * than the # of blocks in the summary.
543 */
544 ASSERT0(blocks_gone);
545 }
546
547 void
548 spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
549 {
550 spa_log_sm_t target = { .sls_txg = txg };
551 spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
552 &target, NULL);
553
554 if (sls == NULL) {
555 /*
556 * We must be at the teardown of a spa_load() attempt that
557 * got an error while reading the log space maps.
558 */
559 VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
560 return;
561 }
562
563 ASSERT(sls->sls_mscount > 0);
564 sls->sls_mscount--;
565 }
566
567 void
568 spa_log_sm_increment_current_mscount(spa_t *spa)
569 {
570 spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
571 ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
572 last_sls->sls_mscount++;
573 }
574
575 static void
576 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
577 uint64_t metaslabs_dirty, uint64_t nblocks)
578 {
579 log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
580
581 if (e == NULL || summary_entry_is_full(spa, e, txg)) {
582 e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
583 e->lse_start = e->lse_end = txg;
584 e->lse_txgcount = 1;
585 list_insert_tail(&spa->spa_log_summary, e);
586 }
587
588 ASSERT3U(e->lse_start, <=, txg);
589 if (e->lse_end < txg) {
590 e->lse_end = txg;
591 e->lse_txgcount++;
592 }
593 e->lse_mscount += metaslabs_flushed;
594 e->lse_msdcount += metaslabs_dirty;
595 e->lse_blkcount += nblocks;
596 }
597
598 static void
599 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
600 {
601 summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
602 }
603
604 void
605 spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
606 {
607 summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
608 }
609
610 void
611 spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
612 {
613 log_summary_entry_t *target = NULL;
614 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
615 e != NULL; e = list_next(&spa->spa_log_summary, e)) {
616 if (e->lse_start > txg)
617 break;
618 target = e;
619 }
620 ASSERT3P(target, !=, NULL);
621 ASSERT3U(target->lse_mscount, !=, 0);
622 target->lse_msdcount++;
623 }
624
625 /*
626 * This function attempts to estimate how many metaslabs we should
627 * flush to satisfy our block heuristic for the log spacemap
628 * for the upcoming TXGs.
629 *
630 * Specifically, it first tries to estimate the number of incoming
631 * blocks in this TXG. Then by projecting that incoming rate to
632 * future TXGs and using the log summary, it figures out how many
633 * flushes we would need to do for future TXGs individually to
634 * stay below our block limit and returns the maximum number of
635 * flushes from those estimates.
636 */
637 static uint64_t
638 spa_estimate_metaslabs_to_flush(spa_t *spa)
639 {
640 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
641 ASSERT3U(spa_sync_pass(spa), ==, 1);
642 ASSERT(spa_log_sm_blocklimit(spa) != 0);
643
644 /*
645 * This variable contains the incoming rate that will be projected
646 * and used for our flushing estimates in the future.
647 */
648 uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
649
650 /*
651 * At any point in time this variable tells us how many
652 * TXGs into the future we are, so we can make our estimates.
653 */
654 uint64_t txgs_in_future = 1;
655
656 /*
657 * This variable tells us how much room we have until we hit
658 * our limit. When it goes negative, it means that we've exceeded
659 * our limit and we need to flush.
660 *
661 * Note that since we start at the first TXG in the future (i.e.
662 * txgs_in_future starts from 1) we already decrement this
663 * variable by the incoming rate.
664 */
665 int64_t available_blocks =
666 spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
667
668 int64_t available_txgs = zfs_unflushed_log_txg_max;
669 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
670 e; e = list_next(&spa->spa_log_summary, e))
671 available_txgs -= e->lse_txgcount;
672
673 /*
674 * This variable tells us the total number of flushes needed to
675 * keep the log size within the limit when we reach txgs_in_future.
676 */
677 uint64_t total_flushes = 0;
678
679 /* Holds the current maximum of our estimates so far. */
680 uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
681
682 /*
683 * For our estimations we only look as far in the future
684 * as the summary allows us.
685 */
686 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
687 e; e = list_next(&spa->spa_log_summary, e)) {
688
689 /*
690 * If there is still room before we exceed our limit
691 * then keep skipping TXGs accumulating more blocks
692 * based on the incoming rate until we exceed it.
693 */
694 if (available_blocks >= 0 && available_txgs >= 0) {
695 uint64_t skip_txgs = (incoming == 0) ?
696 available_txgs + 1 : MIN(available_txgs + 1,
697 (available_blocks / incoming) + 1);
698 available_blocks -= (skip_txgs * incoming);
699 available_txgs -= skip_txgs;
700 txgs_in_future += skip_txgs;
701 ASSERT3S(available_blocks, >=, -incoming);
702 ASSERT3S(available_txgs, >=, -1);
703 }
704
705 /*
706 * At this point we're far enough into the future where
707 * the limit was just exceeded and we flush metaslabs
708 * based on the current entry in the summary, updating
709 * our available_blocks.
710 */
711 ASSERT(available_blocks < 0 || available_txgs < 0);
712 available_blocks += e->lse_blkcount;
713 available_txgs += e->lse_txgcount;
714 total_flushes += e->lse_msdcount;
715
716 /*
717 * Keep the running maximum of the total_flushes that
718 * we've done so far over the number of TXGs in the
719 * future that we are. The idea here is to estimate
720 * the average number of flushes that we should do
721 * every TXG so that when we are that many TXGs in the
722 * future we stay under the limit.
723 */
724 max_flushes_pertxg = MAX(max_flushes_pertxg,
725 DIV_ROUND_UP(total_flushes, txgs_in_future));
726 }
727 return (max_flushes_pertxg);
728 }
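
/*
 * A small worked example of the walk above (all numbers hypothetical):
 * say the block limit is 8000, the log currently holds 7900 blocks, the
 * incoming estimate is 200 blocks per TXG, and the oldest summary entry
 * accounts for 100 dirty metaslabs worth 900 log blocks. At
 * txgs_in_future = 1 we have available_blocks = 8000 - 7900 - 200 =
 * -100, so the limit is already exceeded; consuming the first entry
 * makes total_flushes 100 and the running estimate
 * DIV_ROUND_UP(100, 1) == 100 metaslabs to flush per TXG. The walk then
 * continues through the remaining entries (skipping ahead at the
 * incoming rate whenever there is room again) and the final answer is
 * the maximum per-TXG estimate seen along the way.
 */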
729
730 uint64_t
731 spa_log_sm_memused(spa_t *spa)
732 {
733 return (spa->spa_unflushed_stats.sus_memused);
734 }
735
736 static boolean_t
737 spa_log_exceeds_memlimit(spa_t *spa)
738 {
739 if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
740 return (B_TRUE);
741
742 uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
743 zfs_unflushed_max_mem_ppm) / 1000000;
744 if (spa_log_sm_memused(spa) > system_mem_allowed)
745 return (B_TRUE);
746
747 return (B_FALSE);
748 }
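
/*
 * Note on the two thresholds checked above: the effective limit is the
 * lower of zfs_unflushed_max_mem_amt (1 GiB by default) and the
 * ppm-based allowance (0.1% of physical memory by default). With the
 * default tunables the percentage term is the one that matters on
 * systems with less than roughly 1 TiB of memory, while the fixed 1 GiB
 * cap takes over above that.
 */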
749
750 boolean_t
751 spa_flush_all_logs_requested(spa_t *spa)
752 {
753 return (spa->spa_log_flushall_txg != 0);
754 }
755
756 void
757 spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
758 {
759 uint64_t txg = dmu_tx_get_txg(tx);
760
761 if (spa_sync_pass(spa) != 1)
762 return;
763
764 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
765 return;
766
767 /*
768 * If we don't have any metaslabs with unflushed changes
769 * return immediately.
770 */
771 if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
772 return;
773
774 /*
775 * During SPA export we leave a few empty TXGs to go by [see
776 * spa_final_dirty_txg() to understand why]. For this specific
777 * case, it is important to not flush any metaslabs as that
778 * would dirty this TXG.
779 *
780 * That said, during one of these dirty TXGs that is less than or
781 * equal to spa_final_dirty_txg(), spa_unload() will request that
782 * we try to flush all the metaslabs for that TXG before
783 * exporting the pool, thus we ensure that we didn't get a
784 * request of flushing everything before we attempt to return
785 * immediately.
786 */
787 if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
788 !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
789 !spa_flush_all_logs_requested(spa))
790 return;
791
792 /*
793 * We need to generate a log space map before flushing because this
794 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
795 * for this TXG's flushed metaslab count (aka sls_mscount which is
796 * manipulated in many ways down the metaslab_flush() codepath).
797 *
798 * That is not to say that we may generate a log space map when we
799 * don't need it. If we are flushing metaslabs, that means that we
800 * were going to write changes to disk anyway, so even if we were
801 * not flushing, a log space map would have been created anyway in
802 * metaslab_sync().
803 */
804 spa_generate_syncing_log_sm(spa, tx);
805
806 /*
807 * This variable tells us how many metaslabs we want to flush based
808 * on the block-heuristic of our flushing algorithm (see block comment
809 * of log space map feature). We also decrement this as we flush
810 * metaslabs and attempt to destroy old log space maps.
811 */
812 uint64_t want_to_flush;
813 if (spa_flush_all_logs_requested(spa)) {
814 ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
815 want_to_flush = UINT64_MAX;
816 } else {
817 want_to_flush = spa_estimate_metaslabs_to_flush(spa);
818 }
819
820 /* Used purely for verification purposes */
821 uint64_t visited = 0;
822
823 /*
824 * Ideally we would only iterate through spa_metaslabs_by_flushed
825 * using only one variable (curr). We can't do that because
826 * metaslab_flush() mutates position of curr in the AVL when
827 * it flushes that metaslab by moving it to the end of the tree.
828 * Thus we always keep track of the original next node of the
829 * current node (curr) in another variable (next).
830 */
831 metaslab_t *next = NULL;
832 for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
833 curr != NULL; curr = next) {
834 next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
835
836 /*
837 * If this metaslab has been flushed this txg then we've done
838 * a full circle over the metaslabs.
839 */
840 if (metaslab_unflushed_txg(curr) == txg)
841 break;
842
843 /*
844 * If we are done flushing for the block heuristic and the
845 * unflushed changes don't exceed the memory limit just stop.
846 */
847 if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
848 break;
849
850 if (metaslab_unflushed_dirty(curr)) {
851 mutex_enter(&curr->ms_sync_lock);
852 mutex_enter(&curr->ms_lock);
853 metaslab_flush(curr, tx);
854 mutex_exit(&curr->ms_lock);
855 mutex_exit(&curr->ms_sync_lock);
856 if (want_to_flush > 0)
857 want_to_flush--;
858 } else
859 metaslab_unflushed_bump(curr, tx, B_FALSE);
860
861 visited++;
862 }
863 ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
864
865 spa_log_sm_set_blocklimit(spa);
866 }
867
868 /*
869 * Close the log space map for this TXG and update the block counts
870 * for the log's in-memory structure and the summary.
871 */
872 void
873 spa_sync_close_syncing_log_sm(spa_t *spa)
874 {
875 if (spa_syncing_log_sm(spa) == NULL)
876 return;
877 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
878
879 spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
880 ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
881
882 sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
883 spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
884
885 /*
886 * Note that we can't assert that sls_mscount is not 0,
887 * because there is the case where the first metaslab
888 * in spa_metaslabs_by_flushed is loading and we were
889 * not able to flush any metaslabs in the current TXG.
890 */
891 ASSERT(sls->sls_nblocks != 0);
892
893 spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
894 spa_log_summary_verify_counts(spa);
895
896 space_map_close(spa->spa_syncing_log_sm);
897 spa->spa_syncing_log_sm = NULL;
898
899 /*
900 * At this point we tried to flush as many metaslabs as we
901 * could, as the pool is getting exported. Reset the "flush all"
902 * so the last few TXGs before closing the pool can be empty
903 * (e.g. not dirty).
904 */
905 if (spa_flush_all_logs_requested(spa)) {
906 ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
907 spa->spa_log_flushall_txg = 0;
908 }
909 }
910
911 void
912 spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
913 {
914 objset_t *mos = spa_meta_objset(spa);
915
916 uint64_t spacemap_zap;
917 int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
918 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
919 if (error == ENOENT) {
920 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
921 return;
922 }
923 VERIFY0(error);
924
925 metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
926 uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
927
928 /* Free all log space maps older than the oldest_flushed_txg. */
929 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
930 sls && sls->sls_txg < oldest_flushed_txg;
931 sls = avl_first(&spa->spa_sm_logs_by_txg)) {
932 ASSERT0(sls->sls_mscount);
933 avl_remove(&spa->spa_sm_logs_by_txg, sls);
934 space_map_free_obj(mos, sls->sls_sm_obj, tx);
935 VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
936 spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
937 spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
938 kmem_free(sls, sizeof (spa_log_sm_t));
939 }
940 }
941
942 static spa_log_sm_t *
943 spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
944 {
945 spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
946 sls->sls_sm_obj = sm_obj;
947 sls->sls_txg = txg;
948 return (sls);
949 }
950
951 void
952 spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
953 {
954 uint64_t txg = dmu_tx_get_txg(tx);
955 objset_t *mos = spa_meta_objset(spa);
956
957 if (spa_syncing_log_sm(spa) != NULL)
958 return;
959
960 if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
961 return;
962
963 uint64_t spacemap_zap;
964 int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
965 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
966 if (error == ENOENT) {
967 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
968
969 error = 0;
970 spacemap_zap = zap_create(mos,
971 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
972 VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
973 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
974 &spacemap_zap, tx));
975 spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
976 }
977 VERIFY0(error);
978
979 uint64_t sm_obj;
980 ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
981 ==, ENOENT);
982 sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
983 VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
984 avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
985
986 /*
987 * We pass UINT64_MAX as the space map's representation size
988 * and SPA_MINBLOCKSHIFT as the shift, to make the space map
989 * accept any sort of segments since there's no real advantage
990 * to being more restrictive (given that we're already going
991 * to be using 2-word entries).
992 */
993 VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
994 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
995
996 spa_log_sm_set_blocklimit(spa);
997 }
998
999 /*
1000 * Find all the log space maps stored in the space map ZAP and sort
1001 * them by their TXG in spa_sm_logs_by_txg.
1002 */
1003 static int
1004 spa_ld_log_sm_metadata(spa_t *spa)
1005 {
1006 int error;
1007 uint64_t spacemap_zap;
1008
1009 ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
1010
1011 error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
1012 DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
1013 if (error == ENOENT) {
1014 /* the space map ZAP doesn't exist yet */
1015 return (0);
1016 } else if (error != 0) {
1017 spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1018 "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
1019 error);
1020 return (error);
1021 }
1022
1023 zap_cursor_t zc;
1024 zap_attribute_t *za = zap_attribute_alloc();
1025 for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
1026 (error = zap_cursor_retrieve(&zc, za)) == 0;
1027 zap_cursor_advance(&zc)) {
1028 uint64_t log_txg = zfs_strtonum(za->za_name, NULL);
1029 spa_log_sm_t *sls =
1030 spa_log_sm_alloc(za->za_first_integer, log_txg);
1031 avl_add(&spa->spa_sm_logs_by_txg, sls);
1032 }
1033 zap_cursor_fini(&zc);
1034 zap_attribute_free(za);
1035 if (error != ENOENT) {
1036 spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
1037 "zap_cursor_retrieve(spacemap_zap) [error %d]",
1038 error);
1039 return (error);
1040 }
1041
1042 for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1043 m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1044 spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
1045 spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
1046 &target, NULL);
1047
1048 /*
1049 * At this point if sls is NULL it means that a bug occurred
1050 * in ZFS the last time the pool was open or earlier in the
1051 * import code path. In general, we would have placed a
1052 * VERIFY() here or in this case just let the kernel panic
1053 * with NULL pointer dereference when incrementing sls_mscount,
1054 * but since this is the import code path we can be a bit more
1055 * lenient. Thus, for DEBUG bits we always cause a panic, while
1056 * in production we log the error and just fail the import.
1057 */
1058 ASSERT(sls != NULL);
1059 if (sls == NULL) {
1060 spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
1061 "encountered: could not find log spacemap for "
1062 "TXG %llu [error %d]",
1063 (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
1064 return (ENOENT);
1065 }
1066 sls->sls_mscount++;
1067 }
1068
1069 return (0);
1070 }
1071
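/*
 * Argument bundle passed from spa_ld_log_sm_data() to the
 * space_map_iterate() callback below, identifying the pool and the TXG
 * of the log space map currently being read.
 */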
1072 typedef struct spa_ld_log_sm_arg {
1073 spa_t *slls_spa;
1074 uint64_t slls_txg;
1075 } spa_ld_log_sm_arg_t;
1076
1077 static int
1078 spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
1079 {
1080 uint64_t offset = sme->sme_offset;
1081 uint64_t size = sme->sme_run;
1082 uint32_t vdev_id = sme->sme_vdev;
1083
1084 spa_ld_log_sm_arg_t *slls = arg;
1085 spa_t *spa = slls->slls_spa;
1086
1087 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
1088
1089 /*
1090 * If the vdev has been removed (i.e. it is indirect or a hole)
1091 * skip this entry. The contents of this vdev have already moved
1092 * elsewhere.
1093 */
1094 if (!vdev_is_concrete(vd))
1095 return (0);
1096
1097 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1098 ASSERT(!ms->ms_loaded);
1099
1100 /*
1101 * If we have already flushed entries for this TXG to this
1102 * metaslab's space map, then ignore it. Note that we flush
1103 * before processing any allocations/frees for that TXG, so
1104 * the metaslab's space map only has entries from *before*
1105 * the unflushed TXG.
1106 */
1107 if (slls->slls_txg < metaslab_unflushed_txg(ms))
1108 return (0);
1109
1110 switch (sme->sme_type) {
1111 case SM_ALLOC:
1112 zfs_range_tree_remove_xor_add_segment(offset, offset + size,
1113 ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
1114 break;
1115 case SM_FREE:
1116 zfs_range_tree_remove_xor_add_segment(offset, offset + size,
1117 ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
1118 break;
1119 default:
1120 panic("invalid maptype_t");
1121 break;
1122 }
1123 if (!metaslab_unflushed_dirty(ms)) {
1124 metaslab_set_unflushed_dirty(ms, B_TRUE);
1125 spa_log_summary_dirty_flushed_metaslab(spa,
1126 metaslab_unflushed_txg(ms));
1127 }
1128 return (0);
1129 }
1130
1131 static int
1132 spa_ld_log_sm_data(spa_t *spa)
1133 {
1134 spa_log_sm_t *sls, *psls;
1135 int error = 0;
1136
1137 /*
1138 * If we are not going to do any writes there is no need
1139 * to read the log space maps.
1140 */
1141 if (!spa_writeable(spa))
1142 return (0);
1143
1144 ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
1145 ASSERT0(spa->spa_unflushed_stats.sus_memused);
1146
1147 hrtime_t read_logs_starttime = gethrtime();
1148
1149 /* Prefetch log spacemaps dnodes. */
1150 for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
1151 sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1152 dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
1153 ZIO_PRIORITY_SYNC_READ);
1154 }
1155
1156 uint_t pn = 0;
1157 uint64_t ps = 0;
1158 uint64_t nsm = 0;
1159 psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
1160 while (sls != NULL) {
1161 /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
1162 if (psls != NULL && pn < 16 &&
1163 (pn < 2 || ps < 2 * dmu_prefetch_max)) {
1164 error = space_map_open(&psls->sls_sm,
1165 spa_meta_objset(spa), psls->sls_sm_obj, 0,
1166 UINT64_MAX, SPA_MINBLOCKSHIFT);
1167 if (error != 0) {
1168 spa_load_failed(spa, "spa_ld_log_sm_data(): "
1169 "failed at space_map_open(obj=%llu) "
1170 "[error %d]",
1171 (u_longlong_t)sls->sls_sm_obj, error);
1172 goto out;
1173 }
1174 dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
1175 0, 0, space_map_length(psls->sls_sm),
1176 ZIO_PRIORITY_ASYNC_READ);
1177 pn++;
1178 ps += space_map_length(psls->sls_sm);
1179 psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
1180 continue;
1181 }
1182
1183 /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
1184 kpreempt(KPREEMPT_SYNC);
1185 ASSERT0(sls->sls_nblocks);
1186 sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
1187 spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
1188 summary_add_data(spa, sls->sls_txg,
1189 sls->sls_mscount, 0, sls->sls_nblocks);
1190
1191 spa_import_progress_set_notes_nolog(spa,
1192 "Read %llu of %lu log space maps", (u_longlong_t)nsm,
1193 avl_numnodes(&spa->spa_sm_logs_by_txg));
1194
1195 struct spa_ld_log_sm_arg vla = {
1196 .slls_spa = spa,
1197 .slls_txg = sls->sls_txg
1198 };
1199 error = space_map_iterate(sls->sls_sm,
1200 space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
1201 if (error != 0) {
1202 spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
1203 "at space_map_iterate(obj=%llu) [error %d]",
1204 (u_longlong_t)sls->sls_sm_obj, error);
1205 goto out;
1206 }
1207
1208 pn--;
1209 ps -= space_map_length(sls->sls_sm);
1210 nsm++;
1211 space_map_close(sls->sls_sm);
1212 sls->sls_sm = NULL;
1213 sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
1214
1215 /* Update the log block limit to account for the just-loaded log. */
1216 spa_log_sm_set_blocklimit(spa);
1217 }
1218
1219 hrtime_t read_logs_endtime = gethrtime();
1220 spa_load_note(spa,
1221 "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
1222 "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
1223 (u_longlong_t)spa_log_sm_nblocks(spa),
1224 (u_longlong_t)zfs_log_sm_blksz,
1225 (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
1226
1227 out:
1228 if (error != 0) {
1229 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1230 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1231 if (sls->sls_sm) {
1232 space_map_close(sls->sls_sm);
1233 sls->sls_sm = NULL;
1234 }
1235 }
1236 } else {
1237 ASSERT0(pn);
1238 ASSERT0(ps);
1239 }
1240 /*
1241 * Now that the metaslabs contain their unflushed changes:
1242 * [1] recalculate their actual allocated space
1243 * [2] recalculate their weights
1244 * [3] sum up the memory usage of their unflushed range trees
1245 * [4] optionally load them, if debug_load is set
1246 *
1247 * Note that even in the case where we get here because of an
1248 * error (e.g. error != 0), we still want to update the fields
1249 * below in order to have a proper teardown in spa_unload().
1250 */
1251 for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
1252 m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
1253 mutex_enter(&m->ms_lock);
1254 m->ms_allocated_space = space_map_allocated(m->ms_sm) +
1255 zfs_range_tree_space(m->ms_unflushed_allocs) -
1256 zfs_range_tree_space(m->ms_unflushed_frees);
1257
1258 vdev_t *vd = m->ms_group->mg_vd;
1259 metaslab_space_update(vd, m->ms_group->mg_class,
1260 zfs_range_tree_space(m->ms_unflushed_allocs), 0, 0);
1261 metaslab_space_update(vd, m->ms_group->mg_class,
1262 -zfs_range_tree_space(m->ms_unflushed_frees), 0, 0);
1263
1264 ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
1265 metaslab_recalculate_weight_and_sort(m);
1266
1267 spa->spa_unflushed_stats.sus_memused +=
1268 metaslab_unflushed_changes_memused(m);
1269
1270 if (metaslab_debug_load && m->ms_sm != NULL) {
1271 VERIFY0(metaslab_load(m));
1272 metaslab_set_selected_txg(m, 0);
1273 }
1274 mutex_exit(&m->ms_lock);
1275 }
1276
1277 return (error);
1278 }
1279
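/*
 * Read the per-metaslab unflushed TXGs of this top-level vdev from the
 * object referenced by VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS. The object
 * is read as a flat array of metaslab_unflushed_phys_t entries indexed
 * by ms_id. Metaslabs with a non-zero unflushed TXG are inserted into
 * spa_metaslabs_by_flushed so that later stages know which log entries
 * still apply to them.
 */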
1280 static int
1281 spa_ld_unflushed_txgs(vdev_t *vd)
1282 {
1283 spa_t *spa = vd->vdev_spa;
1284 objset_t *mos = spa_meta_objset(spa);
1285
1286 if (vd->vdev_top_zap == 0)
1287 return (0);
1288
1289 uint64_t object = 0;
1290 int error = zap_lookup(mos, vd->vdev_top_zap,
1291 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
1292 sizeof (uint64_t), 1, &object);
1293 if (error == ENOENT)
1294 return (0);
1295 else if (error != 0) {
1296 spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
1297 "zap_lookup(vdev_top_zap=%llu) [error %d]",
1298 (u_longlong_t)vd->vdev_top_zap, error);
1299 return (error);
1300 }
1301
1302 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1303 metaslab_t *ms = vd->vdev_ms[m];
1304 ASSERT(ms != NULL);
1305
1306 metaslab_unflushed_phys_t entry;
1307 uint64_t entry_size = sizeof (entry);
1308 uint64_t entry_offset = ms->ms_id * entry_size;
1309
1310 error = dmu_read(mos, object,
1311 entry_offset, entry_size, &entry, 0);
1312 if (error != 0) {
1313 spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
1314 "failed at dmu_read(obj=%llu) [error %d]",
1315 (u_longlong_t)object, error);
1316 return (error);
1317 }
1318
1319 ms->ms_unflushed_txg = entry.msp_unflushed_txg;
1320 ms->ms_unflushed_dirty = B_FALSE;
1321 ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_allocs));
1322 ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_frees));
1323 if (ms->ms_unflushed_txg != 0) {
1324 mutex_enter(&spa->spa_flushed_ms_lock);
1325 avl_add(&spa->spa_metaslabs_by_flushed, ms);
1326 mutex_exit(&spa->spa_flushed_ms_lock);
1327 }
1328 }
1329 return (0);
1330 }
1331
1332 /*
1333 * Read all the log space map entries into their respective
1334 * metaslab unflushed trees and keep them sorted by TXG in the
1335 * SPA's metadata. In addition, setup all the metadata for the
1336 * memory and the block heuristics.
1337 */
1338 int
1339 spa_ld_log_spacemaps(spa_t *spa)
1340 {
1341 int error;
1342
1343 spa_log_sm_set_blocklimit(spa);
1344
1345 for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1346 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
1347 error = spa_ld_unflushed_txgs(vd);
1348 if (error != 0)
1349 return (error);
1350 }
1351
1352 error = spa_ld_log_sm_metadata(spa);
1353 if (error != 0)
1354 return (error);
1355
1356 /*
1357 * Note: we don't actually expect anything to change at this point
1358 * but we grab the config lock so we don't fail any assertions
1359 * when using vdev_lookup_top().
1360 */
1361 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1362 error = spa_ld_log_sm_data(spa);
1363 spa_config_exit(spa, SCL_CONFIG, FTAG);
1364
1365 return (error);
1366 }
1367
1368 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW,
1369 "Specific hard-limit in memory that ZFS allows to be used for "
1370 "unflushed changes");
1371
1372 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW,
1373 "Percentage of the overall system memory that ZFS allows to be "
1374 "used for unflushed changes (value is calculated over 1000000 for "
1375 "finer granularity)");
1376
1377 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW,
1378 "Hard limit (upper-bound) in the size of the space map log "
1379 "in terms of blocks.");
1380
1381 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW,
1382 "Lower-bound limit for the maximum amount of blocks allowed in "
1383 "log spacemap (see zfs_unflushed_log_block_max)");
1384
1385 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW,
1386 "Hard limit (upper-bound) in the size of the space map log "
1387 "in terms of dirty TXGs.");
1388
1389 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW,
1390 "Tunable used to determine the number of blocks that can be used for "
1391 "the spacemap log, expressed as a percentage of the total number of "
1392 "metaslabs in the pool (e.g. 400 means the number of log blocks is "
1393 "capped at 4 times the number of metaslabs)");
1394
1395 ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
1396 "The number of past TXGs that the flushing algorithm of the log "
1397 "spacemap feature uses to estimate incoming log blocks");
1398
1399 ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
1400 "Prevent the log spacemaps from being flushed and destroyed "
1401 "during pool export/destroy");
1402
1403 ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW,
1404 "Maximum number of rows allowed in the summary of the spacemap log");
1405
1406 ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW,
1407 "Minimum number of metaslabs to flush per dirty TXG");
1408