161145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24ba27dd8bSMartin Matuska * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
25eda14cbcSMatt Macy * Copyright 2016 Gary Mills
26eda14cbcSMatt Macy * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
272c48331dSMatt Macy * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
28eda14cbcSMatt Macy * Copyright 2019 Joyent, Inc.
29eda14cbcSMatt Macy */
30eda14cbcSMatt Macy
31eda14cbcSMatt Macy #include <sys/dsl_scan.h>
32eda14cbcSMatt Macy #include <sys/dsl_pool.h>
33eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
34eda14cbcSMatt Macy #include <sys/dsl_prop.h>
35eda14cbcSMatt Macy #include <sys/dsl_dir.h>
36eda14cbcSMatt Macy #include <sys/dsl_synctask.h>
37eda14cbcSMatt Macy #include <sys/dnode.h>
38eda14cbcSMatt Macy #include <sys/dmu_tx.h>
39eda14cbcSMatt Macy #include <sys/dmu_objset.h>
40eda14cbcSMatt Macy #include <sys/arc.h>
41c9539b89SMartin Matuska #include <sys/arc_impl.h>
42eda14cbcSMatt Macy #include <sys/zap.h>
43eda14cbcSMatt Macy #include <sys/zio.h>
44eda14cbcSMatt Macy #include <sys/zfs_context.h>
45eda14cbcSMatt Macy #include <sys/fs/zfs.h>
46eda14cbcSMatt Macy #include <sys/zfs_znode.h>
47eda14cbcSMatt Macy #include <sys/spa_impl.h>
48eda14cbcSMatt Macy #include <sys/vdev_impl.h>
49eda14cbcSMatt Macy #include <sys/zil_impl.h>
50eda14cbcSMatt Macy #include <sys/zio_checksum.h>
512a58b312SMartin Matuska #include <sys/brt.h>
52eda14cbcSMatt Macy #include <sys/ddt.h>
53eda14cbcSMatt Macy #include <sys/sa.h>
54eda14cbcSMatt Macy #include <sys/sa_impl.h>
55eda14cbcSMatt Macy #include <sys/zfeature.h>
56eda14cbcSMatt Macy #include <sys/abd.h>
57eda14cbcSMatt Macy #include <sys/range_tree.h>
58c0a83fe0SMartin Matuska #include <sys/dbuf.h>
59eda14cbcSMatt Macy #ifdef _KERNEL
60eda14cbcSMatt Macy #include <sys/zfs_vfsops.h>
61eda14cbcSMatt Macy #endif
62eda14cbcSMatt Macy
63eda14cbcSMatt Macy /*
64eda14cbcSMatt Macy * Grand theory statement on scan queue sorting
65eda14cbcSMatt Macy *
66eda14cbcSMatt Macy * Scanning is implemented by recursively traversing all indirection levels
67eda14cbcSMatt Macy * in an object and reading all blocks referenced from said objects. This
68eda14cbcSMatt Macy * results in us approximately traversing the object from lowest logical
69eda14cbcSMatt Macy * offset to the highest. For best performance, we would want the logical
70eda14cbcSMatt Macy * blocks to be physically contiguous. However, this is frequently not the
71eda14cbcSMatt Macy * case with pools given the allocation patterns of copy-on-write filesystems.
72eda14cbcSMatt Macy * So instead, we put the I/Os into a reordering queue and issue them in a
73eda14cbcSMatt Macy * way that will most benefit physical disks (LBA-order).
74eda14cbcSMatt Macy *
75eda14cbcSMatt Macy * Queue management:
76eda14cbcSMatt Macy *
77eda14cbcSMatt Macy * Ideally, we would want to scan all metadata and queue up all block I/O
78eda14cbcSMatt Macy * prior to starting to issue it, because that allows us to do an optimal
79eda14cbcSMatt Macy * sorting job. This can however consume large amounts of memory. Therefore
80eda14cbcSMatt Macy * we continuously monitor the size of the queues and constrain them to 5%
81eda14cbcSMatt Macy * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
82eda14cbcSMatt Macy * limit, we clear out a few of the largest extents at the head of the queues
83eda14cbcSMatt Macy * to make room for more scanning. Hopefully, these extents will be fairly
84eda14cbcSMatt Macy * large and contiguous, allowing us to approach sequential I/O throughput
85eda14cbcSMatt Macy * even without a fully sorted tree.
86eda14cbcSMatt Macy *
87eda14cbcSMatt Macy * Metadata scanning takes place in dsl_scan_visit(), which is called from
88eda14cbcSMatt Macy * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
89eda14cbcSMatt Macy * metadata on the pool, or we need to make room in memory because our
90eda14cbcSMatt Macy * queues are too large, dsl_scan_visit() is postponed and
91eda14cbcSMatt Macy * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
92eda14cbcSMatt Macy * that metadata scanning and queued I/O issuing are mutually exclusive. This
93eda14cbcSMatt Macy * allows us to provide maximum sequential I/O throughput for the majority of
94eda14cbcSMatt Macy * I/O's issued since sequential I/O performance is significantly negatively
95eda14cbcSMatt Macy * impacted if it is interleaved with random I/O.
96eda14cbcSMatt Macy *
97eda14cbcSMatt Macy * Implementation Notes
98eda14cbcSMatt Macy *
99eda14cbcSMatt Macy * One side effect of the queued scanning algorithm is that the scanning code
100eda14cbcSMatt Macy * needs to be notified whenever a block is freed. This is needed to allow
101eda14cbcSMatt Macy * the scanning code to remove these I/Os from the issuing queue. Additionally,
102eda14cbcSMatt Macy * we do not attempt to queue gang blocks to be issued sequentially since this
103eda14cbcSMatt Macy * is very hard to do and would have an extremely limited performance benefit.
104eda14cbcSMatt Macy * Instead, we simply issue gang I/Os as soon as we find them using the legacy
105eda14cbcSMatt Macy * algorithm.
106eda14cbcSMatt Macy *
107eda14cbcSMatt Macy * Backwards compatibility
108eda14cbcSMatt Macy *
109eda14cbcSMatt Macy * This new algorithm is backwards compatible with the legacy on-disk data
110eda14cbcSMatt Macy * structures (and therefore does not require a new feature flag).
111eda14cbcSMatt Macy * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
112eda14cbcSMatt Macy * will stop scanning metadata (in logical order) and wait for all outstanding
113eda14cbcSMatt Macy * sorted I/O to complete. Once this is done, we write out a checkpoint
114eda14cbcSMatt Macy * bookmark, indicating that we have scanned everything logically before it.
115eda14cbcSMatt Macy * If the pool is imported on a machine without the new sorting algorithm,
116eda14cbcSMatt Macy * the scan simply resumes from the last checkpoint using the legacy algorithm.
117eda14cbcSMatt Macy */
118eda14cbcSMatt Macy
119eda14cbcSMatt Macy typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
120eda14cbcSMatt Macy const zbookmark_phys_t *);
121eda14cbcSMatt Macy
122eda14cbcSMatt Macy static scan_cb_t dsl_scan_scrub_cb;
123eda14cbcSMatt Macy
124eda14cbcSMatt Macy static int scan_ds_queue_compare(const void *a, const void *b);
125eda14cbcSMatt Macy static int scan_prefetch_queue_compare(const void *a, const void *b);
126eda14cbcSMatt Macy static void scan_ds_queue_clear(dsl_scan_t *scn);
127eda14cbcSMatt Macy static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
128eda14cbcSMatt Macy static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
129eda14cbcSMatt Macy uint64_t *txg);
130eda14cbcSMatt Macy static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
131eda14cbcSMatt Macy static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
132eda14cbcSMatt Macy static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
133c9539b89SMartin Matuska static uint64_t dsl_scan_count_data_disks(spa_t *spa);
134c0a83fe0SMartin Matuska static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
135eda14cbcSMatt Macy
136be181ee2SMartin Matuska extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
137a0b956f5SMartin Matuska static int zfs_scan_blkstats = 0;
138eda14cbcSMatt Macy
139eda14cbcSMatt Macy /*
140c9539b89SMartin Matuska * 'zpool status' uses bytes processed per pass to report throughput and
141c9539b89SMartin Matuska * estimate time remaining. We define a pass to start when the scanning
142c9539b89SMartin Matuska * phase completes for a sequential resilver. Optionally, this value
143c9539b89SMartin Matuska * may be used to reset the pass statistics every N txgs to provide an
144c9539b89SMartin Matuska * estimated completion time based on currently observed performance.
145c9539b89SMartin Matuska */
146c9539b89SMartin Matuska static uint_t zfs_scan_report_txgs = 0;
147c9539b89SMartin Matuska
148c9539b89SMartin Matuska /*
149eda14cbcSMatt Macy * By default zfs will check to ensure it is not over the hard memory
150eda14cbcSMatt Macy * limit before each txg. If finer-grained control of this is needed
151eda14cbcSMatt Macy * this value can be set to 1 to enable checking before scanning each
152eda14cbcSMatt Macy * block.
153eda14cbcSMatt Macy */
154e92ffd9bSMartin Matuska static int zfs_scan_strict_mem_lim = B_FALSE;
155eda14cbcSMatt Macy
156eda14cbcSMatt Macy /*
157eda14cbcSMatt Macy * Maximum number of parallelly executed bytes per leaf vdev. We attempt
158eda14cbcSMatt Macy * to strike a balance here between keeping the vdev queues full of I/Os
159eda14cbcSMatt Macy * at all times and not overflowing the queues to cause long latency,
160eda14cbcSMatt Macy * which would cause long txg sync times. No matter what, we will not
161eda14cbcSMatt Macy * overload the drives with I/O, since that is protected by
162eda14cbcSMatt Macy * zfs_vdev_scrub_max_active.
163eda14cbcSMatt Macy */
164c9539b89SMartin Matuska static uint64_t zfs_scan_vdev_limit = 16 << 20;
165eda14cbcSMatt Macy
166be181ee2SMartin Matuska static uint_t zfs_scan_issue_strategy = 0;
167be181ee2SMartin Matuska
168be181ee2SMartin Matuska /* don't queue & sort zios, go direct */
169be181ee2SMartin Matuska static int zfs_scan_legacy = B_FALSE;
170dbd5678dSMartin Matuska static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
171eda14cbcSMatt Macy
172eda14cbcSMatt Macy /*
173eda14cbcSMatt Macy * fill_weight is non-tunable at runtime, so we copy it at module init from
174eda14cbcSMatt Macy * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
175eda14cbcSMatt Macy * break queue sorting.
176eda14cbcSMatt Macy */
177be181ee2SMartin Matuska static uint_t zfs_scan_fill_weight = 3;
178eda14cbcSMatt Macy static uint64_t fill_weight;
179eda14cbcSMatt Macy
180eda14cbcSMatt Macy /* See dsl_scan_should_clear() for details on the memory limit tunables */
181e92ffd9bSMartin Matuska static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
182e92ffd9bSMartin Matuska static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
183eda14cbcSMatt Macy
184be181ee2SMartin Matuska
185be181ee2SMartin Matuska /* fraction of physmem */
186be181ee2SMartin Matuska static uint_t zfs_scan_mem_lim_fact = 20;
187be181ee2SMartin Matuska
188be181ee2SMartin Matuska /* fraction of mem lim above */
189be181ee2SMartin Matuska static uint_t zfs_scan_mem_lim_soft_fact = 20;
190be181ee2SMartin Matuska
191be181ee2SMartin Matuska /* minimum milliseconds to scrub per txg */
192be181ee2SMartin Matuska static uint_t zfs_scrub_min_time_ms = 1000;
193be181ee2SMartin Matuska
194be181ee2SMartin Matuska /* minimum milliseconds to obsolete per txg */
195be181ee2SMartin Matuska static uint_t zfs_obsolete_min_time_ms = 500;
196be181ee2SMartin Matuska
197be181ee2SMartin Matuska /* minimum milliseconds to free per txg */
198be181ee2SMartin Matuska static uint_t zfs_free_min_time_ms = 1000;
199be181ee2SMartin Matuska
200be181ee2SMartin Matuska /* minimum milliseconds to resilver per txg */
201be181ee2SMartin Matuska static uint_t zfs_resilver_min_time_ms = 3000;
202be181ee2SMartin Matuska
203be181ee2SMartin Matuska static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
204eda14cbcSMatt Macy int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
205e92ffd9bSMartin Matuska static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
206e92ffd9bSMartin Matuska static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
2074fefe1b7SMartin Matuska static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
208eda14cbcSMatt Macy /* max number of blocks to free in a single TXG */
209dbd5678dSMartin Matuska static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
210eda14cbcSMatt Macy /* max number of dedup blocks to free in a single TXG */
211dbd5678dSMartin Matuska static uint64_t zfs_max_async_dedup_frees = 100000;
212eda14cbcSMatt Macy
213e92ffd9bSMartin Matuska /* set to disable resilver deferring */
214e92ffd9bSMartin Matuska static int zfs_resilver_disable_defer = B_FALSE;
215eda14cbcSMatt Macy
2167a7741afSMartin Matuska /* Don't defer a resilver if the one in progress only got this far: */
2177a7741afSMartin Matuska static uint_t zfs_resilver_defer_percent = 10;
2187a7741afSMartin Matuska
219eda14cbcSMatt Macy /*
220eda14cbcSMatt Macy * We wait a few txgs after importing a pool to begin scanning so that
221eda14cbcSMatt Macy * the import / mounting code isn't held up by scrub / resilver IO.
222eda14cbcSMatt Macy * Unfortunately, it is a bit difficult to determine exactly how long
223eda14cbcSMatt Macy * this will take since userspace will trigger fs mounts asynchronously
224eda14cbcSMatt Macy * and the kernel will create zvol minors asynchronously. As a result,
225eda14cbcSMatt Macy * the value provided here is a bit arbitrary, but represents a
226eda14cbcSMatt Macy * reasonable estimate of how many txgs it will take to finish fully
227eda14cbcSMatt Macy * importing a pool
228eda14cbcSMatt Macy */
229eda14cbcSMatt Macy #define SCAN_IMPORT_WAIT_TXGS 5
230eda14cbcSMatt Macy
231eda14cbcSMatt Macy #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
232eda14cbcSMatt Macy ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
233eda14cbcSMatt Macy (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
234eda14cbcSMatt Macy
23517aab35aSMartin Matuska #define DSL_SCAN_IS_SCRUB(scn) \
23617aab35aSMartin Matuska ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB)
23717aab35aSMartin Matuska
238*3a896071SMartin Matuska #define DSL_SCAN_IS_RESILVER(scn) \
239*3a896071SMartin Matuska ((scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
240*3a896071SMartin Matuska
241eda14cbcSMatt Macy /*
242eda14cbcSMatt Macy * Enable/disable the processing of the free_bpobj object.
243eda14cbcSMatt Macy */
244e92ffd9bSMartin Matuska static int zfs_free_bpobj_enabled = 1;
245eda14cbcSMatt Macy
246c0a83fe0SMartin Matuska /* Error blocks to be scrubbed in one txg. */
2474e8d558cSMartin Matuska static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
248c0a83fe0SMartin Matuska
249eda14cbcSMatt Macy /* the order has to match pool_scan_type */
250eda14cbcSMatt Macy static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
251eda14cbcSMatt Macy NULL,
252eda14cbcSMatt Macy dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
253eda14cbcSMatt Macy dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
254eda14cbcSMatt Macy };
255eda14cbcSMatt Macy
256eda14cbcSMatt Macy /* In core node for the scn->scn_queue. Represents a dataset to be scanned */
257eda14cbcSMatt Macy typedef struct {
258eda14cbcSMatt Macy uint64_t sds_dsobj;
259eda14cbcSMatt Macy uint64_t sds_txg;
260eda14cbcSMatt Macy avl_node_t sds_node;
261eda14cbcSMatt Macy } scan_ds_t;
262eda14cbcSMatt Macy
263eda14cbcSMatt Macy /*
264eda14cbcSMatt Macy * This controls what conditions are placed on dsl_scan_sync_state():
265a0b956f5SMartin Matuska * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
266a0b956f5SMartin Matuska * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
267a0b956f5SMartin Matuska * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
268eda14cbcSMatt Macy * write out the scn_phys_cached version.
269eda14cbcSMatt Macy * See dsl_scan_sync_state for details.
270eda14cbcSMatt Macy */
271eda14cbcSMatt Macy typedef enum {
272eda14cbcSMatt Macy SYNC_OPTIONAL,
273eda14cbcSMatt Macy SYNC_MANDATORY,
274eda14cbcSMatt Macy SYNC_CACHED
275eda14cbcSMatt Macy } state_sync_type_t;
276eda14cbcSMatt Macy
277eda14cbcSMatt Macy /*
278eda14cbcSMatt Macy * This struct represents the minimum information needed to reconstruct a
279eda14cbcSMatt Macy * zio for sequential scanning. This is useful because many of these will
280eda14cbcSMatt Macy * accumulate in the sequential IO queues before being issued, so saving
281eda14cbcSMatt Macy * memory matters here.
282eda14cbcSMatt Macy */
283eda14cbcSMatt Macy typedef struct scan_io {
284eda14cbcSMatt Macy /* fields from blkptr_t */
285eda14cbcSMatt Macy uint64_t sio_blk_prop;
286eda14cbcSMatt Macy uint64_t sio_phys_birth;
287eda14cbcSMatt Macy uint64_t sio_birth;
288eda14cbcSMatt Macy zio_cksum_t sio_cksum;
289eda14cbcSMatt Macy uint32_t sio_nr_dvas;
290eda14cbcSMatt Macy
291eda14cbcSMatt Macy /* fields from zio_t */
292eda14cbcSMatt Macy uint32_t sio_flags;
293eda14cbcSMatt Macy zbookmark_phys_t sio_zb;
294eda14cbcSMatt Macy
295eda14cbcSMatt Macy /* members for queue sorting */
296eda14cbcSMatt Macy union {
297eda14cbcSMatt Macy avl_node_t sio_addr_node; /* link into issuing queue */
298eda14cbcSMatt Macy list_node_t sio_list_node; /* link for issuing to disk */
299eda14cbcSMatt Macy } sio_nodes;
300eda14cbcSMatt Macy
301eda14cbcSMatt Macy /*
302eda14cbcSMatt Macy * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
303eda14cbcSMatt Macy * depending on how many were in the original bp. Only the
304eda14cbcSMatt Macy * first DVA is really used for sorting and issuing purposes.
305eda14cbcSMatt Macy * The other DVAs (if provided) simply exist so that the zio
306eda14cbcSMatt Macy * layer can find additional copies to repair from in the
307eda14cbcSMatt Macy * event of an error. This array must go at the end of the
308eda14cbcSMatt Macy * struct to allow this for the variable number of elements.
309eda14cbcSMatt Macy */
31015f0b8c3SMartin Matuska dva_t sio_dva[];
311eda14cbcSMatt Macy } scan_io_t;
312eda14cbcSMatt Macy
313eda14cbcSMatt Macy #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
314eda14cbcSMatt Macy #define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
315eda14cbcSMatt Macy #define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
316eda14cbcSMatt Macy #define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
317eda14cbcSMatt Macy #define SIO_GET_END_OFFSET(sio) \
318eda14cbcSMatt Macy (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
319eda14cbcSMatt Macy #define SIO_GET_MUSED(sio) \
320eda14cbcSMatt Macy (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
321eda14cbcSMatt Macy
322eda14cbcSMatt Macy struct dsl_scan_io_queue {
323eda14cbcSMatt Macy dsl_scan_t *q_scn; /* associated dsl_scan_t */
324eda14cbcSMatt Macy vdev_t *q_vd; /* top-level vdev that this queue represents */
3251f1e2261SMartin Matuska zio_t *q_zio; /* scn_zio_root child for waiting on IO */
326eda14cbcSMatt Macy
327eda14cbcSMatt Macy /* trees used for sorting I/Os and extents of I/Os */
328b59a0cdeSMartin Matuska zfs_range_tree_t *q_exts_by_addr;
329eda14cbcSMatt Macy zfs_btree_t q_exts_by_size;
330eda14cbcSMatt Macy avl_tree_t q_sios_by_addr;
331eda14cbcSMatt Macy uint64_t q_sio_memused;
332a0b956f5SMartin Matuska uint64_t q_last_ext_addr;
333eda14cbcSMatt Macy
334eda14cbcSMatt Macy /* members for zio rate limiting */
335eda14cbcSMatt Macy uint64_t q_maxinflight_bytes;
336eda14cbcSMatt Macy uint64_t q_inflight_bytes;
337eda14cbcSMatt Macy kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
338eda14cbcSMatt Macy
339eda14cbcSMatt Macy /* per txg statistics */
340eda14cbcSMatt Macy uint64_t q_total_seg_size_this_txg;
341eda14cbcSMatt Macy uint64_t q_segs_this_txg;
342eda14cbcSMatt Macy uint64_t q_total_zio_size_this_txg;
343eda14cbcSMatt Macy uint64_t q_zios_this_txg;
344eda14cbcSMatt Macy };
345eda14cbcSMatt Macy
346eda14cbcSMatt Macy /* private data for dsl_scan_prefetch_cb() */
347eda14cbcSMatt Macy typedef struct scan_prefetch_ctx {
348eda14cbcSMatt Macy zfs_refcount_t spc_refcnt; /* refcount for memory management */
349eda14cbcSMatt Macy dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
350eda14cbcSMatt Macy boolean_t spc_root; /* is this prefetch for an objset? */
351eda14cbcSMatt Macy uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
352eda14cbcSMatt Macy uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
353eda14cbcSMatt Macy } scan_prefetch_ctx_t;
354eda14cbcSMatt Macy
355eda14cbcSMatt Macy /* private data for dsl_scan_prefetch() */
356eda14cbcSMatt Macy typedef struct scan_prefetch_issue_ctx {
357eda14cbcSMatt Macy avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
358eda14cbcSMatt Macy scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
359eda14cbcSMatt Macy blkptr_t spic_bp; /* bp to prefetch */
360eda14cbcSMatt Macy zbookmark_phys_t spic_zb; /* bookmark to prefetch */
361eda14cbcSMatt Macy } scan_prefetch_issue_ctx_t;
362eda14cbcSMatt Macy
363eda14cbcSMatt Macy static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
364eda14cbcSMatt Macy const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
365eda14cbcSMatt Macy static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
366eda14cbcSMatt Macy scan_io_t *sio);
367eda14cbcSMatt Macy
368eda14cbcSMatt Macy static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
369eda14cbcSMatt Macy static void scan_io_queues_destroy(dsl_scan_t *scn);
370eda14cbcSMatt Macy
371eda14cbcSMatt Macy static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
372eda14cbcSMatt Macy
373eda14cbcSMatt Macy /* sio->sio_nr_dvas must be set so we know which cache to free from */
374eda14cbcSMatt Macy static void
sio_free(scan_io_t * sio)375eda14cbcSMatt Macy sio_free(scan_io_t *sio)
376eda14cbcSMatt Macy {
377eda14cbcSMatt Macy ASSERT3U(sio->sio_nr_dvas, >, 0);
378eda14cbcSMatt Macy ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
379eda14cbcSMatt Macy
380eda14cbcSMatt Macy kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
381eda14cbcSMatt Macy }
382eda14cbcSMatt Macy
383eda14cbcSMatt Macy /* It is up to the caller to set sio->sio_nr_dvas for freeing */
384eda14cbcSMatt Macy static scan_io_t *
sio_alloc(unsigned short nr_dvas)385eda14cbcSMatt Macy sio_alloc(unsigned short nr_dvas)
386eda14cbcSMatt Macy {
387eda14cbcSMatt Macy ASSERT3U(nr_dvas, >, 0);
388eda14cbcSMatt Macy ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
389eda14cbcSMatt Macy
390eda14cbcSMatt Macy return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
391eda14cbcSMatt Macy }
392eda14cbcSMatt Macy
393eda14cbcSMatt Macy void
scan_init(void)394eda14cbcSMatt Macy scan_init(void)
395eda14cbcSMatt Macy {
396eda14cbcSMatt Macy /*
397eda14cbcSMatt Macy * This is used in ext_size_compare() to weight segments
398eda14cbcSMatt Macy * based on how sparse they are. This cannot be changed
399eda14cbcSMatt Macy * mid-scan and the tree comparison functions don't currently
400eda14cbcSMatt Macy * have a mechanism for passing additional context to the
401eda14cbcSMatt Macy * compare functions. Thus we store this value globally and
402eda14cbcSMatt Macy * we only allow it to be set at module initialization time
403eda14cbcSMatt Macy */
404eda14cbcSMatt Macy fill_weight = zfs_scan_fill_weight;
405eda14cbcSMatt Macy
406eda14cbcSMatt Macy for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
407eda14cbcSMatt Macy char name[36];
408eda14cbcSMatt Macy
409eda14cbcSMatt Macy (void) snprintf(name, sizeof (name), "sio_cache_%d", i);
410eda14cbcSMatt Macy sio_cache[i] = kmem_cache_create(name,
411eda14cbcSMatt Macy (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
412eda14cbcSMatt Macy 0, NULL, NULL, NULL, NULL, NULL, 0);
413eda14cbcSMatt Macy }
414eda14cbcSMatt Macy }
415eda14cbcSMatt Macy
416eda14cbcSMatt Macy void
scan_fini(void)417eda14cbcSMatt Macy scan_fini(void)
418eda14cbcSMatt Macy {
419eda14cbcSMatt Macy for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
420eda14cbcSMatt Macy kmem_cache_destroy(sio_cache[i]);
421eda14cbcSMatt Macy }
422eda14cbcSMatt Macy }
423eda14cbcSMatt Macy
424eda14cbcSMatt Macy static inline boolean_t
dsl_scan_is_running(const dsl_scan_t * scn)425eda14cbcSMatt Macy dsl_scan_is_running(const dsl_scan_t *scn)
426eda14cbcSMatt Macy {
427eda14cbcSMatt Macy return (scn->scn_phys.scn_state == DSS_SCANNING);
428eda14cbcSMatt Macy }
429eda14cbcSMatt Macy
430eda14cbcSMatt Macy boolean_t
dsl_scan_resilvering(dsl_pool_t * dp)431eda14cbcSMatt Macy dsl_scan_resilvering(dsl_pool_t *dp)
432eda14cbcSMatt Macy {
433eda14cbcSMatt Macy return (dsl_scan_is_running(dp->dp_scan) &&
434eda14cbcSMatt Macy dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
435eda14cbcSMatt Macy }
436eda14cbcSMatt Macy
437eda14cbcSMatt Macy static inline void
sio2bp(const scan_io_t * sio,blkptr_t * bp)438eda14cbcSMatt Macy sio2bp(const scan_io_t *sio, blkptr_t *bp)
439eda14cbcSMatt Macy {
440da5137abSMartin Matuska memset(bp, 0, sizeof (*bp));
441eda14cbcSMatt Macy bp->blk_prop = sio->sio_blk_prop;
442783d3ff6SMartin Matuska BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
443783d3ff6SMartin Matuska BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
444eda14cbcSMatt Macy bp->blk_fill = 1; /* we always only work with data pointers */
445eda14cbcSMatt Macy bp->blk_cksum = sio->sio_cksum;
446eda14cbcSMatt Macy
447eda14cbcSMatt Macy ASSERT3U(sio->sio_nr_dvas, >, 0);
448eda14cbcSMatt Macy ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
449eda14cbcSMatt Macy
450da5137abSMartin Matuska memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t));
451eda14cbcSMatt Macy }
452eda14cbcSMatt Macy
453eda14cbcSMatt Macy static inline void
bp2sio(const blkptr_t * bp,scan_io_t * sio,int dva_i)454eda14cbcSMatt Macy bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
455eda14cbcSMatt Macy {
456eda14cbcSMatt Macy sio->sio_blk_prop = bp->blk_prop;
457783d3ff6SMartin Matuska sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
458783d3ff6SMartin Matuska sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
459eda14cbcSMatt Macy sio->sio_cksum = bp->blk_cksum;
460eda14cbcSMatt Macy sio->sio_nr_dvas = BP_GET_NDVAS(bp);
461eda14cbcSMatt Macy
462eda14cbcSMatt Macy /*
463eda14cbcSMatt Macy * Copy the DVAs to the sio. We need all copies of the block so
464eda14cbcSMatt Macy * that the self healing code can use the alternate copies if the
465eda14cbcSMatt Macy * first is corrupted. We want the DVA at index dva_i to be first
466eda14cbcSMatt Macy * in the sio since this is the primary one that we want to issue.
467eda14cbcSMatt Macy */
468eda14cbcSMatt Macy for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
469eda14cbcSMatt Macy sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
470eda14cbcSMatt Macy }
471eda14cbcSMatt Macy }
472eda14cbcSMatt Macy
473eda14cbcSMatt Macy int
dsl_scan_init(dsl_pool_t * dp,uint64_t txg)474eda14cbcSMatt Macy dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
475eda14cbcSMatt Macy {
476eda14cbcSMatt Macy int err;
477eda14cbcSMatt Macy dsl_scan_t *scn;
478eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
479eda14cbcSMatt Macy uint64_t f;
480eda14cbcSMatt Macy
481eda14cbcSMatt Macy scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
482eda14cbcSMatt Macy scn->scn_dp = dp;
483eda14cbcSMatt Macy
484eda14cbcSMatt Macy /*
485eda14cbcSMatt Macy * It's possible that we're resuming a scan after a reboot so
486eda14cbcSMatt Macy * make sure that the scan_async_destroying flag is initialized
487eda14cbcSMatt Macy * appropriately.
488eda14cbcSMatt Macy */
489eda14cbcSMatt Macy ASSERT(!scn->scn_async_destroying);
490eda14cbcSMatt Macy scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
491eda14cbcSMatt Macy SPA_FEATURE_ASYNC_DESTROY);
492eda14cbcSMatt Macy
493eda14cbcSMatt Macy /*
494eda14cbcSMatt Macy * Calculate the max number of in-flight bytes for pool-wide
495c9539b89SMartin Matuska * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
496c9539b89SMartin Matuska * Limits for the issuing phase are done per top-level vdev and
497c9539b89SMartin Matuska * are handled separately.
498eda14cbcSMatt Macy */
499c9539b89SMartin Matuska scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
500c9539b89SMartin Matuska zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
501eda14cbcSMatt Macy
502eda14cbcSMatt Macy avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
503eda14cbcSMatt Macy offsetof(scan_ds_t, sds_node));
50449086aa3SAlexander Motin mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
505eda14cbcSMatt Macy avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
506eda14cbcSMatt Macy sizeof (scan_prefetch_issue_ctx_t),
507eda14cbcSMatt Macy offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
508eda14cbcSMatt Macy
509eda14cbcSMatt Macy err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
510eda14cbcSMatt Macy "scrub_func", sizeof (uint64_t), 1, &f);
511eda14cbcSMatt Macy if (err == 0) {
512eda14cbcSMatt Macy /*
513eda14cbcSMatt Macy * There was an old-style scrub in progress. Restart a
514eda14cbcSMatt Macy * new-style scrub from the beginning.
515eda14cbcSMatt Macy */
516eda14cbcSMatt Macy scn->scn_restart_txg = txg;
51781b22a98SMartin Matuska zfs_dbgmsg("old-style scrub was in progress for %s; "
518eda14cbcSMatt Macy "restarting new-style scrub in txg %llu",
51981b22a98SMartin Matuska spa->spa_name,
520eda14cbcSMatt Macy (longlong_t)scn->scn_restart_txg);
521eda14cbcSMatt Macy
522eda14cbcSMatt Macy /*
523eda14cbcSMatt Macy * Load the queue obj from the old location so that it
524eda14cbcSMatt Macy * can be freed by dsl_scan_done().
525eda14cbcSMatt Macy */
526eda14cbcSMatt Macy (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
527eda14cbcSMatt Macy "scrub_queue", sizeof (uint64_t), 1,
528eda14cbcSMatt Macy &scn->scn_phys.scn_queue_obj);
529eda14cbcSMatt Macy } else {
530eda14cbcSMatt Macy err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
531c0a83fe0SMartin Matuska DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
532c0a83fe0SMartin Matuska ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
533c0a83fe0SMartin Matuska
534c0a83fe0SMartin Matuska if (err != 0 && err != ENOENT)
535c0a83fe0SMartin Matuska return (err);
536c0a83fe0SMartin Matuska
537c0a83fe0SMartin Matuska err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
538eda14cbcSMatt Macy DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
539eda14cbcSMatt Macy &scn->scn_phys);
540c0a83fe0SMartin Matuska
541eda14cbcSMatt Macy /*
542eda14cbcSMatt Macy * Detect if the pool contains the signature of #2094. If it
543eda14cbcSMatt Macy * does properly update the scn->scn_phys structure and notify
544eda14cbcSMatt Macy * the administrator by setting an errata for the pool.
545eda14cbcSMatt Macy */
546eda14cbcSMatt Macy if (err == EOVERFLOW) {
547eda14cbcSMatt Macy uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
548eda14cbcSMatt Macy VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
549eda14cbcSMatt Macy VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
550eda14cbcSMatt Macy (23 * sizeof (uint64_t)));
551eda14cbcSMatt Macy
552eda14cbcSMatt Macy err = zap_lookup(dp->dp_meta_objset,
553eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
554eda14cbcSMatt Macy sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
555eda14cbcSMatt Macy if (err == 0) {
556eda14cbcSMatt Macy uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
557eda14cbcSMatt Macy
558eda14cbcSMatt Macy if (overflow & ~DSL_SCAN_FLAGS_MASK ||
559eda14cbcSMatt Macy scn->scn_async_destroying) {
560eda14cbcSMatt Macy spa->spa_errata =
561eda14cbcSMatt Macy ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
562eda14cbcSMatt Macy return (EOVERFLOW);
563eda14cbcSMatt Macy }
564eda14cbcSMatt Macy
565da5137abSMartin Matuska memcpy(&scn->scn_phys, zaptmp,
566eda14cbcSMatt Macy SCAN_PHYS_NUMINTS * sizeof (uint64_t));
567eda14cbcSMatt Macy scn->scn_phys.scn_flags = overflow;
568eda14cbcSMatt Macy
569eda14cbcSMatt Macy /* Required scrub already in progress. */
570eda14cbcSMatt Macy if (scn->scn_phys.scn_state == DSS_FINISHED ||
571eda14cbcSMatt Macy scn->scn_phys.scn_state == DSS_CANCELED)
572eda14cbcSMatt Macy spa->spa_errata =
573eda14cbcSMatt Macy ZPOOL_ERRATA_ZOL_2094_SCRUB;
574eda14cbcSMatt Macy }
575eda14cbcSMatt Macy }
576eda14cbcSMatt Macy
577eda14cbcSMatt Macy if (err == ENOENT)
578eda14cbcSMatt Macy return (0);
579eda14cbcSMatt Macy else if (err)
580eda14cbcSMatt Macy return (err);
581eda14cbcSMatt Macy
582eda14cbcSMatt Macy /*
583eda14cbcSMatt Macy * We might be restarting after a reboot, so jump the issued
584eda14cbcSMatt Macy * counter to how far we've scanned. We know we're consistent
585eda14cbcSMatt Macy * up to here.
586eda14cbcSMatt Macy */
5870a97523dSMartin Matuska scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
5880a97523dSMartin Matuska scn->scn_phys.scn_skipped;
589eda14cbcSMatt Macy
590eda14cbcSMatt Macy if (dsl_scan_is_running(scn) &&
591eda14cbcSMatt Macy spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
592eda14cbcSMatt Macy /*
593eda14cbcSMatt Macy * A new-type scrub was in progress on an old
594eda14cbcSMatt Macy * pool, and the pool was accessed by old
595eda14cbcSMatt Macy * software. Restart from the beginning, since
596eda14cbcSMatt Macy * the old software may have changed the pool in
597eda14cbcSMatt Macy * the meantime.
598eda14cbcSMatt Macy */
599eda14cbcSMatt Macy scn->scn_restart_txg = txg;
60081b22a98SMartin Matuska zfs_dbgmsg("new-style scrub for %s was modified "
601eda14cbcSMatt Macy "by old software; restarting in txg %llu",
60281b22a98SMartin Matuska spa->spa_name,
603eda14cbcSMatt Macy (longlong_t)scn->scn_restart_txg);
604eda14cbcSMatt Macy } else if (dsl_scan_resilvering(dp)) {
605eda14cbcSMatt Macy /*
606eda14cbcSMatt Macy * If a resilver is in progress and there are already
607eda14cbcSMatt Macy * errors, restart it instead of finishing this scan and
608eda14cbcSMatt Macy * then restarting it. If there haven't been any errors
609eda14cbcSMatt Macy * then remember that the incore DTL is valid.
610eda14cbcSMatt Macy */
611eda14cbcSMatt Macy if (scn->scn_phys.scn_errors > 0) {
612eda14cbcSMatt Macy scn->scn_restart_txg = txg;
613eda14cbcSMatt Macy zfs_dbgmsg("resilver can't excise DTL_MISSING "
61481b22a98SMartin Matuska "when finished; restarting on %s in txg "
61581b22a98SMartin Matuska "%llu",
61681b22a98SMartin Matuska spa->spa_name,
617eda14cbcSMatt Macy (u_longlong_t)scn->scn_restart_txg);
618eda14cbcSMatt Macy } else {
619eda14cbcSMatt Macy /* it's safe to excise DTL when finished */
620eda14cbcSMatt Macy spa->spa_scrub_started = B_TRUE;
621eda14cbcSMatt Macy }
622eda14cbcSMatt Macy }
623eda14cbcSMatt Macy }
624eda14cbcSMatt Macy
625da5137abSMartin Matuska memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
626eda14cbcSMatt Macy
627eda14cbcSMatt Macy /* reload the queue into the in-core state */
628eda14cbcSMatt Macy if (scn->scn_phys.scn_queue_obj != 0) {
629eda14cbcSMatt Macy zap_cursor_t zc;
6307a7741afSMartin Matuska zap_attribute_t *za = zap_attribute_alloc();
631eda14cbcSMatt Macy
632eda14cbcSMatt Macy for (zap_cursor_init(&zc, dp->dp_meta_objset,
633eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj);
6347a7741afSMartin Matuska zap_cursor_retrieve(&zc, za) == 0;
635eda14cbcSMatt Macy (void) zap_cursor_advance(&zc)) {
636eda14cbcSMatt Macy scan_ds_queue_insert(scn,
6377a7741afSMartin Matuska zfs_strtonum(za->za_name, NULL),
6387a7741afSMartin Matuska za->za_first_integer);
639eda14cbcSMatt Macy }
640eda14cbcSMatt Macy zap_cursor_fini(&zc);
6417a7741afSMartin Matuska zap_attribute_free(za);
642eda14cbcSMatt Macy }
643eda14cbcSMatt Macy
644e2df9bb4SMartin Matuska ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
645e2df9bb4SMartin Matuska
646eda14cbcSMatt Macy spa_scan_stat_init(spa);
647c9539b89SMartin Matuska vdev_scan_stat_init(spa->spa_root_vdev);
648c9539b89SMartin Matuska
649eda14cbcSMatt Macy return (0);
650eda14cbcSMatt Macy }
651eda14cbcSMatt Macy
652eda14cbcSMatt Macy void
dsl_scan_fini(dsl_pool_t * dp)653eda14cbcSMatt Macy dsl_scan_fini(dsl_pool_t *dp)
654eda14cbcSMatt Macy {
655eda14cbcSMatt Macy if (dp->dp_scan != NULL) {
656eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
657eda14cbcSMatt Macy
658eda14cbcSMatt Macy if (scn->scn_taskq != NULL)
659eda14cbcSMatt Macy taskq_destroy(scn->scn_taskq);
660eda14cbcSMatt Macy
661eda14cbcSMatt Macy scan_ds_queue_clear(scn);
662eda14cbcSMatt Macy avl_destroy(&scn->scn_queue);
66349086aa3SAlexander Motin mutex_destroy(&scn->scn_queue_lock);
664eda14cbcSMatt Macy scan_ds_prefetch_queue_clear(scn);
665eda14cbcSMatt Macy avl_destroy(&scn->scn_prefetch_queue);
666eda14cbcSMatt Macy
667eda14cbcSMatt Macy kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
668eda14cbcSMatt Macy dp->dp_scan = NULL;
669eda14cbcSMatt Macy }
670eda14cbcSMatt Macy }
671eda14cbcSMatt Macy
672eda14cbcSMatt Macy static boolean_t
dsl_scan_restarting(dsl_scan_t * scn,dmu_tx_t * tx)673eda14cbcSMatt Macy dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
674eda14cbcSMatt Macy {
675eda14cbcSMatt Macy return (scn->scn_restart_txg != 0 &&
676eda14cbcSMatt Macy scn->scn_restart_txg <= tx->tx_txg);
677eda14cbcSMatt Macy }
678eda14cbcSMatt Macy
679eda14cbcSMatt Macy boolean_t
dsl_scan_resilver_scheduled(dsl_pool_t * dp)680eda14cbcSMatt Macy dsl_scan_resilver_scheduled(dsl_pool_t *dp)
681eda14cbcSMatt Macy {
682eda14cbcSMatt Macy return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
683eda14cbcSMatt Macy (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
684eda14cbcSMatt Macy }
685eda14cbcSMatt Macy
686eda14cbcSMatt Macy boolean_t
dsl_scan_scrubbing(const dsl_pool_t * dp)687eda14cbcSMatt Macy dsl_scan_scrubbing(const dsl_pool_t *dp)
688eda14cbcSMatt Macy {
689eda14cbcSMatt Macy dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
690eda14cbcSMatt Macy
691eda14cbcSMatt Macy return (scn_phys->scn_state == DSS_SCANNING &&
692eda14cbcSMatt Macy scn_phys->scn_func == POOL_SCAN_SCRUB);
693eda14cbcSMatt Macy }
694eda14cbcSMatt Macy
695eda14cbcSMatt Macy boolean_t
dsl_errorscrubbing(const dsl_pool_t * dp)696c0a83fe0SMartin Matuska dsl_errorscrubbing(const dsl_pool_t *dp)
697c0a83fe0SMartin Matuska {
698c0a83fe0SMartin Matuska dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys;
699c0a83fe0SMartin Matuska
700c0a83fe0SMartin Matuska return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING &&
701c0a83fe0SMartin Matuska errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB);
702c0a83fe0SMartin Matuska }
703c0a83fe0SMartin Matuska
704c0a83fe0SMartin Matuska boolean_t
dsl_errorscrub_is_paused(const dsl_scan_t * scn)705c0a83fe0SMartin Matuska dsl_errorscrub_is_paused(const dsl_scan_t *scn)
706c0a83fe0SMartin Matuska {
707c0a83fe0SMartin Matuska return (dsl_errorscrubbing(scn->scn_dp) &&
708c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_paused_flags);
709c0a83fe0SMartin Matuska }
710c0a83fe0SMartin Matuska
711c0a83fe0SMartin Matuska boolean_t
dsl_scan_is_paused_scrub(const dsl_scan_t * scn)712eda14cbcSMatt Macy dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
713eda14cbcSMatt Macy {
714eda14cbcSMatt Macy return (dsl_scan_scrubbing(scn->scn_dp) &&
715eda14cbcSMatt Macy scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
716eda14cbcSMatt Macy }
717eda14cbcSMatt Macy
718c0a83fe0SMartin Matuska static void
dsl_errorscrub_sync_state(dsl_scan_t * scn,dmu_tx_t * tx)719c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
720c0a83fe0SMartin Matuska {
721c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_cursor =
722c0a83fe0SMartin Matuska zap_cursor_serialize(&scn->errorscrub_cursor);
723c0a83fe0SMartin Matuska
724c0a83fe0SMartin Matuska VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
725c0a83fe0SMartin Matuska DMU_POOL_DIRECTORY_OBJECT,
726c0a83fe0SMartin Matuska DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
727c0a83fe0SMartin Matuska &scn->errorscrub_phys, tx));
728c0a83fe0SMartin Matuska }
729c0a83fe0SMartin Matuska
730c0a83fe0SMartin Matuska static void
dsl_errorscrub_setup_sync(void * arg,dmu_tx_t * tx)731c0a83fe0SMartin Matuska dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
732c0a83fe0SMartin Matuska {
733c0a83fe0SMartin Matuska dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
734c0a83fe0SMartin Matuska pool_scan_func_t *funcp = arg;
735c0a83fe0SMartin Matuska dsl_pool_t *dp = scn->scn_dp;
736c0a83fe0SMartin Matuska spa_t *spa = dp->dp_spa;
737c0a83fe0SMartin Matuska
738c0a83fe0SMartin Matuska ASSERT(!dsl_scan_is_running(scn));
739c0a83fe0SMartin Matuska ASSERT(!dsl_errorscrubbing(scn->scn_dp));
740c0a83fe0SMartin Matuska ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
741c0a83fe0SMartin Matuska
742c0a83fe0SMartin Matuska memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
743c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_func = *funcp;
744c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
745c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_start_time = gethrestime_sec();
746c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
747c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_examined = 0;
748c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_errors = 0;
749c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_cursor = 0;
750c0a83fe0SMartin Matuska zap_cursor_init_serialized(&scn->errorscrub_cursor,
751c0a83fe0SMartin Matuska spa->spa_meta_objset, spa->spa_errlog_last,
752c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_cursor);
753c0a83fe0SMartin Matuska
754c0a83fe0SMartin Matuska vdev_config_dirty(spa->spa_root_vdev);
755c0a83fe0SMartin Matuska spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
756c0a83fe0SMartin Matuska
757c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
758c0a83fe0SMartin Matuska
759c0a83fe0SMartin Matuska spa_history_log_internal(spa, "error scrub setup", tx,
760c0a83fe0SMartin Matuska "func=%u mintxg=%u maxtxg=%llu",
761c0a83fe0SMartin Matuska *funcp, 0, (u_longlong_t)tx->tx_txg);
762c0a83fe0SMartin Matuska }
763c0a83fe0SMartin Matuska
764c0a83fe0SMartin Matuska static int
dsl_errorscrub_setup_check(void * arg,dmu_tx_t * tx)765c0a83fe0SMartin Matuska dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
766c0a83fe0SMartin Matuska {
767c0a83fe0SMartin Matuska (void) arg;
768c0a83fe0SMartin Matuska dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
769c0a83fe0SMartin Matuska
770c0a83fe0SMartin Matuska if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) {
771c0a83fe0SMartin Matuska return (SET_ERROR(EBUSY));
772c0a83fe0SMartin Matuska }
773c0a83fe0SMartin Matuska
774c0a83fe0SMartin Matuska if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) {
775c0a83fe0SMartin Matuska return (ECANCELED);
776c0a83fe0SMartin Matuska }
777c0a83fe0SMartin Matuska return (0);
778c0a83fe0SMartin Matuska }
779c0a83fe0SMartin Matuska
780eda14cbcSMatt Macy /*
781eda14cbcSMatt Macy * Writes out a persistent dsl_scan_phys_t record to the pool directory.
782eda14cbcSMatt Macy * Because we can be running in the block sorting algorithm, we do not always
783eda14cbcSMatt Macy * want to write out the record, only when it is "safe" to do so. This safety
784eda14cbcSMatt Macy * condition is achieved by making sure that the sorting queues are empty
785a0b956f5SMartin Matuska * (scn_queues_pending == 0). When this condition is not true, the sync'd state
786eda14cbcSMatt Macy * is inconsistent with how much actual scanning progress has been made. The
787eda14cbcSMatt Macy * kind of sync to be performed is specified by the sync_type argument. If the
788eda14cbcSMatt Macy * sync is optional, we only sync if the queues are empty. If the sync is
789eda14cbcSMatt Macy * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
790eda14cbcSMatt Macy * third possible state is a "cached" sync. This is done in response to:
791eda14cbcSMatt Macy * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
792eda14cbcSMatt Macy * destroyed, so we wouldn't be able to restart scanning from it.
793eda14cbcSMatt Macy * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
794eda14cbcSMatt Macy * superseded by a newer snapshot.
795eda14cbcSMatt Macy * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
796eda14cbcSMatt Macy * swapped with its clone.
797eda14cbcSMatt Macy * In all cases, a cached sync simply rewrites the last record we've written,
798eda14cbcSMatt Macy * just slightly modified. For the modifications that are performed to the
799eda14cbcSMatt Macy * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
800eda14cbcSMatt Macy * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
801eda14cbcSMatt Macy */
802eda14cbcSMatt Macy static void
dsl_scan_sync_state(dsl_scan_t * scn,dmu_tx_t * tx,state_sync_type_t sync_type)803eda14cbcSMatt Macy dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
804eda14cbcSMatt Macy {
805eda14cbcSMatt Macy int i;
806eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
807eda14cbcSMatt Macy
808a0b956f5SMartin Matuska ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
809a0b956f5SMartin Matuska if (scn->scn_queues_pending == 0) {
810eda14cbcSMatt Macy for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
811eda14cbcSMatt Macy vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
812eda14cbcSMatt Macy dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
813eda14cbcSMatt Macy
814eda14cbcSMatt Macy if (q == NULL)
815eda14cbcSMatt Macy continue;
816eda14cbcSMatt Macy
817eda14cbcSMatt Macy mutex_enter(&vd->vdev_scan_io_queue_lock);
818eda14cbcSMatt Macy ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
819eda14cbcSMatt Macy ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
820eda14cbcSMatt Macy NULL);
821b59a0cdeSMartin Matuska ASSERT3P(zfs_range_tree_first(q->q_exts_by_addr), ==,
822b59a0cdeSMartin Matuska NULL);
823eda14cbcSMatt Macy mutex_exit(&vd->vdev_scan_io_queue_lock);
824eda14cbcSMatt Macy }
825eda14cbcSMatt Macy
826eda14cbcSMatt Macy if (scn->scn_phys.scn_queue_obj != 0)
827eda14cbcSMatt Macy scan_ds_queue_sync(scn, tx);
828eda14cbcSMatt Macy VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
829eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT,
830eda14cbcSMatt Macy DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
831eda14cbcSMatt Macy &scn->scn_phys, tx));
832da5137abSMartin Matuska memcpy(&scn->scn_phys_cached, &scn->scn_phys,
833eda14cbcSMatt Macy sizeof (scn->scn_phys));
834eda14cbcSMatt Macy
835eda14cbcSMatt Macy if (scn->scn_checkpointing)
83681b22a98SMartin Matuska zfs_dbgmsg("finish scan checkpoint for %s",
83781b22a98SMartin Matuska spa->spa_name);
838eda14cbcSMatt Macy
839eda14cbcSMatt Macy scn->scn_checkpointing = B_FALSE;
840eda14cbcSMatt Macy scn->scn_last_checkpoint = ddi_get_lbolt();
841eda14cbcSMatt Macy } else if (sync_type == SYNC_CACHED) {
842eda14cbcSMatt Macy VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
843eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT,
844eda14cbcSMatt Macy DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
845eda14cbcSMatt Macy &scn->scn_phys_cached, tx));
846eda14cbcSMatt Macy }
847eda14cbcSMatt Macy }
848eda14cbcSMatt Macy
84916038816SMartin Matuska int
dsl_scan_setup_check(void * arg,dmu_tx_t * tx)850eda14cbcSMatt Macy dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
851eda14cbcSMatt Macy {
852e92ffd9bSMartin Matuska (void) arg;
853eda14cbcSMatt Macy dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
854eda14cbcSMatt Macy vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
855eda14cbcSMatt Macy
856c0a83fe0SMartin Matuska if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) ||
857c0a83fe0SMartin Matuska dsl_errorscrubbing(scn->scn_dp))
858eda14cbcSMatt Macy return (SET_ERROR(EBUSY));
859eda14cbcSMatt Macy
860eda14cbcSMatt Macy return (0);
861eda14cbcSMatt Macy }
862eda14cbcSMatt Macy
8637877fdebSMatt Macy void
dsl_scan_setup_sync(void * arg,dmu_tx_t * tx)864eda14cbcSMatt Macy dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
865eda14cbcSMatt Macy {
86617aab35aSMartin Matuska setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg;
867eda14cbcSMatt Macy dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
868eda14cbcSMatt Macy dmu_object_type_t ot = 0;
869eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
870eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
871eda14cbcSMatt Macy
872eda14cbcSMatt Macy ASSERT(!dsl_scan_is_running(scn));
87317aab35aSMartin Matuska ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE);
87417aab35aSMartin Matuska ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS);
875da5137abSMartin Matuska memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
876c0a83fe0SMartin Matuska
877c0a83fe0SMartin Matuska /*
878c0a83fe0SMartin Matuska * If we are starting a fresh scrub, we erase the error scrub
879c0a83fe0SMartin Matuska * information from disk.
880c0a83fe0SMartin Matuska */
881c0a83fe0SMartin Matuska memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
882c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
883c0a83fe0SMartin Matuska
88417aab35aSMartin Matuska scn->scn_phys.scn_func = setup_sync_arg->func;
885eda14cbcSMatt Macy scn->scn_phys.scn_state = DSS_SCANNING;
88617aab35aSMartin Matuska scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart;
88717aab35aSMartin Matuska if (setup_sync_arg->txgend == 0) {
888eda14cbcSMatt Macy scn->scn_phys.scn_max_txg = tx->tx_txg;
88917aab35aSMartin Matuska } else {
89017aab35aSMartin Matuska scn->scn_phys.scn_max_txg = setup_sync_arg->txgend;
89117aab35aSMartin Matuska }
892eda14cbcSMatt Macy scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
893eda14cbcSMatt Macy scn->scn_phys.scn_start_time = gethrestime_sec();
894eda14cbcSMatt Macy scn->scn_phys.scn_errors = 0;
895eda14cbcSMatt Macy scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
896eda14cbcSMatt Macy scn->scn_issued_before_pass = 0;
897eda14cbcSMatt Macy scn->scn_restart_txg = 0;
898eda14cbcSMatt Macy scn->scn_done_txg = 0;
899eda14cbcSMatt Macy scn->scn_last_checkpoint = 0;
900eda14cbcSMatt Macy scn->scn_checkpointing = B_FALSE;
901eda14cbcSMatt Macy spa_scan_stat_init(spa);
902c9539b89SMartin Matuska vdev_scan_stat_init(spa->spa_root_vdev);
903eda14cbcSMatt Macy
904eda14cbcSMatt Macy if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
905eda14cbcSMatt Macy scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
906eda14cbcSMatt Macy
907eda14cbcSMatt Macy /* rewrite all disk labels */
908eda14cbcSMatt Macy vdev_config_dirty(spa->spa_root_vdev);
909eda14cbcSMatt Macy
910eda14cbcSMatt Macy if (vdev_resilver_needed(spa->spa_root_vdev,
911eda14cbcSMatt Macy &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
912eda14cbcSMatt Macy nvlist_t *aux = fnvlist_alloc();
913eda14cbcSMatt Macy fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
914eda14cbcSMatt Macy "healing");
915eda14cbcSMatt Macy spa_event_notify(spa, NULL, aux,
916eda14cbcSMatt Macy ESC_ZFS_RESILVER_START);
917eda14cbcSMatt Macy nvlist_free(aux);
918eda14cbcSMatt Macy } else {
919eda14cbcSMatt Macy spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
920eda14cbcSMatt Macy }
921eda14cbcSMatt Macy
922eda14cbcSMatt Macy spa->spa_scrub_started = B_TRUE;
923eda14cbcSMatt Macy /*
924eda14cbcSMatt Macy * If this is an incremental scrub, limit the DDT scrub phase
925eda14cbcSMatt Macy * to just the auto-ditto class (for correctness); the rest
926eda14cbcSMatt Macy * of the scrub should go faster using top-down pruning.
927eda14cbcSMatt Macy */
928eda14cbcSMatt Macy if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
929eda14cbcSMatt Macy scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
930eda14cbcSMatt Macy
931eda14cbcSMatt Macy /*
932eda14cbcSMatt Macy * When starting a resilver clear any existing rebuild state.
933eda14cbcSMatt Macy * This is required to prevent stale rebuild status from
934eda14cbcSMatt Macy * being reported when a rebuild is run, then a resilver and
935eda14cbcSMatt Macy * finally a scrub. In which case only the scrub status
936eda14cbcSMatt Macy * should be reported by 'zpool status'.
937eda14cbcSMatt Macy */
938eda14cbcSMatt Macy if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
939eda14cbcSMatt Macy vdev_t *rvd = spa->spa_root_vdev;
940eda14cbcSMatt Macy for (uint64_t i = 0; i < rvd->vdev_children; i++) {
941eda14cbcSMatt Macy vdev_t *vd = rvd->vdev_child[i];
942eda14cbcSMatt Macy vdev_rebuild_clear_sync(
943eda14cbcSMatt Macy (void *)(uintptr_t)vd->vdev_id, tx);
944eda14cbcSMatt Macy }
945eda14cbcSMatt Macy }
946eda14cbcSMatt Macy }
947eda14cbcSMatt Macy
948eda14cbcSMatt Macy /* back to the generic stuff */
949eda14cbcSMatt Macy
950a0b956f5SMartin Matuska if (zfs_scan_blkstats) {
951eda14cbcSMatt Macy if (dp->dp_blkstats == NULL) {
952eda14cbcSMatt Macy dp->dp_blkstats =
953eda14cbcSMatt Macy vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
954eda14cbcSMatt Macy }
955da5137abSMartin Matuska memset(&dp->dp_blkstats->zab_type, 0,
956da5137abSMartin Matuska sizeof (dp->dp_blkstats->zab_type));
957a0b956f5SMartin Matuska } else {
958a0b956f5SMartin Matuska if (dp->dp_blkstats) {
959a0b956f5SMartin Matuska vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
960a0b956f5SMartin Matuska dp->dp_blkstats = NULL;
961a0b956f5SMartin Matuska }
962a0b956f5SMartin Matuska }
963eda14cbcSMatt Macy
964eda14cbcSMatt Macy if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
965eda14cbcSMatt Macy ot = DMU_OT_ZAP_OTHER;
966eda14cbcSMatt Macy
967eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
968eda14cbcSMatt Macy ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
969eda14cbcSMatt Macy
970da5137abSMartin Matuska memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
971eda14cbcSMatt Macy
972e2df9bb4SMartin Matuska ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
973e2df9bb4SMartin Matuska
974eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
975eda14cbcSMatt Macy
976eda14cbcSMatt Macy spa_history_log_internal(spa, "scan setup", tx,
977eda14cbcSMatt Macy "func=%u mintxg=%llu maxtxg=%llu",
97817aab35aSMartin Matuska setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg,
979eda14cbcSMatt Macy (u_longlong_t)scn->scn_phys.scn_max_txg);
980eda14cbcSMatt Macy }
981eda14cbcSMatt Macy
982eda14cbcSMatt Macy /*
983c0a83fe0SMartin Matuska * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
984c0a83fe0SMartin Matuska * error scrub or resilver. Can also be called to resume a paused scrub or
985c0a83fe0SMartin Matuska * error scrub.
986eda14cbcSMatt Macy */
987eda14cbcSMatt Macy int
dsl_scan(dsl_pool_t * dp,pool_scan_func_t func,uint64_t txgstart,uint64_t txgend)98817aab35aSMartin Matuska dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart,
98917aab35aSMartin Matuska uint64_t txgend)
990eda14cbcSMatt Macy {
991eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
992eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
99317aab35aSMartin Matuska setup_sync_arg_t setup_sync_arg;
99417aab35aSMartin Matuska
99517aab35aSMartin Matuska if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) {
99617aab35aSMartin Matuska return (EINVAL);
99717aab35aSMartin Matuska }
998eda14cbcSMatt Macy
999eda14cbcSMatt Macy /*
1000eda14cbcSMatt Macy * Purge all vdev caches and probe all devices. We do this here
1001eda14cbcSMatt Macy * rather than in sync context because this requires a writer lock
1002eda14cbcSMatt Macy * on the spa_config lock, which we can't do from sync context. The
1003eda14cbcSMatt Macy * spa_scrub_reopen flag indicates that vdev_open() should not
1004eda14cbcSMatt Macy * attempt to start another scrub.
1005eda14cbcSMatt Macy */
1006eda14cbcSMatt Macy spa_vdev_state_enter(spa, SCL_NONE);
1007eda14cbcSMatt Macy spa->spa_scrub_reopen = B_TRUE;
1008eda14cbcSMatt Macy vdev_reopen(spa->spa_root_vdev);
1009eda14cbcSMatt Macy spa->spa_scrub_reopen = B_FALSE;
1010eda14cbcSMatt Macy (void) spa_vdev_state_exit(spa, NULL, 0);
1011eda14cbcSMatt Macy
1012eda14cbcSMatt Macy if (func == POOL_SCAN_RESILVER) {
1013eda14cbcSMatt Macy dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
1014eda14cbcSMatt Macy return (0);
1015eda14cbcSMatt Macy }
1016eda14cbcSMatt Macy
1017c0a83fe0SMartin Matuska if (func == POOL_SCAN_ERRORSCRUB) {
1018c0a83fe0SMartin Matuska if (dsl_errorscrub_is_paused(dp->dp_scan)) {
1019c0a83fe0SMartin Matuska /*
1020c0a83fe0SMartin Matuska * got error scrub start cmd, resume paused error scrub.
1021c0a83fe0SMartin Matuska */
1022c0a83fe0SMartin Matuska int err = dsl_scrub_set_pause_resume(scn->scn_dp,
1023c0a83fe0SMartin Matuska POOL_SCRUB_NORMAL);
1024c0a83fe0SMartin Matuska if (err == 0) {
1025c0a83fe0SMartin Matuska spa_event_notify(spa, NULL, NULL,
1026c0a83fe0SMartin Matuska ESC_ZFS_ERRORSCRUB_RESUME);
1027b1c1ee44SMartin Matuska return (0);
1028c0a83fe0SMartin Matuska }
1029c0a83fe0SMartin Matuska return (SET_ERROR(err));
1030c0a83fe0SMartin Matuska }
1031c0a83fe0SMartin Matuska
1032c0a83fe0SMartin Matuska return (dsl_sync_task(spa_name(dp->dp_spa),
1033c0a83fe0SMartin Matuska dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
1034c0a83fe0SMartin Matuska &func, 0, ZFS_SPACE_CHECK_RESERVED));
1035c0a83fe0SMartin Matuska }
1036c0a83fe0SMartin Matuska
1037eda14cbcSMatt Macy if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
1038eda14cbcSMatt Macy /* got scrub start cmd, resume paused scrub */
1039eda14cbcSMatt Macy int err = dsl_scrub_set_pause_resume(scn->scn_dp,
1040eda14cbcSMatt Macy POOL_SCRUB_NORMAL);
1041eda14cbcSMatt Macy if (err == 0) {
1042eda14cbcSMatt Macy spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
1043b1c1ee44SMartin Matuska return (0);
1044eda14cbcSMatt Macy }
1045eda14cbcSMatt Macy return (SET_ERROR(err));
1046eda14cbcSMatt Macy }
1047eda14cbcSMatt Macy
104817aab35aSMartin Matuska setup_sync_arg.func = func;
104917aab35aSMartin Matuska setup_sync_arg.txgstart = txgstart;
105017aab35aSMartin Matuska setup_sync_arg.txgend = txgend;
105117aab35aSMartin Matuska
1052eda14cbcSMatt Macy return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
105317aab35aSMartin Matuska dsl_scan_setup_sync, &setup_sync_arg, 0,
105417aab35aSMartin Matuska ZFS_SPACE_CHECK_EXTRA_RESERVED));
1055eda14cbcSMatt Macy }
1056eda14cbcSMatt Macy
1057eda14cbcSMatt Macy static void
dsl_errorscrub_done(dsl_scan_t * scn,boolean_t complete,dmu_tx_t * tx)1058c0a83fe0SMartin Matuska dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
1059c0a83fe0SMartin Matuska {
1060c0a83fe0SMartin Matuska dsl_pool_t *dp = scn->scn_dp;
1061c0a83fe0SMartin Matuska spa_t *spa = dp->dp_spa;
1062c0a83fe0SMartin Matuska
1063c0a83fe0SMartin Matuska if (complete) {
1064c0a83fe0SMartin Matuska spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
1065c0a83fe0SMartin Matuska spa_history_log_internal(spa, "error scrub done", tx,
1066c0a83fe0SMartin Matuska "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
1067c0a83fe0SMartin Matuska } else {
1068c0a83fe0SMartin Matuska spa_history_log_internal(spa, "error scrub canceled", tx,
1069c0a83fe0SMartin Matuska "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
1070c0a83fe0SMartin Matuska }
1071c0a83fe0SMartin Matuska
1072c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED;
1073c0a83fe0SMartin Matuska spa->spa_scrub_active = B_FALSE;
1074c0a83fe0SMartin Matuska spa_errlog_rotate(spa);
1075c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_end_time = gethrestime_sec();
1076c0a83fe0SMartin Matuska zap_cursor_fini(&scn->errorscrub_cursor);
1077c0a83fe0SMartin Matuska
1078c0a83fe0SMartin Matuska if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
1079c0a83fe0SMartin Matuska spa->spa_errata = 0;
1080c0a83fe0SMartin Matuska
1081c0a83fe0SMartin Matuska ASSERT(!dsl_errorscrubbing(scn->scn_dp));
1082c0a83fe0SMartin Matuska }
1083c0a83fe0SMartin Matuska
1084c0a83fe0SMartin Matuska static void
dsl_scan_done(dsl_scan_t * scn,boolean_t complete,dmu_tx_t * tx)1085eda14cbcSMatt Macy dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
1086eda14cbcSMatt Macy {
1087eda14cbcSMatt Macy static const char *old_names[] = {
1088eda14cbcSMatt Macy "scrub_bookmark",
1089eda14cbcSMatt Macy "scrub_ddt_bookmark",
1090eda14cbcSMatt Macy "scrub_ddt_class_max",
1091eda14cbcSMatt Macy "scrub_queue",
1092eda14cbcSMatt Macy "scrub_min_txg",
1093eda14cbcSMatt Macy "scrub_max_txg",
1094eda14cbcSMatt Macy "scrub_func",
1095eda14cbcSMatt Macy "scrub_errors",
1096eda14cbcSMatt Macy NULL
1097eda14cbcSMatt Macy };
1098eda14cbcSMatt Macy
1099eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
1100eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
1101eda14cbcSMatt Macy int i;
1102eda14cbcSMatt Macy
1103eda14cbcSMatt Macy /* Remove any remnants of an old-style scrub. */
1104eda14cbcSMatt Macy for (i = 0; old_names[i]; i++) {
1105eda14cbcSMatt Macy (void) zap_remove(dp->dp_meta_objset,
1106eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
1107eda14cbcSMatt Macy }
1108eda14cbcSMatt Macy
1109eda14cbcSMatt Macy if (scn->scn_phys.scn_queue_obj != 0) {
1110eda14cbcSMatt Macy VERIFY0(dmu_object_free(dp->dp_meta_objset,
1111eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, tx));
1112eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj = 0;
1113eda14cbcSMatt Macy }
1114eda14cbcSMatt Macy scan_ds_queue_clear(scn);
1115eda14cbcSMatt Macy scan_ds_prefetch_queue_clear(scn);
1116eda14cbcSMatt Macy
1117eda14cbcSMatt Macy scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
1118eda14cbcSMatt Macy
1119eda14cbcSMatt Macy /*
1120eda14cbcSMatt Macy * If we were "restarted" from a stopped state, don't bother
1121eda14cbcSMatt Macy * with anything else.
1122eda14cbcSMatt Macy */
1123eda14cbcSMatt Macy if (!dsl_scan_is_running(scn)) {
1124eda14cbcSMatt Macy ASSERT(!scn->scn_is_sorted);
1125eda14cbcSMatt Macy return;
1126eda14cbcSMatt Macy }
1127eda14cbcSMatt Macy
1128eda14cbcSMatt Macy if (scn->scn_is_sorted) {
1129eda14cbcSMatt Macy scan_io_queues_destroy(scn);
1130eda14cbcSMatt Macy scn->scn_is_sorted = B_FALSE;
1131eda14cbcSMatt Macy
1132eda14cbcSMatt Macy if (scn->scn_taskq != NULL) {
1133eda14cbcSMatt Macy taskq_destroy(scn->scn_taskq);
1134eda14cbcSMatt Macy scn->scn_taskq = NULL;
1135eda14cbcSMatt Macy }
1136eda14cbcSMatt Macy }
1137eda14cbcSMatt Macy
113817aab35aSMartin Matuska if (dsl_scan_restarting(scn, tx)) {
1139eda14cbcSMatt Macy spa_history_log_internal(spa, "scan aborted, restarting", tx,
114015f0b8c3SMartin Matuska "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
114117aab35aSMartin Matuska } else if (!complete) {
1142eda14cbcSMatt Macy spa_history_log_internal(spa, "scan cancelled", tx,
114315f0b8c3SMartin Matuska "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
114417aab35aSMartin Matuska } else {
1145eda14cbcSMatt Macy spa_history_log_internal(spa, "scan done", tx,
114615f0b8c3SMartin Matuska "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
114717aab35aSMartin Matuska if (DSL_SCAN_IS_SCRUB(scn)) {
114817aab35aSMartin Matuska VERIFY0(zap_update(dp->dp_meta_objset,
114917aab35aSMartin Matuska DMU_POOL_DIRECTORY_OBJECT,
115017aab35aSMartin Matuska DMU_POOL_LAST_SCRUBBED_TXG,
115117aab35aSMartin Matuska sizeof (uint64_t), 1,
115217aab35aSMartin Matuska &scn->scn_phys.scn_max_txg, tx));
115317aab35aSMartin Matuska spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg;
115417aab35aSMartin Matuska }
115517aab35aSMartin Matuska }
1156eda14cbcSMatt Macy
1157eda14cbcSMatt Macy if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
1158eda14cbcSMatt Macy spa->spa_scrub_active = B_FALSE;
1159eda14cbcSMatt Macy
1160eda14cbcSMatt Macy /*
1161eda14cbcSMatt Macy * If the scrub/resilver completed, update all DTLs to
1162eda14cbcSMatt Macy * reflect this. Whether it succeeded or not, vacate
1163eda14cbcSMatt Macy * all temporary scrub DTLs.
1164eda14cbcSMatt Macy *
1165eda14cbcSMatt Macy * As the scrub does not currently support traversing
1166eda14cbcSMatt Macy * data that have been freed but are part of a checkpoint,
1167eda14cbcSMatt Macy * we don't mark the scrub as done in the DTLs as faults
1168eda14cbcSMatt Macy * may still exist in those vdevs.
1169eda14cbcSMatt Macy */
1170eda14cbcSMatt Macy if (complete &&
1171eda14cbcSMatt Macy !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
1172eda14cbcSMatt Macy vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
1173eda14cbcSMatt Macy scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
1174eda14cbcSMatt Macy
1175*3a896071SMartin Matuska if (DSL_SCAN_IS_RESILVER(scn)) {
1176eda14cbcSMatt Macy nvlist_t *aux = fnvlist_alloc();
1177eda14cbcSMatt Macy fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
1178eda14cbcSMatt Macy "healing");
1179eda14cbcSMatt Macy spa_event_notify(spa, NULL, aux,
1180eda14cbcSMatt Macy ESC_ZFS_RESILVER_FINISH);
1181eda14cbcSMatt Macy nvlist_free(aux);
1182eda14cbcSMatt Macy } else {
1183eda14cbcSMatt Macy spa_event_notify(spa, NULL, NULL,
1184eda14cbcSMatt Macy ESC_ZFS_SCRUB_FINISH);
1185eda14cbcSMatt Macy }
1186eda14cbcSMatt Macy } else {
1187eda14cbcSMatt Macy vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
1188eda14cbcSMatt Macy 0, B_TRUE, B_FALSE);
1189eda14cbcSMatt Macy }
1190eda14cbcSMatt Macy spa_errlog_rotate(spa);
1191eda14cbcSMatt Macy
1192eda14cbcSMatt Macy /*
1193eda14cbcSMatt Macy * Don't clear flag until after vdev_dtl_reassess to ensure that
1194eda14cbcSMatt Macy * DTL_MISSING will get updated when possible.
1195eda14cbcSMatt Macy */
1196071ab5a1SMartin Matuska scn->scn_phys.scn_state = complete ? DSS_FINISHED :
1197071ab5a1SMartin Matuska DSS_CANCELED;
1198071ab5a1SMartin Matuska scn->scn_phys.scn_end_time = gethrestime_sec();
1199eda14cbcSMatt Macy spa->spa_scrub_started = B_FALSE;
1200eda14cbcSMatt Macy
1201eda14cbcSMatt Macy /*
1202eda14cbcSMatt Macy * We may have finished replacing a device.
1203eda14cbcSMatt Macy * Let the async thread assess this and handle the detach.
1204eda14cbcSMatt Macy */
1205eda14cbcSMatt Macy spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
1206eda14cbcSMatt Macy
1207eda14cbcSMatt Macy /*
1208eda14cbcSMatt Macy * Clear any resilver_deferred flags in the config.
1209eda14cbcSMatt Macy * If there are drives that need resilvering, kick
1210eda14cbcSMatt Macy * off an asynchronous request to start resilver.
1211eda14cbcSMatt Macy * vdev_clear_resilver_deferred() may update the config
1212eda14cbcSMatt Macy * before the resilver can restart. In the event of
1213eda14cbcSMatt Macy * a crash during this period, the spa loading code
1214eda14cbcSMatt Macy * will find the drives that need to be resilvered
1215eda14cbcSMatt Macy * and start the resilver then.
1216eda14cbcSMatt Macy */
1217eda14cbcSMatt Macy if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
1218eda14cbcSMatt Macy vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
1219eda14cbcSMatt Macy spa_history_log_internal(spa,
1220eda14cbcSMatt Macy "starting deferred resilver", tx, "errors=%llu",
122115f0b8c3SMartin Matuska (u_longlong_t)spa_approx_errlog_size(spa));
1222eda14cbcSMatt Macy spa_async_request(spa, SPA_ASYNC_RESILVER);
1223eda14cbcSMatt Macy }
1224ba27dd8bSMartin Matuska
1225ba27dd8bSMartin Matuska /* Clear recent error events (i.e. duplicate events tracking) */
1226ba27dd8bSMartin Matuska if (complete)
1227ba27dd8bSMartin Matuska zfs_ereport_clear(spa, NULL);
1228071ab5a1SMartin Matuska } else {
1229071ab5a1SMartin Matuska scn->scn_phys.scn_state = complete ? DSS_FINISHED :
1230071ab5a1SMartin Matuska DSS_CANCELED;
1231071ab5a1SMartin Matuska scn->scn_phys.scn_end_time = gethrestime_sec();
1232eda14cbcSMatt Macy }
1233eda14cbcSMatt Macy
1234071ab5a1SMartin Matuska spa_notify_waiters(spa);
1235eda14cbcSMatt Macy
1236eda14cbcSMatt Macy if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
1237eda14cbcSMatt Macy spa->spa_errata = 0;
1238eda14cbcSMatt Macy
1239eda14cbcSMatt Macy ASSERT(!dsl_scan_is_running(scn));
1240eda14cbcSMatt Macy }
1241eda14cbcSMatt Macy
1242eda14cbcSMatt Macy static int
dsl_errorscrub_pause_resume_check(void * arg,dmu_tx_t * tx)1243c0a83fe0SMartin Matuska dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
1244c0a83fe0SMartin Matuska {
1245c0a83fe0SMartin Matuska pool_scrub_cmd_t *cmd = arg;
1246c0a83fe0SMartin Matuska dsl_pool_t *dp = dmu_tx_pool(tx);
1247c0a83fe0SMartin Matuska dsl_scan_t *scn = dp->dp_scan;
1248c0a83fe0SMartin Matuska
1249c0a83fe0SMartin Matuska if (*cmd == POOL_SCRUB_PAUSE) {
1250c0a83fe0SMartin Matuska /*
1251c0a83fe0SMartin Matuska * can't pause a error scrub when there is no in-progress
1252c0a83fe0SMartin Matuska * error scrub.
1253c0a83fe0SMartin Matuska */
1254c0a83fe0SMartin Matuska if (!dsl_errorscrubbing(dp))
1255c0a83fe0SMartin Matuska return (SET_ERROR(ENOENT));
1256c0a83fe0SMartin Matuska
1257c0a83fe0SMartin Matuska /* can't pause a paused error scrub */
1258c0a83fe0SMartin Matuska if (dsl_errorscrub_is_paused(scn))
1259c0a83fe0SMartin Matuska return (SET_ERROR(EBUSY));
1260c0a83fe0SMartin Matuska } else if (*cmd != POOL_SCRUB_NORMAL) {
1261c0a83fe0SMartin Matuska return (SET_ERROR(ENOTSUP));
1262c0a83fe0SMartin Matuska }
1263c0a83fe0SMartin Matuska
1264c0a83fe0SMartin Matuska return (0);
1265c0a83fe0SMartin Matuska }
1266c0a83fe0SMartin Matuska
1267c0a83fe0SMartin Matuska static void
dsl_errorscrub_pause_resume_sync(void * arg,dmu_tx_t * tx)1268c0a83fe0SMartin Matuska dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
1269c0a83fe0SMartin Matuska {
1270c0a83fe0SMartin Matuska pool_scrub_cmd_t *cmd = arg;
1271c0a83fe0SMartin Matuska dsl_pool_t *dp = dmu_tx_pool(tx);
1272c0a83fe0SMartin Matuska spa_t *spa = dp->dp_spa;
1273c0a83fe0SMartin Matuska dsl_scan_t *scn = dp->dp_scan;
1274c0a83fe0SMartin Matuska
1275c0a83fe0SMartin Matuska if (*cmd == POOL_SCRUB_PAUSE) {
1276c0a83fe0SMartin Matuska spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
1277c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_paused_flags = B_TRUE;
1278c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
1279c0a83fe0SMartin Matuska spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
1280c0a83fe0SMartin Matuska } else {
1281c0a83fe0SMartin Matuska ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
1282c0a83fe0SMartin Matuska if (dsl_errorscrub_is_paused(scn)) {
1283c0a83fe0SMartin Matuska /*
1284c0a83fe0SMartin Matuska * We need to keep track of how much time we spend
1285c0a83fe0SMartin Matuska * paused per pass so that we can adjust the error scrub
1286c0a83fe0SMartin Matuska * rate shown in the output of 'zpool status'.
1287c0a83fe0SMartin Matuska */
1288c0a83fe0SMartin Matuska spa->spa_scan_pass_errorscrub_spent_paused +=
1289c0a83fe0SMartin Matuska gethrestime_sec() -
1290c0a83fe0SMartin Matuska spa->spa_scan_pass_errorscrub_pause;
1291c0a83fe0SMartin Matuska
1292c0a83fe0SMartin Matuska spa->spa_scan_pass_errorscrub_pause = 0;
1293c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_paused_flags = B_FALSE;
1294c0a83fe0SMartin Matuska
1295c0a83fe0SMartin Matuska zap_cursor_init_serialized(
1296c0a83fe0SMartin Matuska &scn->errorscrub_cursor,
1297c0a83fe0SMartin Matuska spa->spa_meta_objset, spa->spa_errlog_last,
1298c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_cursor);
1299c0a83fe0SMartin Matuska
1300c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
1301c0a83fe0SMartin Matuska }
1302c0a83fe0SMartin Matuska }
1303c0a83fe0SMartin Matuska }
1304c0a83fe0SMartin Matuska
1305c0a83fe0SMartin Matuska static int
dsl_errorscrub_cancel_check(void * arg,dmu_tx_t * tx)1306c0a83fe0SMartin Matuska dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
1307c0a83fe0SMartin Matuska {
1308c0a83fe0SMartin Matuska (void) arg;
1309c0a83fe0SMartin Matuska dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
1310c0a83fe0SMartin Matuska /* can't cancel a error scrub when there is no one in-progress */
1311c0a83fe0SMartin Matuska if (!dsl_errorscrubbing(scn->scn_dp))
1312c0a83fe0SMartin Matuska return (SET_ERROR(ENOENT));
1313c0a83fe0SMartin Matuska return (0);
1314c0a83fe0SMartin Matuska }
1315c0a83fe0SMartin Matuska
1316c0a83fe0SMartin Matuska static void
dsl_errorscrub_cancel_sync(void * arg,dmu_tx_t * tx)1317c0a83fe0SMartin Matuska dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
1318c0a83fe0SMartin Matuska {
1319c0a83fe0SMartin Matuska (void) arg;
1320c0a83fe0SMartin Matuska dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
1321c0a83fe0SMartin Matuska
1322c0a83fe0SMartin Matuska dsl_errorscrub_done(scn, B_FALSE, tx);
1323c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
1324c0a83fe0SMartin Matuska spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL,
1325c0a83fe0SMartin Matuska ESC_ZFS_ERRORSCRUB_ABORT);
1326c0a83fe0SMartin Matuska }
1327c0a83fe0SMartin Matuska
1328c0a83fe0SMartin Matuska static int
dsl_scan_cancel_check(void * arg,dmu_tx_t * tx)1329eda14cbcSMatt Macy dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
1330eda14cbcSMatt Macy {
1331e92ffd9bSMartin Matuska (void) arg;
1332eda14cbcSMatt Macy dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
1333eda14cbcSMatt Macy
1334eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
1335eda14cbcSMatt Macy return (SET_ERROR(ENOENT));
1336eda14cbcSMatt Macy return (0);
1337eda14cbcSMatt Macy }
1338eda14cbcSMatt Macy
1339eda14cbcSMatt Macy static void
dsl_scan_cancel_sync(void * arg,dmu_tx_t * tx)1340eda14cbcSMatt Macy dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
1341eda14cbcSMatt Macy {
1342e92ffd9bSMartin Matuska (void) arg;
1343eda14cbcSMatt Macy dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
1344eda14cbcSMatt Macy
1345eda14cbcSMatt Macy dsl_scan_done(scn, B_FALSE, tx);
1346eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
1347eda14cbcSMatt Macy spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
1348eda14cbcSMatt Macy }
1349eda14cbcSMatt Macy
1350eda14cbcSMatt Macy int
dsl_scan_cancel(dsl_pool_t * dp)1351eda14cbcSMatt Macy dsl_scan_cancel(dsl_pool_t *dp)
1352eda14cbcSMatt Macy {
1353c0a83fe0SMartin Matuska if (dsl_errorscrubbing(dp)) {
1354c0a83fe0SMartin Matuska return (dsl_sync_task(spa_name(dp->dp_spa),
1355c0a83fe0SMartin Matuska dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
1356c0a83fe0SMartin Matuska NULL, 3, ZFS_SPACE_CHECK_RESERVED));
1357c0a83fe0SMartin Matuska }
1358eda14cbcSMatt Macy return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
1359eda14cbcSMatt Macy dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
1360eda14cbcSMatt Macy }
1361eda14cbcSMatt Macy
1362eda14cbcSMatt Macy static int
dsl_scrub_pause_resume_check(void * arg,dmu_tx_t * tx)1363eda14cbcSMatt Macy dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
1364eda14cbcSMatt Macy {
1365eda14cbcSMatt Macy pool_scrub_cmd_t *cmd = arg;
1366eda14cbcSMatt Macy dsl_pool_t *dp = dmu_tx_pool(tx);
1367eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
1368eda14cbcSMatt Macy
1369eda14cbcSMatt Macy if (*cmd == POOL_SCRUB_PAUSE) {
1370eda14cbcSMatt Macy /* can't pause a scrub when there is no in-progress scrub */
1371eda14cbcSMatt Macy if (!dsl_scan_scrubbing(dp))
1372eda14cbcSMatt Macy return (SET_ERROR(ENOENT));
1373eda14cbcSMatt Macy
1374eda14cbcSMatt Macy /* can't pause a paused scrub */
1375eda14cbcSMatt Macy if (dsl_scan_is_paused_scrub(scn))
1376eda14cbcSMatt Macy return (SET_ERROR(EBUSY));
1377eda14cbcSMatt Macy } else if (*cmd != POOL_SCRUB_NORMAL) {
1378eda14cbcSMatt Macy return (SET_ERROR(ENOTSUP));
1379eda14cbcSMatt Macy }
1380eda14cbcSMatt Macy
1381eda14cbcSMatt Macy return (0);
1382eda14cbcSMatt Macy }
1383eda14cbcSMatt Macy
1384eda14cbcSMatt Macy static void
dsl_scrub_pause_resume_sync(void * arg,dmu_tx_t * tx)1385eda14cbcSMatt Macy dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
1386eda14cbcSMatt Macy {
1387eda14cbcSMatt Macy pool_scrub_cmd_t *cmd = arg;
1388eda14cbcSMatt Macy dsl_pool_t *dp = dmu_tx_pool(tx);
1389eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
1390eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
1391eda14cbcSMatt Macy
1392eda14cbcSMatt Macy if (*cmd == POOL_SCRUB_PAUSE) {
1393eda14cbcSMatt Macy /* can't pause a scrub when there is no in-progress scrub */
1394eda14cbcSMatt Macy spa->spa_scan_pass_scrub_pause = gethrestime_sec();
1395eda14cbcSMatt Macy scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
1396eda14cbcSMatt Macy scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
1397eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_CACHED);
1398eda14cbcSMatt Macy spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
1399eda14cbcSMatt Macy spa_notify_waiters(spa);
1400eda14cbcSMatt Macy } else {
1401eda14cbcSMatt Macy ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
1402eda14cbcSMatt Macy if (dsl_scan_is_paused_scrub(scn)) {
1403eda14cbcSMatt Macy /*
1404eda14cbcSMatt Macy * We need to keep track of how much time we spend
1405eda14cbcSMatt Macy * paused per pass so that we can adjust the scrub rate
1406eda14cbcSMatt Macy * shown in the output of 'zpool status'
1407eda14cbcSMatt Macy */
1408eda14cbcSMatt Macy spa->spa_scan_pass_scrub_spent_paused +=
1409eda14cbcSMatt Macy gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
1410eda14cbcSMatt Macy spa->spa_scan_pass_scrub_pause = 0;
1411eda14cbcSMatt Macy scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
1412eda14cbcSMatt Macy scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
1413eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_CACHED);
1414eda14cbcSMatt Macy }
1415eda14cbcSMatt Macy }
1416eda14cbcSMatt Macy }
1417eda14cbcSMatt Macy
1418eda14cbcSMatt Macy /*
1419eda14cbcSMatt Macy * Set scrub pause/resume state if it makes sense to do so
1420eda14cbcSMatt Macy */
1421eda14cbcSMatt Macy int
dsl_scrub_set_pause_resume(const dsl_pool_t * dp,pool_scrub_cmd_t cmd)1422eda14cbcSMatt Macy dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
1423eda14cbcSMatt Macy {
1424c0a83fe0SMartin Matuska if (dsl_errorscrubbing(dp)) {
1425c0a83fe0SMartin Matuska return (dsl_sync_task(spa_name(dp->dp_spa),
1426c0a83fe0SMartin Matuska dsl_errorscrub_pause_resume_check,
1427c0a83fe0SMartin Matuska dsl_errorscrub_pause_resume_sync, &cmd, 3,
1428c0a83fe0SMartin Matuska ZFS_SPACE_CHECK_RESERVED));
1429c0a83fe0SMartin Matuska }
1430eda14cbcSMatt Macy return (dsl_sync_task(spa_name(dp->dp_spa),
1431eda14cbcSMatt Macy dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
1432eda14cbcSMatt Macy ZFS_SPACE_CHECK_RESERVED));
1433eda14cbcSMatt Macy }
1434eda14cbcSMatt Macy
1435eda14cbcSMatt Macy
1436eda14cbcSMatt Macy /* start a new scan, or restart an existing one. */
1437eda14cbcSMatt Macy void
dsl_scan_restart_resilver(dsl_pool_t * dp,uint64_t txg)1438eda14cbcSMatt Macy dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
1439eda14cbcSMatt Macy {
1440eda14cbcSMatt Macy if (txg == 0) {
1441eda14cbcSMatt Macy dmu_tx_t *tx;
1442eda14cbcSMatt Macy tx = dmu_tx_create_dd(dp->dp_mos_dir);
1443b1c1ee44SMartin Matuska VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND));
1444eda14cbcSMatt Macy
1445eda14cbcSMatt Macy txg = dmu_tx_get_txg(tx);
1446eda14cbcSMatt Macy dp->dp_scan->scn_restart_txg = txg;
1447eda14cbcSMatt Macy dmu_tx_commit(tx);
1448eda14cbcSMatt Macy } else {
1449eda14cbcSMatt Macy dp->dp_scan->scn_restart_txg = txg;
1450eda14cbcSMatt Macy }
145181b22a98SMartin Matuska zfs_dbgmsg("restarting resilver for %s at txg=%llu",
145281b22a98SMartin Matuska dp->dp_spa->spa_name, (longlong_t)txg);
1453eda14cbcSMatt Macy }
1454eda14cbcSMatt Macy
1455eda14cbcSMatt Macy void
dsl_free(dsl_pool_t * dp,uint64_t txg,const blkptr_t * bp)1456eda14cbcSMatt Macy dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
1457eda14cbcSMatt Macy {
1458eda14cbcSMatt Macy zio_free(dp->dp_spa, txg, bp);
1459eda14cbcSMatt Macy }
1460eda14cbcSMatt Macy
1461eda14cbcSMatt Macy void
dsl_free_sync(zio_t * pio,dsl_pool_t * dp,uint64_t txg,const blkptr_t * bpp)1462eda14cbcSMatt Macy dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
1463eda14cbcSMatt Macy {
1464eda14cbcSMatt Macy ASSERT(dsl_pool_sync_context(dp));
1465eda14cbcSMatt Macy zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
1466eda14cbcSMatt Macy }
1467eda14cbcSMatt Macy
1468eda14cbcSMatt Macy static int
scan_ds_queue_compare(const void * a,const void * b)1469eda14cbcSMatt Macy scan_ds_queue_compare(const void *a, const void *b)
1470eda14cbcSMatt Macy {
1471eda14cbcSMatt Macy const scan_ds_t *sds_a = a, *sds_b = b;
1472eda14cbcSMatt Macy
1473eda14cbcSMatt Macy if (sds_a->sds_dsobj < sds_b->sds_dsobj)
1474eda14cbcSMatt Macy return (-1);
1475eda14cbcSMatt Macy if (sds_a->sds_dsobj == sds_b->sds_dsobj)
1476eda14cbcSMatt Macy return (0);
1477eda14cbcSMatt Macy return (1);
1478eda14cbcSMatt Macy }
1479eda14cbcSMatt Macy
1480eda14cbcSMatt Macy static void
scan_ds_queue_clear(dsl_scan_t * scn)1481eda14cbcSMatt Macy scan_ds_queue_clear(dsl_scan_t *scn)
1482eda14cbcSMatt Macy {
1483eda14cbcSMatt Macy void *cookie = NULL;
1484eda14cbcSMatt Macy scan_ds_t *sds;
1485eda14cbcSMatt Macy while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
1486eda14cbcSMatt Macy kmem_free(sds, sizeof (*sds));
1487eda14cbcSMatt Macy }
1488eda14cbcSMatt Macy }
1489eda14cbcSMatt Macy
1490eda14cbcSMatt Macy static boolean_t
scan_ds_queue_contains(dsl_scan_t * scn,uint64_t dsobj,uint64_t * txg)1491eda14cbcSMatt Macy scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
1492eda14cbcSMatt Macy {
1493eda14cbcSMatt Macy scan_ds_t srch, *sds;
1494eda14cbcSMatt Macy
1495eda14cbcSMatt Macy srch.sds_dsobj = dsobj;
1496eda14cbcSMatt Macy sds = avl_find(&scn->scn_queue, &srch, NULL);
1497eda14cbcSMatt Macy if (sds != NULL && txg != NULL)
1498eda14cbcSMatt Macy *txg = sds->sds_txg;
1499eda14cbcSMatt Macy return (sds != NULL);
1500eda14cbcSMatt Macy }
1501eda14cbcSMatt Macy
1502eda14cbcSMatt Macy static void
scan_ds_queue_insert(dsl_scan_t * scn,uint64_t dsobj,uint64_t txg)1503eda14cbcSMatt Macy scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
1504eda14cbcSMatt Macy {
1505eda14cbcSMatt Macy scan_ds_t *sds;
1506eda14cbcSMatt Macy avl_index_t where;
1507eda14cbcSMatt Macy
1508eda14cbcSMatt Macy sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
1509eda14cbcSMatt Macy sds->sds_dsobj = dsobj;
1510eda14cbcSMatt Macy sds->sds_txg = txg;
1511eda14cbcSMatt Macy
1512eda14cbcSMatt Macy VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
1513eda14cbcSMatt Macy avl_insert(&scn->scn_queue, sds, where);
1514eda14cbcSMatt Macy }
1515eda14cbcSMatt Macy
1516eda14cbcSMatt Macy static void
scan_ds_queue_remove(dsl_scan_t * scn,uint64_t dsobj)1517eda14cbcSMatt Macy scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
1518eda14cbcSMatt Macy {
1519eda14cbcSMatt Macy scan_ds_t srch, *sds;
1520eda14cbcSMatt Macy
1521eda14cbcSMatt Macy srch.sds_dsobj = dsobj;
1522eda14cbcSMatt Macy
1523eda14cbcSMatt Macy sds = avl_find(&scn->scn_queue, &srch, NULL);
1524eda14cbcSMatt Macy VERIFY(sds != NULL);
1525eda14cbcSMatt Macy avl_remove(&scn->scn_queue, sds);
1526eda14cbcSMatt Macy kmem_free(sds, sizeof (*sds));
1527eda14cbcSMatt Macy }
1528eda14cbcSMatt Macy
1529eda14cbcSMatt Macy static void
scan_ds_queue_sync(dsl_scan_t * scn,dmu_tx_t * tx)1530eda14cbcSMatt Macy scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
1531eda14cbcSMatt Macy {
1532eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
1533eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
1534eda14cbcSMatt Macy dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
1535eda14cbcSMatt Macy DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
1536eda14cbcSMatt Macy
1537a0b956f5SMartin Matuska ASSERT0(scn->scn_queues_pending);
1538eda14cbcSMatt Macy ASSERT(scn->scn_phys.scn_queue_obj != 0);
1539eda14cbcSMatt Macy
1540eda14cbcSMatt Macy VERIFY0(dmu_object_free(dp->dp_meta_objset,
1541eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, tx));
1542eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
1543eda14cbcSMatt Macy DMU_OT_NONE, 0, tx);
1544eda14cbcSMatt Macy for (scan_ds_t *sds = avl_first(&scn->scn_queue);
1545eda14cbcSMatt Macy sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
1546eda14cbcSMatt Macy VERIFY0(zap_add_int_key(dp->dp_meta_objset,
1547eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
1548eda14cbcSMatt Macy sds->sds_txg, tx));
1549eda14cbcSMatt Macy }
1550eda14cbcSMatt Macy }
1551eda14cbcSMatt Macy
1552eda14cbcSMatt Macy /*
1553eda14cbcSMatt Macy * Computes the memory limit state that we're currently in. A sorted scan
1554eda14cbcSMatt Macy * needs quite a bit of memory to hold the sorting queue, so we need to
1555eda14cbcSMatt Macy * reasonably constrain the size so it doesn't impact overall system
1556eda14cbcSMatt Macy * performance. We compute two limits:
1557eda14cbcSMatt Macy * 1) Hard memory limit: if the amount of memory used by the sorting
1558eda14cbcSMatt Macy * queues on a pool gets above this value, we stop the metadata
1559eda14cbcSMatt Macy * scanning portion and start issuing the queued up and sorted
1560eda14cbcSMatt Macy * I/Os to reduce memory usage.
1561eda14cbcSMatt Macy * This limit is calculated as a fraction of physmem (by default 5%).
1562eda14cbcSMatt Macy * We constrain the lower bound of the hard limit to an absolute
1563eda14cbcSMatt Macy * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
1564eda14cbcSMatt Macy * the upper bound to 5% of the total pool size - no chance we'll
1565eda14cbcSMatt Macy * ever need that much memory, but just to keep the value in check.
1566eda14cbcSMatt Macy * 2) Soft memory limit: once we hit the hard memory limit, we start
1567eda14cbcSMatt Macy * issuing I/O to reduce queue memory usage, but we don't want to
1568eda14cbcSMatt Macy * completely empty out the queues, since we might be able to find I/Os
1569eda14cbcSMatt Macy * that will fill in the gaps of our non-sequential IOs at some point
1570eda14cbcSMatt Macy * in the future. So we stop the issuing of I/Os once the amount of
1571eda14cbcSMatt Macy * memory used drops below the soft limit (at which point we stop issuing
1572eda14cbcSMatt Macy * I/O and start scanning metadata again).
1573eda14cbcSMatt Macy *
1574eda14cbcSMatt Macy * This limit is calculated by subtracting a fraction of the hard
1575eda14cbcSMatt Macy * limit from the hard limit. By default this fraction is 5%, so
1576eda14cbcSMatt Macy * the soft limit is 95% of the hard limit. We cap the size of the
1577eda14cbcSMatt Macy * difference between the hard and soft limits at an absolute
1578eda14cbcSMatt Macy * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
1579eda14cbcSMatt Macy * sufficient to not cause too frequent switching between the
1580eda14cbcSMatt Macy * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
1581eda14cbcSMatt Macy * worth of queues is about 1.2 GiB of on-pool data, so scanning
1582eda14cbcSMatt Macy * that should take at least a decent fraction of a second).
1583eda14cbcSMatt Macy */
1584eda14cbcSMatt Macy static boolean_t
dsl_scan_should_clear(dsl_scan_t * scn)1585eda14cbcSMatt Macy dsl_scan_should_clear(dsl_scan_t *scn)
1586eda14cbcSMatt Macy {
1587eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
1588eda14cbcSMatt Macy vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
1589eda14cbcSMatt Macy uint64_t alloc, mlim_hard, mlim_soft, mused;
1590eda14cbcSMatt Macy
1591eda14cbcSMatt Macy alloc = metaslab_class_get_alloc(spa_normal_class(spa));
1592eda14cbcSMatt Macy alloc += metaslab_class_get_alloc(spa_special_class(spa));
1593eda14cbcSMatt Macy alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
1594eda14cbcSMatt Macy
1595eda14cbcSMatt Macy mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
1596eda14cbcSMatt Macy zfs_scan_mem_lim_min);
1597eda14cbcSMatt Macy mlim_hard = MIN(mlim_hard, alloc / 20);
1598eda14cbcSMatt Macy mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
1599eda14cbcSMatt Macy zfs_scan_mem_lim_soft_max);
1600eda14cbcSMatt Macy mused = 0;
1601eda14cbcSMatt Macy for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1602eda14cbcSMatt Macy vdev_t *tvd = rvd->vdev_child[i];
1603eda14cbcSMatt Macy dsl_scan_io_queue_t *queue;
1604eda14cbcSMatt Macy
1605eda14cbcSMatt Macy mutex_enter(&tvd->vdev_scan_io_queue_lock);
1606eda14cbcSMatt Macy queue = tvd->vdev_scan_io_queue;
1607eda14cbcSMatt Macy if (queue != NULL) {
16081f1e2261SMartin Matuska /*
1609a0b956f5SMartin Matuska * # of extents in exts_by_addr = # in exts_by_size.
16101f1e2261SMartin Matuska * B-tree efficiency is ~75%, but can be as low as 50%.
16111f1e2261SMartin Matuska */
1612b59a0cdeSMartin Matuska mused += zfs_btree_numnodes(&queue->q_exts_by_size) * ((
1613b59a0cdeSMartin Matuska sizeof (zfs_range_seg_gap_t) + sizeof (uint64_t)) *
1614a0b956f5SMartin Matuska 3 / 2) + queue->q_sio_memused;
1615eda14cbcSMatt Macy }
1616eda14cbcSMatt Macy mutex_exit(&tvd->vdev_scan_io_queue_lock);
1617eda14cbcSMatt Macy }
1618eda14cbcSMatt Macy
1619eda14cbcSMatt Macy dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
1620eda14cbcSMatt Macy
1621eda14cbcSMatt Macy if (mused == 0)
1622a0b956f5SMartin Matuska ASSERT0(scn->scn_queues_pending);
1623eda14cbcSMatt Macy
1624eda14cbcSMatt Macy /*
1625eda14cbcSMatt Macy * If we are above our hard limit, we need to clear out memory.
1626eda14cbcSMatt Macy * If we are below our soft limit, we need to accumulate sequential IOs.
1627eda14cbcSMatt Macy * Otherwise, we should keep doing whatever we are currently doing.
1628eda14cbcSMatt Macy */
1629eda14cbcSMatt Macy if (mused >= mlim_hard)
1630eda14cbcSMatt Macy return (B_TRUE);
1631eda14cbcSMatt Macy else if (mused < mlim_soft)
1632eda14cbcSMatt Macy return (B_FALSE);
1633eda14cbcSMatt Macy else
1634eda14cbcSMatt Macy return (scn->scn_clearing);
1635eda14cbcSMatt Macy }
1636eda14cbcSMatt Macy
1637eda14cbcSMatt Macy static boolean_t
dsl_scan_check_suspend(dsl_scan_t * scn,const zbookmark_phys_t * zb)1638eda14cbcSMatt Macy dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
1639eda14cbcSMatt Macy {
1640eda14cbcSMatt Macy /* we never skip user/group accounting objects */
1641eda14cbcSMatt Macy if (zb && (int64_t)zb->zb_object < 0)
1642eda14cbcSMatt Macy return (B_FALSE);
1643eda14cbcSMatt Macy
1644eda14cbcSMatt Macy if (scn->scn_suspending)
1645eda14cbcSMatt Macy return (B_TRUE); /* we're already suspending */
1646eda14cbcSMatt Macy
1647eda14cbcSMatt Macy if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
1648eda14cbcSMatt Macy return (B_FALSE); /* we're resuming */
1649eda14cbcSMatt Macy
1650eda14cbcSMatt Macy /* We only know how to resume from level-0 and objset blocks. */
1651eda14cbcSMatt Macy if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
1652eda14cbcSMatt Macy return (B_FALSE);
1653eda14cbcSMatt Macy
1654eda14cbcSMatt Macy /*
1655eda14cbcSMatt Macy * We suspend if:
1656eda14cbcSMatt Macy * - we have scanned for at least the minimum time (default 1 sec
1657eda14cbcSMatt Macy * for scrub, 3 sec for resilver), and either we have sufficient
1658eda14cbcSMatt Macy * dirty data that we are starting to write more quickly
1659eda14cbcSMatt Macy * (default 30%), someone is explicitly waiting for this txg
1660eda14cbcSMatt Macy * to complete, or we have used up all of the time in the txg
1661eda14cbcSMatt Macy * timeout (default 5 sec).
1662eda14cbcSMatt Macy * or
1663eda14cbcSMatt Macy * - the spa is shutting down because this pool is being exported
1664eda14cbcSMatt Macy * or the machine is rebooting.
1665eda14cbcSMatt Macy * or
1666eda14cbcSMatt Macy * - the scan queue has reached its memory use limit
1667eda14cbcSMatt Macy */
1668eda14cbcSMatt Macy uint64_t curr_time_ns = gethrtime();
1669eda14cbcSMatt Macy uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
1670eda14cbcSMatt Macy uint64_t sync_time_ns = curr_time_ns -
1671eda14cbcSMatt Macy scn->scn_dp->dp_spa->spa_sync_starttime;
1672a0b956f5SMartin Matuska uint64_t dirty_min_bytes = zfs_dirty_data_max *
1673a0b956f5SMartin Matuska zfs_vdev_async_write_active_min_dirty_percent / 100;
1674be181ee2SMartin Matuska uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
1675eda14cbcSMatt Macy zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
1676eda14cbcSMatt Macy
1677eda14cbcSMatt Macy if ((NSEC2MSEC(scan_time_ns) > mintime &&
1678a0b956f5SMartin Matuska (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
1679eda14cbcSMatt Macy txg_sync_waiting(scn->scn_dp) ||
1680eda14cbcSMatt Macy NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
1681eda14cbcSMatt Macy spa_shutting_down(scn->scn_dp->dp_spa) ||
1682e2df9bb4SMartin Matuska (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
1683e2df9bb4SMartin Matuska !ddt_walk_ready(scn->scn_dp->dp_spa)) {
1684eda14cbcSMatt Macy if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
1685eda14cbcSMatt Macy dprintf("suspending at first available bookmark "
1686eda14cbcSMatt Macy "%llx/%llx/%llx/%llx\n",
1687eda14cbcSMatt Macy (longlong_t)zb->zb_objset,
1688eda14cbcSMatt Macy (longlong_t)zb->zb_object,
1689eda14cbcSMatt Macy (longlong_t)zb->zb_level,
1690eda14cbcSMatt Macy (longlong_t)zb->zb_blkid);
1691eda14cbcSMatt Macy SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
1692eda14cbcSMatt Macy zb->zb_objset, 0, 0, 0);
1693eda14cbcSMatt Macy } else if (zb != NULL) {
1694eda14cbcSMatt Macy dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
1695eda14cbcSMatt Macy (longlong_t)zb->zb_objset,
1696eda14cbcSMatt Macy (longlong_t)zb->zb_object,
1697eda14cbcSMatt Macy (longlong_t)zb->zb_level,
1698eda14cbcSMatt Macy (longlong_t)zb->zb_blkid);
1699eda14cbcSMatt Macy scn->scn_phys.scn_bookmark = *zb;
1700eda14cbcSMatt Macy } else {
1701eda14cbcSMatt Macy #ifdef ZFS_DEBUG
1702eda14cbcSMatt Macy dsl_scan_phys_t *scnp = &scn->scn_phys;
1703eda14cbcSMatt Macy dprintf("suspending at at DDT bookmark "
1704eda14cbcSMatt Macy "%llx/%llx/%llx/%llx\n",
1705eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
1706eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
1707eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
1708eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
1709eda14cbcSMatt Macy #endif
1710eda14cbcSMatt Macy }
1711eda14cbcSMatt Macy scn->scn_suspending = B_TRUE;
1712eda14cbcSMatt Macy return (B_TRUE);
1713eda14cbcSMatt Macy }
1714eda14cbcSMatt Macy return (B_FALSE);
1715eda14cbcSMatt Macy }
1716eda14cbcSMatt Macy
1717c0a83fe0SMartin Matuska static boolean_t
dsl_error_scrub_check_suspend(dsl_scan_t * scn,const zbookmark_phys_t * zb)1718c0a83fe0SMartin Matuska dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
1719c0a83fe0SMartin Matuska {
1720c0a83fe0SMartin Matuska /*
1721c0a83fe0SMartin Matuska * We suspend if:
1722c0a83fe0SMartin Matuska * - we have scrubbed for at least the minimum time (default 1 sec
1723c0a83fe0SMartin Matuska * for error scrub), someone is explicitly waiting for this txg
1724c0a83fe0SMartin Matuska * to complete, or we have used up all of the time in the txg
1725c0a83fe0SMartin Matuska * timeout (default 5 sec).
1726c0a83fe0SMartin Matuska * or
1727c0a83fe0SMartin Matuska * - the spa is shutting down because this pool is being exported
1728c0a83fe0SMartin Matuska * or the machine is rebooting.
1729c0a83fe0SMartin Matuska */
1730c0a83fe0SMartin Matuska uint64_t curr_time_ns = gethrtime();
1731c0a83fe0SMartin Matuska uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
1732c0a83fe0SMartin Matuska uint64_t sync_time_ns = curr_time_ns -
1733c0a83fe0SMartin Matuska scn->scn_dp->dp_spa->spa_sync_starttime;
1734c0a83fe0SMartin Matuska int mintime = zfs_scrub_min_time_ms;
1735c0a83fe0SMartin Matuska
1736c0a83fe0SMartin Matuska if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
1737c0a83fe0SMartin Matuska (txg_sync_waiting(scn->scn_dp) ||
1738c0a83fe0SMartin Matuska NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
1739c0a83fe0SMartin Matuska spa_shutting_down(scn->scn_dp->dp_spa)) {
1740c0a83fe0SMartin Matuska if (zb) {
1741c0a83fe0SMartin Matuska dprintf("error scrub suspending at bookmark "
1742c0a83fe0SMartin Matuska "%llx/%llx/%llx/%llx\n",
1743c0a83fe0SMartin Matuska (longlong_t)zb->zb_objset,
1744c0a83fe0SMartin Matuska (longlong_t)zb->zb_object,
1745c0a83fe0SMartin Matuska (longlong_t)zb->zb_level,
1746c0a83fe0SMartin Matuska (longlong_t)zb->zb_blkid);
1747c0a83fe0SMartin Matuska }
1748c0a83fe0SMartin Matuska return (B_TRUE);
1749c0a83fe0SMartin Matuska }
1750c0a83fe0SMartin Matuska return (B_FALSE);
1751c0a83fe0SMartin Matuska }
1752c0a83fe0SMartin Matuska
1753eda14cbcSMatt Macy typedef struct zil_scan_arg {
1754eda14cbcSMatt Macy dsl_pool_t *zsa_dp;
1755eda14cbcSMatt Macy zil_header_t *zsa_zh;
1756eda14cbcSMatt Macy } zil_scan_arg_t;
1757eda14cbcSMatt Macy
1758eda14cbcSMatt Macy static int
dsl_scan_zil_block(zilog_t * zilog,const blkptr_t * bp,void * arg,uint64_t claim_txg)1759180f8225SMatt Macy dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
1760180f8225SMatt Macy uint64_t claim_txg)
1761eda14cbcSMatt Macy {
1762e92ffd9bSMartin Matuska (void) zilog;
1763eda14cbcSMatt Macy zil_scan_arg_t *zsa = arg;
1764eda14cbcSMatt Macy dsl_pool_t *dp = zsa->zsa_dp;
1765eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
1766eda14cbcSMatt Macy zil_header_t *zh = zsa->zsa_zh;
1767eda14cbcSMatt Macy zbookmark_phys_t zb;
1768eda14cbcSMatt Macy
1769eda14cbcSMatt Macy ASSERT(!BP_IS_REDACTED(bp));
1770783d3ff6SMartin Matuska if (BP_IS_HOLE(bp) ||
1771783d3ff6SMartin Matuska BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
1772eda14cbcSMatt Macy return (0);
1773eda14cbcSMatt Macy
1774eda14cbcSMatt Macy /*
1775eda14cbcSMatt Macy * One block ("stubby") can be allocated a long time ago; we
1776eda14cbcSMatt Macy * want to visit that one because it has been allocated
1777eda14cbcSMatt Macy * (on-disk) even if it hasn't been claimed (even though for
1778eda14cbcSMatt Macy * scrub there's nothing to do to it).
1779eda14cbcSMatt Macy */
1780783d3ff6SMartin Matuska if (claim_txg == 0 &&
1781783d3ff6SMartin Matuska BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
1782eda14cbcSMatt Macy return (0);
1783eda14cbcSMatt Macy
1784eda14cbcSMatt Macy SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
1785eda14cbcSMatt Macy ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
1786eda14cbcSMatt Macy
1787eda14cbcSMatt Macy VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
1788eda14cbcSMatt Macy return (0);
1789eda14cbcSMatt Macy }
1790eda14cbcSMatt Macy
1791eda14cbcSMatt Macy static int
dsl_scan_zil_record(zilog_t * zilog,const lr_t * lrc,void * arg,uint64_t claim_txg)1792180f8225SMatt Macy dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
1793180f8225SMatt Macy uint64_t claim_txg)
1794eda14cbcSMatt Macy {
1795e92ffd9bSMartin Matuska (void) zilog;
1796eda14cbcSMatt Macy if (lrc->lrc_txtype == TX_WRITE) {
1797eda14cbcSMatt Macy zil_scan_arg_t *zsa = arg;
1798eda14cbcSMatt Macy dsl_pool_t *dp = zsa->zsa_dp;
1799eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
1800eda14cbcSMatt Macy zil_header_t *zh = zsa->zsa_zh;
1801180f8225SMatt Macy const lr_write_t *lr = (const lr_write_t *)lrc;
1802180f8225SMatt Macy const blkptr_t *bp = &lr->lr_blkptr;
1803eda14cbcSMatt Macy zbookmark_phys_t zb;
1804eda14cbcSMatt Macy
1805eda14cbcSMatt Macy ASSERT(!BP_IS_REDACTED(bp));
1806eda14cbcSMatt Macy if (BP_IS_HOLE(bp) ||
1807783d3ff6SMartin Matuska BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
1808eda14cbcSMatt Macy return (0);
1809eda14cbcSMatt Macy
1810eda14cbcSMatt Macy /*
1811eda14cbcSMatt Macy * birth can be < claim_txg if this record's txg is
1812eda14cbcSMatt Macy * already txg sync'ed (but this log block contains
1813eda14cbcSMatt Macy * other records that are not synced)
1814eda14cbcSMatt Macy */
1815783d3ff6SMartin Matuska if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
1816eda14cbcSMatt Macy return (0);
1817eda14cbcSMatt Macy
1818dbd5678dSMartin Matuska ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
1819eda14cbcSMatt Macy SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
1820eda14cbcSMatt Macy lr->lr_foid, ZB_ZIL_LEVEL,
1821eda14cbcSMatt Macy lr->lr_offset / BP_GET_LSIZE(bp));
1822eda14cbcSMatt Macy
1823eda14cbcSMatt Macy VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
1824eda14cbcSMatt Macy }
1825eda14cbcSMatt Macy return (0);
1826eda14cbcSMatt Macy }
1827eda14cbcSMatt Macy
1828eda14cbcSMatt Macy static void
dsl_scan_zil(dsl_pool_t * dp,zil_header_t * zh)1829eda14cbcSMatt Macy dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
1830eda14cbcSMatt Macy {
1831eda14cbcSMatt Macy uint64_t claim_txg = zh->zh_claim_txg;
1832eda14cbcSMatt Macy zil_scan_arg_t zsa = { dp, zh };
1833eda14cbcSMatt Macy zilog_t *zilog;
1834eda14cbcSMatt Macy
1835eda14cbcSMatt Macy ASSERT(spa_writeable(dp->dp_spa));
1836eda14cbcSMatt Macy
1837eda14cbcSMatt Macy /*
1838eda14cbcSMatt Macy * We only want to visit blocks that have been claimed but not yet
1839eda14cbcSMatt Macy * replayed (or, in read-only mode, blocks that *would* be claimed).
1840eda14cbcSMatt Macy */
1841eda14cbcSMatt Macy if (claim_txg == 0)
1842eda14cbcSMatt Macy return;
1843eda14cbcSMatt Macy
1844eda14cbcSMatt Macy zilog = zil_alloc(dp->dp_meta_objset, zh);
1845eda14cbcSMatt Macy
1846eda14cbcSMatt Macy (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
1847eda14cbcSMatt Macy claim_txg, B_FALSE);
1848eda14cbcSMatt Macy
1849eda14cbcSMatt Macy zil_free(zilog);
1850eda14cbcSMatt Macy }
1851eda14cbcSMatt Macy
1852eda14cbcSMatt Macy /*
1853eda14cbcSMatt Macy * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
1854eda14cbcSMatt Macy * here is to sort the AVL tree by the order each block will be needed.
1855eda14cbcSMatt Macy */
1856eda14cbcSMatt Macy static int
scan_prefetch_queue_compare(const void * a,const void * b)1857eda14cbcSMatt Macy scan_prefetch_queue_compare(const void *a, const void *b)
1858eda14cbcSMatt Macy {
1859eda14cbcSMatt Macy const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
1860eda14cbcSMatt Macy const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
1861eda14cbcSMatt Macy const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
1862eda14cbcSMatt Macy
1863eda14cbcSMatt Macy return (zbookmark_compare(spc_a->spc_datablkszsec,
1864eda14cbcSMatt Macy spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
1865eda14cbcSMatt Macy spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
1866eda14cbcSMatt Macy }
1867eda14cbcSMatt Macy
1868eda14cbcSMatt Macy static void
scan_prefetch_ctx_rele(scan_prefetch_ctx_t * spc,const void * tag)1869a0b956f5SMartin Matuska scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag)
1870eda14cbcSMatt Macy {
1871eda14cbcSMatt Macy if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
1872eda14cbcSMatt Macy zfs_refcount_destroy(&spc->spc_refcnt);
1873eda14cbcSMatt Macy kmem_free(spc, sizeof (scan_prefetch_ctx_t));
1874eda14cbcSMatt Macy }
1875eda14cbcSMatt Macy }
1876eda14cbcSMatt Macy
1877eda14cbcSMatt Macy static scan_prefetch_ctx_t *
scan_prefetch_ctx_create(dsl_scan_t * scn,dnode_phys_t * dnp,const void * tag)1878a0b956f5SMartin Matuska scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag)
1879eda14cbcSMatt Macy {
1880eda14cbcSMatt Macy scan_prefetch_ctx_t *spc;
1881eda14cbcSMatt Macy
1882eda14cbcSMatt Macy spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
1883eda14cbcSMatt Macy zfs_refcount_create(&spc->spc_refcnt);
1884eda14cbcSMatt Macy zfs_refcount_add(&spc->spc_refcnt, tag);
1885eda14cbcSMatt Macy spc->spc_scn = scn;
1886eda14cbcSMatt Macy if (dnp != NULL) {
1887eda14cbcSMatt Macy spc->spc_datablkszsec = dnp->dn_datablkszsec;
1888eda14cbcSMatt Macy spc->spc_indblkshift = dnp->dn_indblkshift;
1889eda14cbcSMatt Macy spc->spc_root = B_FALSE;
1890eda14cbcSMatt Macy } else {
1891eda14cbcSMatt Macy spc->spc_datablkszsec = 0;
1892eda14cbcSMatt Macy spc->spc_indblkshift = 0;
1893eda14cbcSMatt Macy spc->spc_root = B_TRUE;
1894eda14cbcSMatt Macy }
1895eda14cbcSMatt Macy
1896eda14cbcSMatt Macy return (spc);
1897eda14cbcSMatt Macy }
1898eda14cbcSMatt Macy
1899eda14cbcSMatt Macy static void
scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t * spc,const void * tag)1900a0b956f5SMartin Matuska scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag)
1901eda14cbcSMatt Macy {
1902eda14cbcSMatt Macy zfs_refcount_add(&spc->spc_refcnt, tag);
1903eda14cbcSMatt Macy }
1904eda14cbcSMatt Macy
1905eda14cbcSMatt Macy static void
scan_ds_prefetch_queue_clear(dsl_scan_t * scn)1906eda14cbcSMatt Macy scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
1907eda14cbcSMatt Macy {
1908eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
1909eda14cbcSMatt Macy void *cookie = NULL;
1910eda14cbcSMatt Macy scan_prefetch_issue_ctx_t *spic = NULL;
1911eda14cbcSMatt Macy
1912eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
1913eda14cbcSMatt Macy while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue,
1914eda14cbcSMatt Macy &cookie)) != NULL) {
1915eda14cbcSMatt Macy scan_prefetch_ctx_rele(spic->spic_spc, scn);
1916eda14cbcSMatt Macy kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
1917eda14cbcSMatt Macy }
1918eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
1919eda14cbcSMatt Macy }
1920eda14cbcSMatt Macy
1921eda14cbcSMatt Macy static boolean_t
dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t * spc,const zbookmark_phys_t * zb)1922eda14cbcSMatt Macy dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
1923eda14cbcSMatt Macy const zbookmark_phys_t *zb)
1924eda14cbcSMatt Macy {
1925eda14cbcSMatt Macy zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
1926eda14cbcSMatt Macy dnode_phys_t tmp_dnp;
1927eda14cbcSMatt Macy dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
1928eda14cbcSMatt Macy
1929eda14cbcSMatt Macy if (zb->zb_objset != last_zb->zb_objset)
1930eda14cbcSMatt Macy return (B_TRUE);
1931eda14cbcSMatt Macy if ((int64_t)zb->zb_object < 0)
1932eda14cbcSMatt Macy return (B_FALSE);
1933eda14cbcSMatt Macy
1934eda14cbcSMatt Macy tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
1935eda14cbcSMatt Macy tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
1936eda14cbcSMatt Macy
1937eda14cbcSMatt Macy if (zbookmark_subtree_completed(dnp, zb, last_zb))
1938eda14cbcSMatt Macy return (B_TRUE);
1939eda14cbcSMatt Macy
1940eda14cbcSMatt Macy return (B_FALSE);
1941eda14cbcSMatt Macy }
1942eda14cbcSMatt Macy
1943eda14cbcSMatt Macy static void
dsl_scan_prefetch(scan_prefetch_ctx_t * spc,blkptr_t * bp,zbookmark_phys_t * zb)1944eda14cbcSMatt Macy dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
1945eda14cbcSMatt Macy {
1946eda14cbcSMatt Macy avl_index_t idx;
1947eda14cbcSMatt Macy dsl_scan_t *scn = spc->spc_scn;
1948eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
1949eda14cbcSMatt Macy scan_prefetch_issue_ctx_t *spic;
1950eda14cbcSMatt Macy
1951eda14cbcSMatt Macy if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
1952eda14cbcSMatt Macy return;
1953eda14cbcSMatt Macy
1954783d3ff6SMartin Matuska if (BP_IS_HOLE(bp) ||
1955783d3ff6SMartin Matuska BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
1956eda14cbcSMatt Macy (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
1957eda14cbcSMatt Macy BP_GET_TYPE(bp) != DMU_OT_OBJSET))
1958eda14cbcSMatt Macy return;
1959eda14cbcSMatt Macy
1960eda14cbcSMatt Macy if (dsl_scan_check_prefetch_resume(spc, zb))
1961eda14cbcSMatt Macy return;
1962eda14cbcSMatt Macy
1963eda14cbcSMatt Macy scan_prefetch_ctx_add_ref(spc, scn);
1964eda14cbcSMatt Macy spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
1965eda14cbcSMatt Macy spic->spic_spc = spc;
1966eda14cbcSMatt Macy spic->spic_bp = *bp;
1967eda14cbcSMatt Macy spic->spic_zb = *zb;
1968eda14cbcSMatt Macy
1969eda14cbcSMatt Macy /*
1970eda14cbcSMatt Macy * Add the IO to the queue of blocks to prefetch. This allows us to
1971eda14cbcSMatt Macy * prioritize blocks that we will need first for the main traversal
1972eda14cbcSMatt Macy * thread.
1973eda14cbcSMatt Macy */
1974eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
1975eda14cbcSMatt Macy if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
1976eda14cbcSMatt Macy /* this block is already queued for prefetch */
1977eda14cbcSMatt Macy kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
1978eda14cbcSMatt Macy scan_prefetch_ctx_rele(spc, scn);
1979eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
1980eda14cbcSMatt Macy return;
1981eda14cbcSMatt Macy }
1982eda14cbcSMatt Macy
1983eda14cbcSMatt Macy avl_insert(&scn->scn_prefetch_queue, spic, idx);
1984eda14cbcSMatt Macy cv_broadcast(&spa->spa_scrub_io_cv);
1985eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
1986eda14cbcSMatt Macy }
1987eda14cbcSMatt Macy
1988eda14cbcSMatt Macy static void
dsl_scan_prefetch_dnode(dsl_scan_t * scn,dnode_phys_t * dnp,uint64_t objset,uint64_t object)1989eda14cbcSMatt Macy dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
1990eda14cbcSMatt Macy uint64_t objset, uint64_t object)
1991eda14cbcSMatt Macy {
1992eda14cbcSMatt Macy int i;
1993eda14cbcSMatt Macy zbookmark_phys_t zb;
1994eda14cbcSMatt Macy scan_prefetch_ctx_t *spc;
1995eda14cbcSMatt Macy
1996eda14cbcSMatt Macy if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1997eda14cbcSMatt Macy return;
1998eda14cbcSMatt Macy
1999eda14cbcSMatt Macy SET_BOOKMARK(&zb, objset, object, 0, 0);
2000eda14cbcSMatt Macy
2001eda14cbcSMatt Macy spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
2002eda14cbcSMatt Macy
2003eda14cbcSMatt Macy for (i = 0; i < dnp->dn_nblkptr; i++) {
2004eda14cbcSMatt Macy zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
2005eda14cbcSMatt Macy zb.zb_blkid = i;
2006eda14cbcSMatt Macy dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
2007eda14cbcSMatt Macy }
2008eda14cbcSMatt Macy
2009eda14cbcSMatt Macy if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
2010eda14cbcSMatt Macy zb.zb_level = 0;
2011eda14cbcSMatt Macy zb.zb_blkid = DMU_SPILL_BLKID;
2012eda14cbcSMatt Macy dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
2013eda14cbcSMatt Macy }
2014eda14cbcSMatt Macy
2015eda14cbcSMatt Macy scan_prefetch_ctx_rele(spc, FTAG);
2016eda14cbcSMatt Macy }
2017eda14cbcSMatt Macy
2018eda14cbcSMatt Macy static void
dsl_scan_prefetch_cb(zio_t * zio,const zbookmark_phys_t * zb,const blkptr_t * bp,arc_buf_t * buf,void * private)2019eda14cbcSMatt Macy dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
2020eda14cbcSMatt Macy arc_buf_t *buf, void *private)
2021eda14cbcSMatt Macy {
2022e92ffd9bSMartin Matuska (void) zio;
2023eda14cbcSMatt Macy scan_prefetch_ctx_t *spc = private;
2024eda14cbcSMatt Macy dsl_scan_t *scn = spc->spc_scn;
2025eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
2026eda14cbcSMatt Macy
2027eda14cbcSMatt Macy /* broadcast that the IO has completed for rate limiting purposes */
2028eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
2029eda14cbcSMatt Macy ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
2030eda14cbcSMatt Macy spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
2031eda14cbcSMatt Macy cv_broadcast(&spa->spa_scrub_io_cv);
2032eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
2033eda14cbcSMatt Macy
2034eda14cbcSMatt Macy /* if there was an error or we are done prefetching, just cleanup */
2035eda14cbcSMatt Macy if (buf == NULL || scn->scn_prefetch_stop)
2036eda14cbcSMatt Macy goto out;
2037eda14cbcSMatt Macy
2038eda14cbcSMatt Macy if (BP_GET_LEVEL(bp) > 0) {
2039eda14cbcSMatt Macy int i;
2040eda14cbcSMatt Macy blkptr_t *cbp;
2041eda14cbcSMatt Macy int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
2042eda14cbcSMatt Macy zbookmark_phys_t czb;
2043eda14cbcSMatt Macy
2044eda14cbcSMatt Macy for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
2045eda14cbcSMatt Macy SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
2046eda14cbcSMatt Macy zb->zb_level - 1, zb->zb_blkid * epb + i);
2047eda14cbcSMatt Macy dsl_scan_prefetch(spc, cbp, &czb);
2048eda14cbcSMatt Macy }
2049eda14cbcSMatt Macy } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
2050eda14cbcSMatt Macy dnode_phys_t *cdnp;
2051eda14cbcSMatt Macy int i;
2052eda14cbcSMatt Macy int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
2053eda14cbcSMatt Macy
2054eda14cbcSMatt Macy for (i = 0, cdnp = buf->b_data; i < epb;
2055eda14cbcSMatt Macy i += cdnp->dn_extra_slots + 1,
2056eda14cbcSMatt Macy cdnp += cdnp->dn_extra_slots + 1) {
2057eda14cbcSMatt Macy dsl_scan_prefetch_dnode(scn, cdnp,
2058eda14cbcSMatt Macy zb->zb_objset, zb->zb_blkid * epb + i);
2059eda14cbcSMatt Macy }
2060eda14cbcSMatt Macy } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
2061eda14cbcSMatt Macy objset_phys_t *osp = buf->b_data;
2062eda14cbcSMatt Macy
2063eda14cbcSMatt Macy dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
2064eda14cbcSMatt Macy zb->zb_objset, DMU_META_DNODE_OBJECT);
2065eda14cbcSMatt Macy
2066eda14cbcSMatt Macy if (OBJSET_BUF_HAS_USERUSED(buf)) {
2067315ee00fSMartin Matuska if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
2068315ee00fSMartin Matuska dsl_scan_prefetch_dnode(scn,
2069315ee00fSMartin Matuska &osp->os_projectused_dnode, zb->zb_objset,
2070315ee00fSMartin Matuska DMU_PROJECTUSED_OBJECT);
2071315ee00fSMartin Matuska }
2072eda14cbcSMatt Macy dsl_scan_prefetch_dnode(scn,
2073eda14cbcSMatt Macy &osp->os_groupused_dnode, zb->zb_objset,
2074eda14cbcSMatt Macy DMU_GROUPUSED_OBJECT);
2075eda14cbcSMatt Macy dsl_scan_prefetch_dnode(scn,
2076eda14cbcSMatt Macy &osp->os_userused_dnode, zb->zb_objset,
2077eda14cbcSMatt Macy DMU_USERUSED_OBJECT);
2078eda14cbcSMatt Macy }
2079eda14cbcSMatt Macy }
2080eda14cbcSMatt Macy
2081eda14cbcSMatt Macy out:
2082eda14cbcSMatt Macy if (buf != NULL)
2083eda14cbcSMatt Macy arc_buf_destroy(buf, private);
2084eda14cbcSMatt Macy scan_prefetch_ctx_rele(spc, scn);
2085eda14cbcSMatt Macy }
2086eda14cbcSMatt Macy
2087eda14cbcSMatt Macy static void
dsl_scan_prefetch_thread(void * arg)2088eda14cbcSMatt Macy dsl_scan_prefetch_thread(void *arg)
2089eda14cbcSMatt Macy {
2090eda14cbcSMatt Macy dsl_scan_t *scn = arg;
2091eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
2092eda14cbcSMatt Macy scan_prefetch_issue_ctx_t *spic;
2093eda14cbcSMatt Macy
2094eda14cbcSMatt Macy /* loop until we are told to stop */
2095eda14cbcSMatt Macy while (!scn->scn_prefetch_stop) {
2096eda14cbcSMatt Macy arc_flags_t flags = ARC_FLAG_NOWAIT |
2097eda14cbcSMatt Macy ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
2098eda14cbcSMatt Macy int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
2099eda14cbcSMatt Macy
2100eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
2101eda14cbcSMatt Macy
2102eda14cbcSMatt Macy /*
2103eda14cbcSMatt Macy * Wait until we have an IO to issue and are not above our
2104eda14cbcSMatt Macy * maximum in flight limit.
2105eda14cbcSMatt Macy */
2106eda14cbcSMatt Macy while (!scn->scn_prefetch_stop &&
2107eda14cbcSMatt Macy (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
2108eda14cbcSMatt Macy spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
2109eda14cbcSMatt Macy cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2110eda14cbcSMatt Macy }
2111eda14cbcSMatt Macy
2112eda14cbcSMatt Macy /* recheck if we should stop since we waited for the cv */
2113eda14cbcSMatt Macy if (scn->scn_prefetch_stop) {
2114eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
2115eda14cbcSMatt Macy break;
2116eda14cbcSMatt Macy }
2117eda14cbcSMatt Macy
2118eda14cbcSMatt Macy /* remove the prefetch IO from the tree */
2119eda14cbcSMatt Macy spic = avl_first(&scn->scn_prefetch_queue);
2120eda14cbcSMatt Macy spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
2121eda14cbcSMatt Macy avl_remove(&scn->scn_prefetch_queue, spic);
2122eda14cbcSMatt Macy
2123eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
2124eda14cbcSMatt Macy
2125eda14cbcSMatt Macy if (BP_IS_PROTECTED(&spic->spic_bp)) {
2126eda14cbcSMatt Macy ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
2127eda14cbcSMatt Macy BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
2128eda14cbcSMatt Macy ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
2129eda14cbcSMatt Macy zio_flags |= ZIO_FLAG_RAW;
2130eda14cbcSMatt Macy }
2131eda14cbcSMatt Macy
2132315ee00fSMartin Matuska /* We don't need data L1 buffer since we do not prefetch L0. */
2133315ee00fSMartin Matuska blkptr_t *bp = &spic->spic_bp;
2134315ee00fSMartin Matuska if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
2135315ee00fSMartin Matuska BP_GET_TYPE(bp) != DMU_OT_OBJSET)
2136315ee00fSMartin Matuska flags |= ARC_FLAG_NO_BUF;
2137315ee00fSMartin Matuska
2138eda14cbcSMatt Macy /* issue the prefetch asynchronously */
2139315ee00fSMartin Matuska (void) arc_read(scn->scn_zio_root, spa, bp,
2140315ee00fSMartin Matuska dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
2141315ee00fSMartin Matuska zio_flags, &flags, &spic->spic_zb);
2142eda14cbcSMatt Macy
2143eda14cbcSMatt Macy kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
2144eda14cbcSMatt Macy }
2145eda14cbcSMatt Macy
2146eda14cbcSMatt Macy ASSERT(scn->scn_prefetch_stop);
2147eda14cbcSMatt Macy
2148eda14cbcSMatt Macy /* free any prefetches we didn't get to complete */
2149eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
2150eda14cbcSMatt Macy while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
2151eda14cbcSMatt Macy avl_remove(&scn->scn_prefetch_queue, spic);
2152eda14cbcSMatt Macy scan_prefetch_ctx_rele(spic->spic_spc, scn);
2153eda14cbcSMatt Macy kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
2154eda14cbcSMatt Macy }
2155eda14cbcSMatt Macy ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
2156eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
2157eda14cbcSMatt Macy }
2158eda14cbcSMatt Macy
2159eda14cbcSMatt Macy static boolean_t
dsl_scan_check_resume(dsl_scan_t * scn,const dnode_phys_t * dnp,const zbookmark_phys_t * zb)2160eda14cbcSMatt Macy dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
2161eda14cbcSMatt Macy const zbookmark_phys_t *zb)
2162eda14cbcSMatt Macy {
2163eda14cbcSMatt Macy /*
2164eda14cbcSMatt Macy * We never skip over user/group accounting objects (obj<0)
2165eda14cbcSMatt Macy */
2166eda14cbcSMatt Macy if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
2167eda14cbcSMatt Macy (int64_t)zb->zb_object >= 0) {
2168eda14cbcSMatt Macy /*
2169eda14cbcSMatt Macy * If we already visited this bp & everything below (in
2170eda14cbcSMatt Macy * a prior txg sync), don't bother doing it again.
2171eda14cbcSMatt Macy */
2172eda14cbcSMatt Macy if (zbookmark_subtree_completed(dnp, zb,
2173eda14cbcSMatt Macy &scn->scn_phys.scn_bookmark))
2174eda14cbcSMatt Macy return (B_TRUE);
2175eda14cbcSMatt Macy
2176eda14cbcSMatt Macy /*
2177eda14cbcSMatt Macy * If we found the block we're trying to resume from, or
2178271171e0SMartin Matuska * we went past it, zero it out to indicate that it's OK
2179271171e0SMartin Matuska * to start checking for suspending again.
2180eda14cbcSMatt Macy */
2181271171e0SMartin Matuska if (zbookmark_subtree_tbd(dnp, zb,
2182271171e0SMartin Matuska &scn->scn_phys.scn_bookmark)) {
2183eda14cbcSMatt Macy dprintf("resuming at %llx/%llx/%llx/%llx\n",
2184eda14cbcSMatt Macy (longlong_t)zb->zb_objset,
2185eda14cbcSMatt Macy (longlong_t)zb->zb_object,
2186eda14cbcSMatt Macy (longlong_t)zb->zb_level,
2187eda14cbcSMatt Macy (longlong_t)zb->zb_blkid);
2188da5137abSMartin Matuska memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb));
2189eda14cbcSMatt Macy }
2190eda14cbcSMatt Macy }
2191eda14cbcSMatt Macy return (B_FALSE);
2192eda14cbcSMatt Macy }
2193eda14cbcSMatt Macy
21942276e539SMartin Matuska static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
2195eda14cbcSMatt Macy dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
2196eda14cbcSMatt Macy dmu_objset_type_t ostype, dmu_tx_t *tx);
2197eda14cbcSMatt Macy inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
2198eda14cbcSMatt Macy dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
2199eda14cbcSMatt Macy dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
2200eda14cbcSMatt Macy
2201eda14cbcSMatt Macy /*
2202eda14cbcSMatt Macy * Return nonzero on i/o error.
2203eda14cbcSMatt Macy * Return new buf to write out in *bufp.
2204eda14cbcSMatt Macy */
2205eda14cbcSMatt Macy inline __attribute__((always_inline)) static int
dsl_scan_recurse(dsl_scan_t * scn,dsl_dataset_t * ds,dmu_objset_type_t ostype,dnode_phys_t * dnp,const blkptr_t * bp,const zbookmark_phys_t * zb,dmu_tx_t * tx)2206eda14cbcSMatt Macy dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
2207eda14cbcSMatt Macy dnode_phys_t *dnp, const blkptr_t *bp,
2208eda14cbcSMatt Macy const zbookmark_phys_t *zb, dmu_tx_t *tx)
2209eda14cbcSMatt Macy {
2210eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
2211e3aa18adSMartin Matuska spa_t *spa = dp->dp_spa;
2212eda14cbcSMatt Macy int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
2213eda14cbcSMatt Macy int err;
2214eda14cbcSMatt Macy
2215eda14cbcSMatt Macy ASSERT(!BP_IS_REDACTED(bp));
2216eda14cbcSMatt Macy
2217c03c5b1cSMartin Matuska /*
2218c03c5b1cSMartin Matuska * There is an unlikely case of encountering dnodes with contradicting
2219c03c5b1cSMartin Matuska * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created
2220c03c5b1cSMartin Matuska * or modified before commit 4254acb was merged. As it is not possible
2221c03c5b1cSMartin Matuska * to know which of the two is correct, report an error.
2222c03c5b1cSMartin Matuska */
2223c03c5b1cSMartin Matuska if (dnp != NULL &&
2224c03c5b1cSMartin Matuska dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
2225c03c5b1cSMartin Matuska scn->scn_phys.scn_errors++;
2226783d3ff6SMartin Matuska spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
2227c03c5b1cSMartin Matuska return (SET_ERROR(EINVAL));
2228c03c5b1cSMartin Matuska }
2229c03c5b1cSMartin Matuska
2230eda14cbcSMatt Macy if (BP_GET_LEVEL(bp) > 0) {
2231eda14cbcSMatt Macy arc_flags_t flags = ARC_FLAG_WAIT;
2232eda14cbcSMatt Macy int i;
2233eda14cbcSMatt Macy blkptr_t *cbp;
2234eda14cbcSMatt Macy int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
2235eda14cbcSMatt Macy arc_buf_t *buf;
2236eda14cbcSMatt Macy
2237e3aa18adSMartin Matuska err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
2238eda14cbcSMatt Macy ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
2239eda14cbcSMatt Macy if (err) {
2240eda14cbcSMatt Macy scn->scn_phys.scn_errors++;
2241eda14cbcSMatt Macy return (err);
2242eda14cbcSMatt Macy }
2243eda14cbcSMatt Macy for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
2244eda14cbcSMatt Macy zbookmark_phys_t czb;
2245eda14cbcSMatt Macy
2246eda14cbcSMatt Macy SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
2247eda14cbcSMatt Macy zb->zb_level - 1,
2248eda14cbcSMatt Macy zb->zb_blkid * epb + i);
2249eda14cbcSMatt Macy dsl_scan_visitbp(cbp, &czb, dnp,
2250eda14cbcSMatt Macy ds, scn, ostype, tx);
2251eda14cbcSMatt Macy }
2252eda14cbcSMatt Macy arc_buf_destroy(buf, &buf);
2253eda14cbcSMatt Macy } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
2254eda14cbcSMatt Macy arc_flags_t flags = ARC_FLAG_WAIT;
2255eda14cbcSMatt Macy dnode_phys_t *cdnp;
2256eda14cbcSMatt Macy int i;
2257eda14cbcSMatt Macy int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
2258eda14cbcSMatt Macy arc_buf_t *buf;
2259eda14cbcSMatt Macy
2260eda14cbcSMatt Macy if (BP_IS_PROTECTED(bp)) {
2261eda14cbcSMatt Macy ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
2262eda14cbcSMatt Macy zio_flags |= ZIO_FLAG_RAW;
2263eda14cbcSMatt Macy }
2264eda14cbcSMatt Macy
2265e3aa18adSMartin Matuska err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
2266eda14cbcSMatt Macy ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
2267eda14cbcSMatt Macy if (err) {
2268eda14cbcSMatt Macy scn->scn_phys.scn_errors++;
2269eda14cbcSMatt Macy return (err);
2270eda14cbcSMatt Macy }
2271eda14cbcSMatt Macy for (i = 0, cdnp = buf->b_data; i < epb;
2272eda14cbcSMatt Macy i += cdnp->dn_extra_slots + 1,
2273eda14cbcSMatt Macy cdnp += cdnp->dn_extra_slots + 1) {
2274eda14cbcSMatt Macy dsl_scan_visitdnode(scn, ds, ostype,
2275eda14cbcSMatt Macy cdnp, zb->zb_blkid * epb + i, tx);
2276eda14cbcSMatt Macy }
2277eda14cbcSMatt Macy
2278eda14cbcSMatt Macy arc_buf_destroy(buf, &buf);
2279eda14cbcSMatt Macy } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
2280eda14cbcSMatt Macy arc_flags_t flags = ARC_FLAG_WAIT;
2281eda14cbcSMatt Macy objset_phys_t *osp;
2282eda14cbcSMatt Macy arc_buf_t *buf;
2283eda14cbcSMatt Macy
2284e3aa18adSMartin Matuska err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
2285eda14cbcSMatt Macy ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
2286eda14cbcSMatt Macy if (err) {
2287eda14cbcSMatt Macy scn->scn_phys.scn_errors++;
2288eda14cbcSMatt Macy return (err);
2289eda14cbcSMatt Macy }
2290eda14cbcSMatt Macy
2291eda14cbcSMatt Macy osp = buf->b_data;
2292eda14cbcSMatt Macy
2293eda14cbcSMatt Macy dsl_scan_visitdnode(scn, ds, osp->os_type,
2294eda14cbcSMatt Macy &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
2295eda14cbcSMatt Macy
2296eda14cbcSMatt Macy if (OBJSET_BUF_HAS_USERUSED(buf)) {
2297eda14cbcSMatt Macy /*
2298eda14cbcSMatt Macy * We also always visit user/group/project accounting
2299eda14cbcSMatt Macy * objects, and never skip them, even if we are
2300eda14cbcSMatt Macy * suspending. This is necessary so that the
2301eda14cbcSMatt Macy * space deltas from this txg get integrated.
2302eda14cbcSMatt Macy */
2303eda14cbcSMatt Macy if (OBJSET_BUF_HAS_PROJECTUSED(buf))
2304eda14cbcSMatt Macy dsl_scan_visitdnode(scn, ds, osp->os_type,
2305eda14cbcSMatt Macy &osp->os_projectused_dnode,
2306eda14cbcSMatt Macy DMU_PROJECTUSED_OBJECT, tx);
2307eda14cbcSMatt Macy dsl_scan_visitdnode(scn, ds, osp->os_type,
2308eda14cbcSMatt Macy &osp->os_groupused_dnode,
2309eda14cbcSMatt Macy DMU_GROUPUSED_OBJECT, tx);
2310eda14cbcSMatt Macy dsl_scan_visitdnode(scn, ds, osp->os_type,
2311eda14cbcSMatt Macy &osp->os_userused_dnode,
2312eda14cbcSMatt Macy DMU_USERUSED_OBJECT, tx);
2313eda14cbcSMatt Macy }
2314eda14cbcSMatt Macy arc_buf_destroy(buf, &buf);
2315d2a8fad3SMartin Matuska } else if (zfs_blkptr_verify(spa, bp,
2316e639e0d2SMartin Matuska BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
2317e3aa18adSMartin Matuska /*
2318e3aa18adSMartin Matuska * Sanity check the block pointer contents, this is handled
2319e3aa18adSMartin Matuska * by arc_read() for the cases above.
2320e3aa18adSMartin Matuska */
2321e3aa18adSMartin Matuska scn->scn_phys.scn_errors++;
2322783d3ff6SMartin Matuska spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
2323e3aa18adSMartin Matuska return (SET_ERROR(EINVAL));
2324eda14cbcSMatt Macy }
2325eda14cbcSMatt Macy
2326eda14cbcSMatt Macy return (0);
2327eda14cbcSMatt Macy }
2328eda14cbcSMatt Macy
2329eda14cbcSMatt Macy inline __attribute__((always_inline)) static void
dsl_scan_visitdnode(dsl_scan_t * scn,dsl_dataset_t * ds,dmu_objset_type_t ostype,dnode_phys_t * dnp,uint64_t object,dmu_tx_t * tx)2330eda14cbcSMatt Macy dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
2331eda14cbcSMatt Macy dmu_objset_type_t ostype, dnode_phys_t *dnp,
2332eda14cbcSMatt Macy uint64_t object, dmu_tx_t *tx)
2333eda14cbcSMatt Macy {
2334eda14cbcSMatt Macy int j;
2335eda14cbcSMatt Macy
2336eda14cbcSMatt Macy for (j = 0; j < dnp->dn_nblkptr; j++) {
2337eda14cbcSMatt Macy zbookmark_phys_t czb;
2338eda14cbcSMatt Macy
2339eda14cbcSMatt Macy SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
2340eda14cbcSMatt Macy dnp->dn_nlevels - 1, j);
2341eda14cbcSMatt Macy dsl_scan_visitbp(&dnp->dn_blkptr[j],
2342eda14cbcSMatt Macy &czb, dnp, ds, scn, ostype, tx);
2343eda14cbcSMatt Macy }
2344eda14cbcSMatt Macy
2345eda14cbcSMatt Macy if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
2346eda14cbcSMatt Macy zbookmark_phys_t czb;
2347eda14cbcSMatt Macy SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
2348eda14cbcSMatt Macy 0, DMU_SPILL_BLKID);
2349eda14cbcSMatt Macy dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
2350eda14cbcSMatt Macy &czb, dnp, ds, scn, ostype, tx);
2351eda14cbcSMatt Macy }
2352eda14cbcSMatt Macy }
2353eda14cbcSMatt Macy
2354eda14cbcSMatt Macy /*
2355eda14cbcSMatt Macy * The arguments are in this order because mdb can only print the
2356eda14cbcSMatt Macy * first 5; we want them to be useful.
2357eda14cbcSMatt Macy */
2358eda14cbcSMatt Macy static void
dsl_scan_visitbp(const blkptr_t * bp,const zbookmark_phys_t * zb,dnode_phys_t * dnp,dsl_dataset_t * ds,dsl_scan_t * scn,dmu_objset_type_t ostype,dmu_tx_t * tx)23592276e539SMartin Matuska dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
2360eda14cbcSMatt Macy dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
2361eda14cbcSMatt Macy dmu_objset_type_t ostype, dmu_tx_t *tx)
2362eda14cbcSMatt Macy {
2363eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
2364eda14cbcSMatt Macy
2365eda14cbcSMatt Macy if (dsl_scan_check_suspend(scn, zb))
2366eda14cbcSMatt Macy return;
2367eda14cbcSMatt Macy
2368eda14cbcSMatt Macy if (dsl_scan_check_resume(scn, dnp, zb))
2369eda14cbcSMatt Macy return;
2370eda14cbcSMatt Macy
2371eda14cbcSMatt Macy scn->scn_visited_this_txg++;
2372eda14cbcSMatt Macy
2373eda14cbcSMatt Macy if (BP_IS_HOLE(bp)) {
2374eda14cbcSMatt Macy scn->scn_holes_this_txg++;
2375eda14cbcSMatt Macy return;
2376eda14cbcSMatt Macy }
2377eda14cbcSMatt Macy
2378eda14cbcSMatt Macy if (BP_IS_REDACTED(bp)) {
2379eda14cbcSMatt Macy ASSERT(dsl_dataset_feature_is_active(ds,
2380eda14cbcSMatt Macy SPA_FEATURE_REDACTED_DATASETS));
2381eda14cbcSMatt Macy return;
2382eda14cbcSMatt Macy }
2383eda14cbcSMatt Macy
2384c9539b89SMartin Matuska /*
2385c9539b89SMartin Matuska * Check if this block contradicts any filesystem flags.
2386c9539b89SMartin Matuska */
2387c9539b89SMartin Matuska spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
2388c9539b89SMartin Matuska if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
2389c9539b89SMartin Matuska ASSERT(dsl_dataset_feature_is_active(ds, f));
2390c9539b89SMartin Matuska
2391c9539b89SMartin Matuska f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
2392c9539b89SMartin Matuska if (f != SPA_FEATURE_NONE)
2393c9539b89SMartin Matuska ASSERT(dsl_dataset_feature_is_active(ds, f));
2394c9539b89SMartin Matuska
2395c9539b89SMartin Matuska f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
2396c9539b89SMartin Matuska if (f != SPA_FEATURE_NONE)
2397c9539b89SMartin Matuska ASSERT(dsl_dataset_feature_is_active(ds, f));
2398c9539b89SMartin Matuska
2399783d3ff6SMartin Matuska if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
2400c9539b89SMartin Matuska scn->scn_lt_min_this_txg++;
2401c9539b89SMartin Matuska return;
2402c9539b89SMartin Matuska }
2403c9539b89SMartin Matuska
24042276e539SMartin Matuska if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0)
24052276e539SMartin Matuska return;
2406eda14cbcSMatt Macy
2407eda14cbcSMatt Macy /*
2408eda14cbcSMatt Macy * If dsl_scan_ddt() has already visited this block, it will have
2409eda14cbcSMatt Macy * already done any translations or scrubbing, so don't call the
2410eda14cbcSMatt Macy * callback again.
2411eda14cbcSMatt Macy */
2412eda14cbcSMatt Macy if (ddt_class_contains(dp->dp_spa,
2413eda14cbcSMatt Macy scn->scn_phys.scn_ddt_class_max, bp)) {
2414eda14cbcSMatt Macy scn->scn_ddt_contained_this_txg++;
24152276e539SMartin Matuska return;
2416eda14cbcSMatt Macy }
2417eda14cbcSMatt Macy
2418eda14cbcSMatt Macy /*
2419eda14cbcSMatt Macy * If this block is from the future (after cur_max_txg), then we
2420eda14cbcSMatt Macy * are doing this on behalf of a deleted snapshot, and we will
2421eda14cbcSMatt Macy * revisit the future block on the next pass of this dataset.
2422eda14cbcSMatt Macy * Don't scan it now unless we need to because something
2423eda14cbcSMatt Macy * under it was modified.
2424eda14cbcSMatt Macy */
2425783d3ff6SMartin Matuska if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
2426eda14cbcSMatt Macy scn->scn_gt_max_this_txg++;
24272276e539SMartin Matuska return;
2428eda14cbcSMatt Macy }
2429eda14cbcSMatt Macy
2430eda14cbcSMatt Macy scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
2431eda14cbcSMatt Macy }
2432eda14cbcSMatt Macy
2433eda14cbcSMatt Macy static void
dsl_scan_visit_rootbp(dsl_scan_t * scn,dsl_dataset_t * ds,blkptr_t * bp,dmu_tx_t * tx)2434eda14cbcSMatt Macy dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
2435eda14cbcSMatt Macy dmu_tx_t *tx)
2436eda14cbcSMatt Macy {
2437eda14cbcSMatt Macy zbookmark_phys_t zb;
2438eda14cbcSMatt Macy scan_prefetch_ctx_t *spc;
2439eda14cbcSMatt Macy
2440eda14cbcSMatt Macy SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
2441eda14cbcSMatt Macy ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
2442eda14cbcSMatt Macy
2443eda14cbcSMatt Macy if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
2444eda14cbcSMatt Macy SET_BOOKMARK(&scn->scn_prefetch_bookmark,
2445eda14cbcSMatt Macy zb.zb_objset, 0, 0, 0);
2446eda14cbcSMatt Macy } else {
2447eda14cbcSMatt Macy scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
2448eda14cbcSMatt Macy }
2449eda14cbcSMatt Macy
2450eda14cbcSMatt Macy scn->scn_objsets_visited_this_txg++;
2451eda14cbcSMatt Macy
2452eda14cbcSMatt Macy spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
2453eda14cbcSMatt Macy dsl_scan_prefetch(spc, bp, &zb);
2454eda14cbcSMatt Macy scan_prefetch_ctx_rele(spc, FTAG);
2455eda14cbcSMatt Macy
2456eda14cbcSMatt Macy dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
2457eda14cbcSMatt Macy
2458eda14cbcSMatt Macy dprintf_ds(ds, "finished scan%s", "");
2459eda14cbcSMatt Macy }
2460eda14cbcSMatt Macy
2461eda14cbcSMatt Macy static void
ds_destroyed_scn_phys(dsl_dataset_t * ds,dsl_scan_phys_t * scn_phys)2462eda14cbcSMatt Macy ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
2463eda14cbcSMatt Macy {
2464eda14cbcSMatt Macy if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
2465eda14cbcSMatt Macy if (ds->ds_is_snapshot) {
2466eda14cbcSMatt Macy /*
2467eda14cbcSMatt Macy * Note:
2468eda14cbcSMatt Macy * - scn_cur_{min,max}_txg stays the same.
2469eda14cbcSMatt Macy * - Setting the flag is not really necessary if
2470eda14cbcSMatt Macy * scn_cur_max_txg == scn_max_txg, because there
2471eda14cbcSMatt Macy * is nothing after this snapshot that we care
2472eda14cbcSMatt Macy * about. However, we set it anyway and then
2473eda14cbcSMatt Macy * ignore it when we retraverse it in
2474eda14cbcSMatt Macy * dsl_scan_visitds().
2475eda14cbcSMatt Macy */
2476eda14cbcSMatt Macy scn_phys->scn_bookmark.zb_objset =
2477eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_snap_obj;
247881b22a98SMartin Matuska zfs_dbgmsg("destroying ds %llu on %s; currently "
247981b22a98SMartin Matuska "traversing; reset zb_objset to %llu",
2480eda14cbcSMatt Macy (u_longlong_t)ds->ds_object,
248181b22a98SMartin Matuska ds->ds_dir->dd_pool->dp_spa->spa_name,
2482eda14cbcSMatt Macy (u_longlong_t)dsl_dataset_phys(ds)->
2483eda14cbcSMatt Macy ds_next_snap_obj);
2484eda14cbcSMatt Macy scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
2485eda14cbcSMatt Macy } else {
2486eda14cbcSMatt Macy SET_BOOKMARK(&scn_phys->scn_bookmark,
2487eda14cbcSMatt Macy ZB_DESTROYED_OBJSET, 0, 0, 0);
248881b22a98SMartin Matuska zfs_dbgmsg("destroying ds %llu on %s; currently "
248981b22a98SMartin Matuska "traversing; reset bookmark to -1,0,0,0",
249081b22a98SMartin Matuska (u_longlong_t)ds->ds_object,
249181b22a98SMartin Matuska ds->ds_dir->dd_pool->dp_spa->spa_name);
2492eda14cbcSMatt Macy }
2493eda14cbcSMatt Macy }
2494eda14cbcSMatt Macy }
2495eda14cbcSMatt Macy
2496eda14cbcSMatt Macy /*
2497eda14cbcSMatt Macy * Invoked when a dataset is destroyed. We need to make sure that:
2498eda14cbcSMatt Macy *
2499eda14cbcSMatt Macy * 1) If it is the dataset that was currently being scanned, we write
2500eda14cbcSMatt Macy * a new dsl_scan_phys_t and marking the objset reference in it
2501eda14cbcSMatt Macy * as destroyed.
2502eda14cbcSMatt Macy * 2) Remove it from the work queue, if it was present.
2503eda14cbcSMatt Macy *
2504eda14cbcSMatt Macy * If the dataset was actually a snapshot, instead of marking the dataset
2505eda14cbcSMatt Macy * as destroyed, we instead substitute the next snapshot in line.
2506eda14cbcSMatt Macy */
2507eda14cbcSMatt Macy void
dsl_scan_ds_destroyed(dsl_dataset_t * ds,dmu_tx_t * tx)2508eda14cbcSMatt Macy dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
2509eda14cbcSMatt Macy {
2510eda14cbcSMatt Macy dsl_pool_t *dp = ds->ds_dir->dd_pool;
2511eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
2512eda14cbcSMatt Macy uint64_t mintxg;
2513eda14cbcSMatt Macy
2514eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
2515eda14cbcSMatt Macy return;
2516eda14cbcSMatt Macy
2517eda14cbcSMatt Macy ds_destroyed_scn_phys(ds, &scn->scn_phys);
2518eda14cbcSMatt Macy ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
2519eda14cbcSMatt Macy
2520eda14cbcSMatt Macy if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
2521eda14cbcSMatt Macy scan_ds_queue_remove(scn, ds->ds_object);
2522eda14cbcSMatt Macy if (ds->ds_is_snapshot)
2523eda14cbcSMatt Macy scan_ds_queue_insert(scn,
2524eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
2525eda14cbcSMatt Macy }
2526eda14cbcSMatt Macy
2527eda14cbcSMatt Macy if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2528eda14cbcSMatt Macy ds->ds_object, &mintxg) == 0) {
2529eda14cbcSMatt Macy ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
2530eda14cbcSMatt Macy VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2531eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
2532eda14cbcSMatt Macy if (ds->ds_is_snapshot) {
2533eda14cbcSMatt Macy /*
2534eda14cbcSMatt Macy * We keep the same mintxg; it could be >
2535eda14cbcSMatt Macy * ds_creation_txg if the previous snapshot was
2536eda14cbcSMatt Macy * deleted too.
2537eda14cbcSMatt Macy */
2538eda14cbcSMatt Macy VERIFY(zap_add_int_key(dp->dp_meta_objset,
2539eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj,
2540eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_snap_obj,
2541eda14cbcSMatt Macy mintxg, tx) == 0);
254281b22a98SMartin Matuska zfs_dbgmsg("destroying ds %llu on %s; in queue; "
2543eda14cbcSMatt Macy "replacing with %llu",
2544eda14cbcSMatt Macy (u_longlong_t)ds->ds_object,
254581b22a98SMartin Matuska dp->dp_spa->spa_name,
2546eda14cbcSMatt Macy (u_longlong_t)dsl_dataset_phys(ds)->
2547eda14cbcSMatt Macy ds_next_snap_obj);
2548eda14cbcSMatt Macy } else {
254981b22a98SMartin Matuska zfs_dbgmsg("destroying ds %llu on %s; in queue; "
255081b22a98SMartin Matuska "removing",
255181b22a98SMartin Matuska (u_longlong_t)ds->ds_object,
255281b22a98SMartin Matuska dp->dp_spa->spa_name);
2553eda14cbcSMatt Macy }
2554eda14cbcSMatt Macy }
2555eda14cbcSMatt Macy
2556eda14cbcSMatt Macy /*
2557eda14cbcSMatt Macy * dsl_scan_sync() should be called after this, and should sync
2558eda14cbcSMatt Macy * out our changed state, but just to be safe, do it here.
2559eda14cbcSMatt Macy */
2560eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2561eda14cbcSMatt Macy }
2562eda14cbcSMatt Macy
2563eda14cbcSMatt Macy static void
ds_snapshotted_bookmark(dsl_dataset_t * ds,zbookmark_phys_t * scn_bookmark)2564eda14cbcSMatt Macy ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
2565eda14cbcSMatt Macy {
2566eda14cbcSMatt Macy if (scn_bookmark->zb_objset == ds->ds_object) {
2567eda14cbcSMatt Macy scn_bookmark->zb_objset =
2568eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_obj;
256981b22a98SMartin Matuska zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; "
2570eda14cbcSMatt Macy "reset zb_objset to %llu",
2571eda14cbcSMatt Macy (u_longlong_t)ds->ds_object,
257281b22a98SMartin Matuska ds->ds_dir->dd_pool->dp_spa->spa_name,
2573eda14cbcSMatt Macy (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
2574eda14cbcSMatt Macy }
2575eda14cbcSMatt Macy }
2576eda14cbcSMatt Macy
2577eda14cbcSMatt Macy /*
2578eda14cbcSMatt Macy * Called when a dataset is snapshotted. If we were currently traversing
2579eda14cbcSMatt Macy * this snapshot, we reset our bookmark to point at the newly created
2580eda14cbcSMatt Macy * snapshot. We also modify our work queue to remove the old snapshot and
2581eda14cbcSMatt Macy * replace with the new one.
2582eda14cbcSMatt Macy */
2583eda14cbcSMatt Macy void
dsl_scan_ds_snapshotted(dsl_dataset_t * ds,dmu_tx_t * tx)2584eda14cbcSMatt Macy dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
2585eda14cbcSMatt Macy {
2586eda14cbcSMatt Macy dsl_pool_t *dp = ds->ds_dir->dd_pool;
2587eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
2588eda14cbcSMatt Macy uint64_t mintxg;
2589eda14cbcSMatt Macy
2590eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
2591eda14cbcSMatt Macy return;
2592eda14cbcSMatt Macy
2593eda14cbcSMatt Macy ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
2594eda14cbcSMatt Macy
2595eda14cbcSMatt Macy ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
2596eda14cbcSMatt Macy ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
2597eda14cbcSMatt Macy
2598eda14cbcSMatt Macy if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
2599eda14cbcSMatt Macy scan_ds_queue_remove(scn, ds->ds_object);
2600eda14cbcSMatt Macy scan_ds_queue_insert(scn,
2601eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
2602eda14cbcSMatt Macy }
2603eda14cbcSMatt Macy
2604eda14cbcSMatt Macy if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
2605eda14cbcSMatt Macy ds->ds_object, &mintxg) == 0) {
2606eda14cbcSMatt Macy VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2607eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
2608eda14cbcSMatt Macy VERIFY(zap_add_int_key(dp->dp_meta_objset,
2609eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj,
2610eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
261181b22a98SMartin Matuska zfs_dbgmsg("snapshotting ds %llu on %s; in queue; "
2612eda14cbcSMatt Macy "replacing with %llu",
2613eda14cbcSMatt Macy (u_longlong_t)ds->ds_object,
261481b22a98SMartin Matuska dp->dp_spa->spa_name,
2615eda14cbcSMatt Macy (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
2616eda14cbcSMatt Macy }
2617eda14cbcSMatt Macy
2618eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2619eda14cbcSMatt Macy }
2620eda14cbcSMatt Macy
2621eda14cbcSMatt Macy static void
ds_clone_swapped_bookmark(dsl_dataset_t * ds1,dsl_dataset_t * ds2,zbookmark_phys_t * scn_bookmark)2622eda14cbcSMatt Macy ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
2623eda14cbcSMatt Macy zbookmark_phys_t *scn_bookmark)
2624eda14cbcSMatt Macy {
2625eda14cbcSMatt Macy if (scn_bookmark->zb_objset == ds1->ds_object) {
2626eda14cbcSMatt Macy scn_bookmark->zb_objset = ds2->ds_object;
262781b22a98SMartin Matuska zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
2628eda14cbcSMatt Macy "reset zb_objset to %llu",
2629eda14cbcSMatt Macy (u_longlong_t)ds1->ds_object,
263081b22a98SMartin Matuska ds1->ds_dir->dd_pool->dp_spa->spa_name,
2631eda14cbcSMatt Macy (u_longlong_t)ds2->ds_object);
2632eda14cbcSMatt Macy } else if (scn_bookmark->zb_objset == ds2->ds_object) {
2633eda14cbcSMatt Macy scn_bookmark->zb_objset = ds1->ds_object;
263481b22a98SMartin Matuska zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
2635eda14cbcSMatt Macy "reset zb_objset to %llu",
2636eda14cbcSMatt Macy (u_longlong_t)ds2->ds_object,
263781b22a98SMartin Matuska ds2->ds_dir->dd_pool->dp_spa->spa_name,
2638eda14cbcSMatt Macy (u_longlong_t)ds1->ds_object);
2639eda14cbcSMatt Macy }
2640eda14cbcSMatt Macy }
2641eda14cbcSMatt Macy
2642eda14cbcSMatt Macy /*
2643eda14cbcSMatt Macy * Called when an origin dataset and its clone are swapped. If we were
2644eda14cbcSMatt Macy * currently traversing the dataset, we need to switch to traversing the
2645eda14cbcSMatt Macy * newly promoted clone.
2646eda14cbcSMatt Macy */
2647eda14cbcSMatt Macy void
dsl_scan_ds_clone_swapped(dsl_dataset_t * ds1,dsl_dataset_t * ds2,dmu_tx_t * tx)2648eda14cbcSMatt Macy dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
2649eda14cbcSMatt Macy {
2650eda14cbcSMatt Macy dsl_pool_t *dp = ds1->ds_dir->dd_pool;
2651eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
2652eda14cbcSMatt Macy uint64_t mintxg1, mintxg2;
2653eda14cbcSMatt Macy boolean_t ds1_queued, ds2_queued;
2654eda14cbcSMatt Macy
2655eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
2656eda14cbcSMatt Macy return;
2657eda14cbcSMatt Macy
2658eda14cbcSMatt Macy ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
2659eda14cbcSMatt Macy ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
2660eda14cbcSMatt Macy
2661eda14cbcSMatt Macy /*
2662eda14cbcSMatt Macy * Handle the in-memory scan queue.
2663eda14cbcSMatt Macy */
2664eda14cbcSMatt Macy ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
2665eda14cbcSMatt Macy ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
2666eda14cbcSMatt Macy
2667eda14cbcSMatt Macy /* Sanity checking. */
2668eda14cbcSMatt Macy if (ds1_queued) {
2669eda14cbcSMatt Macy ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2670eda14cbcSMatt Macy ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2671eda14cbcSMatt Macy }
2672eda14cbcSMatt Macy if (ds2_queued) {
2673eda14cbcSMatt Macy ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2674eda14cbcSMatt Macy ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2675eda14cbcSMatt Macy }
2676eda14cbcSMatt Macy
2677eda14cbcSMatt Macy if (ds1_queued && ds2_queued) {
2678eda14cbcSMatt Macy /*
2679eda14cbcSMatt Macy * If both are queued, we don't need to do anything.
2680eda14cbcSMatt Macy * The swapping code below would not handle this case correctly,
2681eda14cbcSMatt Macy * since we can't insert ds2 if it is already there. That's
2682eda14cbcSMatt Macy * because scan_ds_queue_insert() prohibits a duplicate insert
2683eda14cbcSMatt Macy * and panics.
2684eda14cbcSMatt Macy */
2685eda14cbcSMatt Macy } else if (ds1_queued) {
2686eda14cbcSMatt Macy scan_ds_queue_remove(scn, ds1->ds_object);
2687eda14cbcSMatt Macy scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
2688eda14cbcSMatt Macy } else if (ds2_queued) {
2689eda14cbcSMatt Macy scan_ds_queue_remove(scn, ds2->ds_object);
2690eda14cbcSMatt Macy scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
2691eda14cbcSMatt Macy }
2692eda14cbcSMatt Macy
2693eda14cbcSMatt Macy /*
2694eda14cbcSMatt Macy * Handle the on-disk scan queue.
2695eda14cbcSMatt Macy * The on-disk state is an out-of-date version of the in-memory state,
2696eda14cbcSMatt Macy * so the in-memory and on-disk values for ds1_queued and ds2_queued may
2697eda14cbcSMatt Macy * be different. Therefore we need to apply the swap logic to the
2698eda14cbcSMatt Macy * on-disk state independently of the in-memory state.
2699eda14cbcSMatt Macy */
2700eda14cbcSMatt Macy ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
2701eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
2702eda14cbcSMatt Macy ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
2703eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
2704eda14cbcSMatt Macy
2705eda14cbcSMatt Macy /* Sanity checking. */
2706eda14cbcSMatt Macy if (ds1_queued) {
2707eda14cbcSMatt Macy ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2708eda14cbcSMatt Macy ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2709eda14cbcSMatt Macy }
2710eda14cbcSMatt Macy if (ds2_queued) {
2711eda14cbcSMatt Macy ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
2712eda14cbcSMatt Macy ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
2713eda14cbcSMatt Macy }
2714eda14cbcSMatt Macy
2715eda14cbcSMatt Macy if (ds1_queued && ds2_queued) {
2716eda14cbcSMatt Macy /*
2717eda14cbcSMatt Macy * If both are queued, we don't need to do anything.
2718eda14cbcSMatt Macy * Alternatively, we could check for EEXIST from
2719eda14cbcSMatt Macy * zap_add_int_key() and back out to the original state, but
2720eda14cbcSMatt Macy * that would be more work than checking for this case upfront.
2721eda14cbcSMatt Macy */
2722eda14cbcSMatt Macy } else if (ds1_queued) {
2723eda14cbcSMatt Macy VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
2724eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
2725eda14cbcSMatt Macy VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
2726eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
272781b22a98SMartin Matuska zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
2728eda14cbcSMatt Macy "replacing with %llu",
2729eda14cbcSMatt Macy (u_longlong_t)ds1->ds_object,
273081b22a98SMartin Matuska dp->dp_spa->spa_name,
2731eda14cbcSMatt Macy (u_longlong_t)ds2->ds_object);
2732eda14cbcSMatt Macy } else if (ds2_queued) {
2733eda14cbcSMatt Macy VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
2734eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
2735eda14cbcSMatt Macy VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
2736eda14cbcSMatt Macy scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
273781b22a98SMartin Matuska zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
2738eda14cbcSMatt Macy "replacing with %llu",
2739eda14cbcSMatt Macy (u_longlong_t)ds2->ds_object,
274081b22a98SMartin Matuska dp->dp_spa->spa_name,
2741eda14cbcSMatt Macy (u_longlong_t)ds1->ds_object);
2742eda14cbcSMatt Macy }
2743eda14cbcSMatt Macy
2744eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, SYNC_CACHED);
2745eda14cbcSMatt Macy }
2746eda14cbcSMatt Macy
2747eda14cbcSMatt Macy static int
enqueue_clones_cb(dsl_pool_t * dp,dsl_dataset_t * hds,void * arg)2748eda14cbcSMatt Macy enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
2749eda14cbcSMatt Macy {
2750eda14cbcSMatt Macy uint64_t originobj = *(uint64_t *)arg;
2751eda14cbcSMatt Macy dsl_dataset_t *ds;
2752eda14cbcSMatt Macy int err;
2753eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
2754eda14cbcSMatt Macy
2755eda14cbcSMatt Macy if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
2756eda14cbcSMatt Macy return (0);
2757eda14cbcSMatt Macy
2758eda14cbcSMatt Macy err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
2759eda14cbcSMatt Macy if (err)
2760eda14cbcSMatt Macy return (err);
2761eda14cbcSMatt Macy
2762eda14cbcSMatt Macy while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
2763eda14cbcSMatt Macy dsl_dataset_t *prev;
2764eda14cbcSMatt Macy err = dsl_dataset_hold_obj(dp,
2765eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
2766eda14cbcSMatt Macy
2767eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2768eda14cbcSMatt Macy if (err)
2769eda14cbcSMatt Macy return (err);
2770eda14cbcSMatt Macy ds = prev;
2771eda14cbcSMatt Macy }
277249086aa3SAlexander Motin mutex_enter(&scn->scn_queue_lock);
2773eda14cbcSMatt Macy scan_ds_queue_insert(scn, ds->ds_object,
2774eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_txg);
277549086aa3SAlexander Motin mutex_exit(&scn->scn_queue_lock);
2776eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2777eda14cbcSMatt Macy return (0);
2778eda14cbcSMatt Macy }
2779eda14cbcSMatt Macy
2780eda14cbcSMatt Macy static void
dsl_scan_visitds(dsl_scan_t * scn,uint64_t dsobj,dmu_tx_t * tx)2781eda14cbcSMatt Macy dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
2782eda14cbcSMatt Macy {
2783eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
2784eda14cbcSMatt Macy dsl_dataset_t *ds;
2785eda14cbcSMatt Macy
2786eda14cbcSMatt Macy VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
2787eda14cbcSMatt Macy
2788eda14cbcSMatt Macy if (scn->scn_phys.scn_cur_min_txg >=
2789eda14cbcSMatt Macy scn->scn_phys.scn_max_txg) {
2790eda14cbcSMatt Macy /*
2791eda14cbcSMatt Macy * This can happen if this snapshot was created after the
2792eda14cbcSMatt Macy * scan started, and we already completed a previous snapshot
2793eda14cbcSMatt Macy * that was created after the scan started. This snapshot
2794eda14cbcSMatt Macy * only references blocks with:
2795eda14cbcSMatt Macy *
2796eda14cbcSMatt Macy * birth < our ds_creation_txg
2797eda14cbcSMatt Macy * cur_min_txg is no less than ds_creation_txg.
2798eda14cbcSMatt Macy * We have already visited these blocks.
2799eda14cbcSMatt Macy * or
2800eda14cbcSMatt Macy * birth > scn_max_txg
2801eda14cbcSMatt Macy * The scan requested not to visit these blocks.
2802eda14cbcSMatt Macy *
2803eda14cbcSMatt Macy * Subsequent snapshots (and clones) can reference our
2804eda14cbcSMatt Macy * blocks, or blocks with even higher birth times.
2805eda14cbcSMatt Macy * Therefore we do not need to visit them either,
2806eda14cbcSMatt Macy * so we do not add them to the work queue.
2807eda14cbcSMatt Macy *
2808eda14cbcSMatt Macy * Note that checking for cur_min_txg >= cur_max_txg
2809eda14cbcSMatt Macy * is not sufficient, because in that case we may need to
2810eda14cbcSMatt Macy * visit subsequent snapshots. This happens when min_txg > 0,
2811eda14cbcSMatt Macy * which raises cur_min_txg. In this case we will visit
2812eda14cbcSMatt Macy * this dataset but skip all of its blocks, because the
2813eda14cbcSMatt Macy * rootbp's birth time is < cur_min_txg. Then we will
2814eda14cbcSMatt Macy * add the next snapshots/clones to the work queue.
2815eda14cbcSMatt Macy */
2816eda14cbcSMatt Macy char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
2817eda14cbcSMatt Macy dsl_dataset_name(ds, dsname);
2818eda14cbcSMatt Macy zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
2819eda14cbcSMatt Macy "cur_min_txg (%llu) >= max_txg (%llu)",
2820eda14cbcSMatt Macy (longlong_t)dsobj, dsname,
2821eda14cbcSMatt Macy (longlong_t)scn->scn_phys.scn_cur_min_txg,
2822eda14cbcSMatt Macy (longlong_t)scn->scn_phys.scn_max_txg);
2823eda14cbcSMatt Macy kmem_free(dsname, MAXNAMELEN);
2824eda14cbcSMatt Macy
2825eda14cbcSMatt Macy goto out;
2826eda14cbcSMatt Macy }
2827eda14cbcSMatt Macy
2828eda14cbcSMatt Macy /*
2829eda14cbcSMatt Macy * Only the ZIL in the head (non-snapshot) is valid. Even though
2830eda14cbcSMatt Macy * snapshots can have ZIL block pointers (which may be the same
2831eda14cbcSMatt Macy * BP as in the head), they must be ignored. In addition, $ORIGIN
2832eda14cbcSMatt Macy * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
2833eda14cbcSMatt Macy * need to look for a ZIL in it either. So we traverse the ZIL here,
2834eda14cbcSMatt Macy * rather than in scan_recurse(), because the regular snapshot
2835eda14cbcSMatt Macy * block-sharing rules don't apply to it.
2836eda14cbcSMatt Macy */
2837eda14cbcSMatt Macy if (!dsl_dataset_is_snapshot(ds) &&
2838eda14cbcSMatt Macy (dp->dp_origin_snap == NULL ||
2839eda14cbcSMatt Macy ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
2840eda14cbcSMatt Macy objset_t *os;
2841eda14cbcSMatt Macy if (dmu_objset_from_ds(ds, &os) != 0) {
2842eda14cbcSMatt Macy goto out;
2843eda14cbcSMatt Macy }
2844eda14cbcSMatt Macy dsl_scan_zil(dp, &os->os_zil_header);
2845eda14cbcSMatt Macy }
2846eda14cbcSMatt Macy
2847eda14cbcSMatt Macy /*
2848eda14cbcSMatt Macy * Iterate over the bps in this ds.
2849eda14cbcSMatt Macy */
2850eda14cbcSMatt Macy dmu_buf_will_dirty(ds->ds_dbuf, tx);
2851eda14cbcSMatt Macy rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
2852eda14cbcSMatt Macy dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
2853eda14cbcSMatt Macy rrw_exit(&ds->ds_bp_rwlock, FTAG);
2854eda14cbcSMatt Macy
2855eda14cbcSMatt Macy char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
2856eda14cbcSMatt Macy dsl_dataset_name(ds, dsname);
2857eda14cbcSMatt Macy zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
2858eda14cbcSMatt Macy "suspending=%u",
2859eda14cbcSMatt Macy (longlong_t)dsobj, dsname,
2860eda14cbcSMatt Macy (longlong_t)scn->scn_phys.scn_cur_min_txg,
2861eda14cbcSMatt Macy (longlong_t)scn->scn_phys.scn_cur_max_txg,
2862eda14cbcSMatt Macy (int)scn->scn_suspending);
2863eda14cbcSMatt Macy kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
2864eda14cbcSMatt Macy
2865eda14cbcSMatt Macy if (scn->scn_suspending)
2866eda14cbcSMatt Macy goto out;
2867eda14cbcSMatt Macy
2868eda14cbcSMatt Macy /*
2869eda14cbcSMatt Macy * We've finished this pass over this dataset.
2870eda14cbcSMatt Macy */
2871eda14cbcSMatt Macy
2872eda14cbcSMatt Macy /*
2873eda14cbcSMatt Macy * If we did not completely visit this dataset, do another pass.
2874eda14cbcSMatt Macy */
2875eda14cbcSMatt Macy if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
287681b22a98SMartin Matuska zfs_dbgmsg("incomplete pass on %s; visiting again",
287781b22a98SMartin Matuska dp->dp_spa->spa_name);
2878eda14cbcSMatt Macy scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
2879eda14cbcSMatt Macy scan_ds_queue_insert(scn, ds->ds_object,
2880eda14cbcSMatt Macy scn->scn_phys.scn_cur_max_txg);
2881eda14cbcSMatt Macy goto out;
2882eda14cbcSMatt Macy }
2883eda14cbcSMatt Macy
2884eda14cbcSMatt Macy /*
2885eda14cbcSMatt Macy * Add descendant datasets to work queue.
2886eda14cbcSMatt Macy */
2887eda14cbcSMatt Macy if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
2888eda14cbcSMatt Macy scan_ds_queue_insert(scn,
2889eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_snap_obj,
2890eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_creation_txg);
2891eda14cbcSMatt Macy }
2892eda14cbcSMatt Macy if (dsl_dataset_phys(ds)->ds_num_children > 1) {
2893eda14cbcSMatt Macy boolean_t usenext = B_FALSE;
2894eda14cbcSMatt Macy if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
2895eda14cbcSMatt Macy uint64_t count;
2896eda14cbcSMatt Macy /*
2897eda14cbcSMatt Macy * A bug in a previous version of the code could
2898eda14cbcSMatt Macy * cause upgrade_clones_cb() to not set
2899eda14cbcSMatt Macy * ds_next_snap_obj when it should, leading to a
2900eda14cbcSMatt Macy * missing entry. Therefore we can only use the
2901eda14cbcSMatt Macy * next_clones_obj when its count is correct.
2902eda14cbcSMatt Macy */
2903eda14cbcSMatt Macy int err = zap_count(dp->dp_meta_objset,
2904eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
2905eda14cbcSMatt Macy if (err == 0 &&
2906eda14cbcSMatt Macy count == dsl_dataset_phys(ds)->ds_num_children - 1)
2907eda14cbcSMatt Macy usenext = B_TRUE;
2908eda14cbcSMatt Macy }
2909eda14cbcSMatt Macy
2910eda14cbcSMatt Macy if (usenext) {
2911eda14cbcSMatt Macy zap_cursor_t zc;
29127a7741afSMartin Matuska zap_attribute_t *za = zap_attribute_alloc();
2913eda14cbcSMatt Macy for (zap_cursor_init(&zc, dp->dp_meta_objset,
2914eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_next_clones_obj);
29157a7741afSMartin Matuska zap_cursor_retrieve(&zc, za) == 0;
2916eda14cbcSMatt Macy (void) zap_cursor_advance(&zc)) {
2917eda14cbcSMatt Macy scan_ds_queue_insert(scn,
29187a7741afSMartin Matuska zfs_strtonum(za->za_name, NULL),
2919eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_creation_txg);
2920eda14cbcSMatt Macy }
2921eda14cbcSMatt Macy zap_cursor_fini(&zc);
29227a7741afSMartin Matuska zap_attribute_free(za);
2923eda14cbcSMatt Macy } else {
2924eda14cbcSMatt Macy VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2925eda14cbcSMatt Macy enqueue_clones_cb, &ds->ds_object,
2926eda14cbcSMatt Macy DS_FIND_CHILDREN));
2927eda14cbcSMatt Macy }
2928eda14cbcSMatt Macy }
2929eda14cbcSMatt Macy
2930eda14cbcSMatt Macy out:
2931eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2932eda14cbcSMatt Macy }
2933eda14cbcSMatt Macy
2934eda14cbcSMatt Macy static int
enqueue_cb(dsl_pool_t * dp,dsl_dataset_t * hds,void * arg)2935eda14cbcSMatt Macy enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
2936eda14cbcSMatt Macy {
2937e92ffd9bSMartin Matuska (void) arg;
2938eda14cbcSMatt Macy dsl_dataset_t *ds;
2939eda14cbcSMatt Macy int err;
2940eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
2941eda14cbcSMatt Macy
2942eda14cbcSMatt Macy err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
2943eda14cbcSMatt Macy if (err)
2944eda14cbcSMatt Macy return (err);
2945eda14cbcSMatt Macy
2946eda14cbcSMatt Macy while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
2947eda14cbcSMatt Macy dsl_dataset_t *prev;
2948eda14cbcSMatt Macy err = dsl_dataset_hold_obj(dp,
2949eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
2950eda14cbcSMatt Macy if (err) {
2951eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2952eda14cbcSMatt Macy return (err);
2953eda14cbcSMatt Macy }
2954eda14cbcSMatt Macy
2955eda14cbcSMatt Macy /*
2956eda14cbcSMatt Macy * If this is a clone, we don't need to worry about it for now.
2957eda14cbcSMatt Macy */
2958eda14cbcSMatt Macy if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
2959eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2960eda14cbcSMatt Macy dsl_dataset_rele(prev, FTAG);
2961eda14cbcSMatt Macy return (0);
2962eda14cbcSMatt Macy }
2963eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2964eda14cbcSMatt Macy ds = prev;
2965eda14cbcSMatt Macy }
2966eda14cbcSMatt Macy
296749086aa3SAlexander Motin mutex_enter(&scn->scn_queue_lock);
2968eda14cbcSMatt Macy scan_ds_queue_insert(scn, ds->ds_object,
2969eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_txg);
297049086aa3SAlexander Motin mutex_exit(&scn->scn_queue_lock);
2971eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
2972eda14cbcSMatt Macy return (0);
2973eda14cbcSMatt Macy }
2974eda14cbcSMatt Macy
2975eda14cbcSMatt Macy void
dsl_scan_ddt_entry(dsl_scan_t * scn,enum zio_checksum checksum,ddt_t * ddt,ddt_lightweight_entry_t * ddlwe,dmu_tx_t * tx)2976eda14cbcSMatt Macy dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
2977e2df9bb4SMartin Matuska ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
2978eda14cbcSMatt Macy {
2979e92ffd9bSMartin Matuska (void) tx;
2980e2df9bb4SMartin Matuska const ddt_key_t *ddk = &ddlwe->ddlwe_key;
2981eda14cbcSMatt Macy blkptr_t bp;
2982eda14cbcSMatt Macy zbookmark_phys_t zb = { 0 };
2983eda14cbcSMatt Macy
2984eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
2985eda14cbcSMatt Macy return;
2986eda14cbcSMatt Macy
2987eda14cbcSMatt Macy /*
2988eda14cbcSMatt Macy * This function is special because it is the only thing
2989eda14cbcSMatt Macy * that can add scan_io_t's to the vdev scan queues from
2990eda14cbcSMatt Macy * outside dsl_scan_sync(). For the most part this is ok
2991eda14cbcSMatt Macy * as long as it is called from within syncing context.
2992eda14cbcSMatt Macy * However, dsl_scan_sync() expects that no new sio's will
2993eda14cbcSMatt Macy * be added between when all the work for a scan is done
2994eda14cbcSMatt Macy * and the next txg when the scan is actually marked as
2995eda14cbcSMatt Macy * completed. This check ensures we do not issue new sio's
2996eda14cbcSMatt Macy * during this period.
2997eda14cbcSMatt Macy */
2998eda14cbcSMatt Macy if (scn->scn_done_txg != 0)
2999eda14cbcSMatt Macy return;
3000eda14cbcSMatt Macy
3001e2df9bb4SMartin Matuska for (int p = 0; p < DDT_NPHYS(ddt); p++) {
3002e2df9bb4SMartin Matuska ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
3003e2df9bb4SMartin Matuska uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
3004e2df9bb4SMartin Matuska
3005e2df9bb4SMartin Matuska if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg)
3006eda14cbcSMatt Macy continue;
3007e2df9bb4SMartin Matuska ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp);
3008eda14cbcSMatt Macy
3009eda14cbcSMatt Macy scn->scn_visited_this_txg++;
3010eda14cbcSMatt Macy scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
3011eda14cbcSMatt Macy }
3012eda14cbcSMatt Macy }
3013eda14cbcSMatt Macy
3014eda14cbcSMatt Macy /*
3015eda14cbcSMatt Macy * Scrub/dedup interaction.
3016eda14cbcSMatt Macy *
3017eda14cbcSMatt Macy * If there are N references to a deduped block, we don't want to scrub it
3018eda14cbcSMatt Macy * N times -- ideally, we should scrub it exactly once.
3019eda14cbcSMatt Macy *
30204fefe1b7SMartin Matuska * We leverage the fact that the dde's replication class (ddt_class_t)
3021eda14cbcSMatt Macy * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
3022eda14cbcSMatt Macy * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
3023eda14cbcSMatt Macy *
3024eda14cbcSMatt Macy * To prevent excess scrubbing, the scrub begins by walking the DDT
3025eda14cbcSMatt Macy * to find all blocks with refcnt > 1, and scrubs each of these once.
3026eda14cbcSMatt Macy * Since there are two replication classes which contain blocks with
3027eda14cbcSMatt Macy * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
3028eda14cbcSMatt Macy * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
3029eda14cbcSMatt Macy *
3030eda14cbcSMatt Macy * There would be nothing more to say if a block's refcnt couldn't change
3031eda14cbcSMatt Macy * during a scrub, but of course it can so we must account for changes
3032eda14cbcSMatt Macy * in a block's replication class.
3033eda14cbcSMatt Macy *
3034eda14cbcSMatt Macy * Here's an example of what can occur:
3035eda14cbcSMatt Macy *
3036eda14cbcSMatt Macy * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
3037eda14cbcSMatt Macy * when visited during the top-down scrub phase, it will be scrubbed twice.
3038eda14cbcSMatt Macy * This negates our scrub optimization, but is otherwise harmless.
3039eda14cbcSMatt Macy *
3040eda14cbcSMatt Macy * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
3041eda14cbcSMatt Macy * on each visit during the top-down scrub phase, it will never be scrubbed.
3042eda14cbcSMatt Macy * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
3043eda14cbcSMatt Macy * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
3044eda14cbcSMatt Macy * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
3045eda14cbcSMatt Macy * while a scrub is in progress, it scrubs the block right then.
3046eda14cbcSMatt Macy */
3047eda14cbcSMatt Macy static void
dsl_scan_ddt(dsl_scan_t * scn,dmu_tx_t * tx)3048eda14cbcSMatt Macy dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
3049eda14cbcSMatt Macy {
3050eda14cbcSMatt Macy ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
3051e2df9bb4SMartin Matuska ddt_lightweight_entry_t ddlwe = {0};
3052eda14cbcSMatt Macy int error;
3053eda14cbcSMatt Macy uint64_t n = 0;
3054eda14cbcSMatt Macy
3055e2df9bb4SMartin Matuska while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) {
3056eda14cbcSMatt Macy ddt_t *ddt;
3057eda14cbcSMatt Macy
3058eda14cbcSMatt Macy if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
3059eda14cbcSMatt Macy break;
3060eda14cbcSMatt Macy dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
3061eda14cbcSMatt Macy (longlong_t)ddb->ddb_class,
3062eda14cbcSMatt Macy (longlong_t)ddb->ddb_type,
3063eda14cbcSMatt Macy (longlong_t)ddb->ddb_checksum,
3064eda14cbcSMatt Macy (longlong_t)ddb->ddb_cursor);
3065eda14cbcSMatt Macy
3066eda14cbcSMatt Macy /* There should be no pending changes to the dedup table */
3067eda14cbcSMatt Macy ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
3068eda14cbcSMatt Macy ASSERT(avl_first(&ddt->ddt_tree) == NULL);
3069eda14cbcSMatt Macy
3070e2df9bb4SMartin Matuska dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
3071eda14cbcSMatt Macy n++;
3072eda14cbcSMatt Macy
3073eda14cbcSMatt Macy if (dsl_scan_check_suspend(scn, NULL))
3074eda14cbcSMatt Macy break;
3075eda14cbcSMatt Macy }
3076eda14cbcSMatt Macy
3077e2df9bb4SMartin Matuska if (error == EAGAIN) {
3078e2df9bb4SMartin Matuska dsl_scan_check_suspend(scn, NULL);
3079e2df9bb4SMartin Matuska error = 0;
3080e2df9bb4SMartin Matuska
3081e2df9bb4SMartin Matuska zfs_dbgmsg("waiting for ddt to become ready for scan "
3082e2df9bb4SMartin Matuska "on %s with class_max = %u; suspending=%u",
3083e2df9bb4SMartin Matuska scn->scn_dp->dp_spa->spa_name,
3084e2df9bb4SMartin Matuska (int)scn->scn_phys.scn_ddt_class_max,
3085e2df9bb4SMartin Matuska (int)scn->scn_suspending);
3086e2df9bb4SMartin Matuska } else
3087e2df9bb4SMartin Matuska zfs_dbgmsg("scanned %llu ddt entries on %s with "
3088e2df9bb4SMartin Matuska "class_max = %u; suspending=%u", (longlong_t)n,
3089e2df9bb4SMartin Matuska scn->scn_dp->dp_spa->spa_name,
3090e2df9bb4SMartin Matuska (int)scn->scn_phys.scn_ddt_class_max,
3091e2df9bb4SMartin Matuska (int)scn->scn_suspending);
3092eda14cbcSMatt Macy
3093eda14cbcSMatt Macy ASSERT(error == 0 || error == ENOENT);
3094eda14cbcSMatt Macy ASSERT(error != ENOENT ||
3095eda14cbcSMatt Macy ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
3096eda14cbcSMatt Macy }
3097eda14cbcSMatt Macy
3098eda14cbcSMatt Macy static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t * ds)3099eda14cbcSMatt Macy dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
3100eda14cbcSMatt Macy {
3101eda14cbcSMatt Macy uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
3102eda14cbcSMatt Macy if (ds->ds_is_snapshot)
3103eda14cbcSMatt Macy return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
3104eda14cbcSMatt Macy return (smt);
3105eda14cbcSMatt Macy }
3106eda14cbcSMatt Macy
3107eda14cbcSMatt Macy static void
dsl_scan_visit(dsl_scan_t * scn,dmu_tx_t * tx)3108eda14cbcSMatt Macy dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
3109eda14cbcSMatt Macy {
3110eda14cbcSMatt Macy scan_ds_t *sds;
3111eda14cbcSMatt Macy dsl_pool_t *dp = scn->scn_dp;
3112eda14cbcSMatt Macy
3113eda14cbcSMatt Macy if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
3114eda14cbcSMatt Macy scn->scn_phys.scn_ddt_class_max) {
3115eda14cbcSMatt Macy scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
3116eda14cbcSMatt Macy scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
3117eda14cbcSMatt Macy dsl_scan_ddt(scn, tx);
3118eda14cbcSMatt Macy if (scn->scn_suspending)
3119eda14cbcSMatt Macy return;
3120eda14cbcSMatt Macy }
3121eda14cbcSMatt Macy
3122eda14cbcSMatt Macy if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
3123eda14cbcSMatt Macy /* First do the MOS & ORIGIN */
3124eda14cbcSMatt Macy
3125eda14cbcSMatt Macy scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
3126eda14cbcSMatt Macy scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
3127eda14cbcSMatt Macy dsl_scan_visit_rootbp(scn, NULL,
3128eda14cbcSMatt Macy &dp->dp_meta_rootbp, tx);
3129eda14cbcSMatt Macy if (scn->scn_suspending)
3130eda14cbcSMatt Macy return;
3131eda14cbcSMatt Macy
3132eda14cbcSMatt Macy if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
3133eda14cbcSMatt Macy VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3134eda14cbcSMatt Macy enqueue_cb, NULL, DS_FIND_CHILDREN));
3135eda14cbcSMatt Macy } else {
3136eda14cbcSMatt Macy dsl_scan_visitds(scn,
3137eda14cbcSMatt Macy dp->dp_origin_snap->ds_object, tx);
3138eda14cbcSMatt Macy }
3139eda14cbcSMatt Macy ASSERT(!scn->scn_suspending);
3140eda14cbcSMatt Macy } else if (scn->scn_phys.scn_bookmark.zb_objset !=
3141eda14cbcSMatt Macy ZB_DESTROYED_OBJSET) {
3142eda14cbcSMatt Macy uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
3143eda14cbcSMatt Macy /*
3144eda14cbcSMatt Macy * If we were suspended, continue from here. Note if the
3145eda14cbcSMatt Macy * ds we were suspended on was deleted, the zb_objset may
3146eda14cbcSMatt Macy * be -1, so we will skip this and find a new objset
3147eda14cbcSMatt Macy * below.
3148eda14cbcSMatt Macy */
3149eda14cbcSMatt Macy dsl_scan_visitds(scn, dsobj, tx);
3150eda14cbcSMatt Macy if (scn->scn_suspending)
3151eda14cbcSMatt Macy return;
3152eda14cbcSMatt Macy }
3153eda14cbcSMatt Macy
3154eda14cbcSMatt Macy /*
3155eda14cbcSMatt Macy * In case we suspended right at the end of the ds, zero the
3156eda14cbcSMatt Macy * bookmark so we don't think that we're still trying to resume.
3157eda14cbcSMatt Macy */
3158da5137abSMartin Matuska memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));
3159eda14cbcSMatt Macy
3160eda14cbcSMatt Macy /*
3161eda14cbcSMatt Macy * Keep pulling things out of the dataset avl queue. Updates to the
3162eda14cbcSMatt Macy * persistent zap-object-as-queue happen only at checkpoints.
3163eda14cbcSMatt Macy */
3164eda14cbcSMatt Macy while ((sds = avl_first(&scn->scn_queue)) != NULL) {
3165eda14cbcSMatt Macy dsl_dataset_t *ds;
3166eda14cbcSMatt Macy uint64_t dsobj = sds->sds_dsobj;
3167eda14cbcSMatt Macy uint64_t txg = sds->sds_txg;
3168eda14cbcSMatt Macy
3169eda14cbcSMatt Macy /* dequeue and free the ds from the queue */
3170eda14cbcSMatt Macy scan_ds_queue_remove(scn, dsobj);
3171eda14cbcSMatt Macy sds = NULL;
3172eda14cbcSMatt Macy
3173eda14cbcSMatt Macy /* set up min / max txg */
3174eda14cbcSMatt Macy VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
3175eda14cbcSMatt Macy if (txg != 0) {
3176eda14cbcSMatt Macy scn->scn_phys.scn_cur_min_txg =
3177eda14cbcSMatt Macy MAX(scn->scn_phys.scn_min_txg, txg);
3178eda14cbcSMatt Macy } else {
3179eda14cbcSMatt Macy scn->scn_phys.scn_cur_min_txg =
3180eda14cbcSMatt Macy MAX(scn->scn_phys.scn_min_txg,
3181eda14cbcSMatt Macy dsl_dataset_phys(ds)->ds_prev_snap_txg);
3182eda14cbcSMatt Macy }
3183eda14cbcSMatt Macy scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
3184eda14cbcSMatt Macy dsl_dataset_rele(ds, FTAG);
3185eda14cbcSMatt Macy
3186eda14cbcSMatt Macy dsl_scan_visitds(scn, dsobj, tx);
3187eda14cbcSMatt Macy if (scn->scn_suspending)
3188eda14cbcSMatt Macy return;
3189eda14cbcSMatt Macy }
3190eda14cbcSMatt Macy
3191eda14cbcSMatt Macy /* No more objsets to fetch, we're done */
3192eda14cbcSMatt Macy scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
3193eda14cbcSMatt Macy ASSERT0(scn->scn_suspending);
3194eda14cbcSMatt Macy }
3195eda14cbcSMatt Macy
3196eda14cbcSMatt Macy static uint64_t
dsl_scan_count_data_disks(spa_t * spa)3197c9539b89SMartin Matuska dsl_scan_count_data_disks(spa_t *spa)
3198eda14cbcSMatt Macy {
3199c9539b89SMartin Matuska vdev_t *rvd = spa->spa_root_vdev;
3200eda14cbcSMatt Macy uint64_t i, leaves = 0;
3201eda14cbcSMatt Macy
320216038816SMartin Matuska for (i = 0; i < rvd->vdev_children; i++) {
320316038816SMartin Matuska vdev_t *vd = rvd->vdev_child[i];
320416038816SMartin Matuska if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
320516038816SMartin Matuska continue;
320616038816SMartin Matuska leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
3207eda14cbcSMatt Macy }
3208eda14cbcSMatt Macy return (leaves);
3209eda14cbcSMatt Macy }
3210eda14cbcSMatt Macy
3211eda14cbcSMatt Macy static void
scan_io_queues_update_zio_stats(dsl_scan_io_queue_t * q,const blkptr_t * bp)3212eda14cbcSMatt Macy scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
3213eda14cbcSMatt Macy {
3214eda14cbcSMatt Macy int i;
3215eda14cbcSMatt Macy uint64_t cur_size = 0;
3216eda14cbcSMatt Macy
3217eda14cbcSMatt Macy for (i = 0; i < BP_GET_NDVAS(bp); i++) {
3218eda14cbcSMatt Macy cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
3219eda14cbcSMatt Macy }
3220eda14cbcSMatt Macy
3221eda14cbcSMatt Macy q->q_total_zio_size_this_txg += cur_size;
3222eda14cbcSMatt Macy q->q_zios_this_txg++;
3223eda14cbcSMatt Macy }
3224eda14cbcSMatt Macy
3225eda14cbcSMatt Macy static void
scan_io_queues_update_seg_stats(dsl_scan_io_queue_t * q,uint64_t start,uint64_t end)3226eda14cbcSMatt Macy scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
3227eda14cbcSMatt Macy uint64_t end)
3228eda14cbcSMatt Macy {
3229eda14cbcSMatt Macy q->q_total_seg_size_this_txg += end - start;
3230eda14cbcSMatt Macy q->q_segs_this_txg++;
3231eda14cbcSMatt Macy }
3232eda14cbcSMatt Macy
3233eda14cbcSMatt Macy static boolean_t
scan_io_queue_check_suspend(dsl_scan_t * scn)3234eda14cbcSMatt Macy scan_io_queue_check_suspend(dsl_scan_t *scn)
3235eda14cbcSMatt Macy {
3236eda14cbcSMatt Macy /* See comment in dsl_scan_check_suspend() */
3237eda14cbcSMatt Macy uint64_t curr_time_ns = gethrtime();
3238eda14cbcSMatt Macy uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
3239eda14cbcSMatt Macy uint64_t sync_time_ns = curr_time_ns -
3240eda14cbcSMatt Macy scn->scn_dp->dp_spa->spa_sync_starttime;
3241a0b956f5SMartin Matuska uint64_t dirty_min_bytes = zfs_dirty_data_max *
3242a0b956f5SMartin Matuska zfs_vdev_async_write_active_min_dirty_percent / 100;
3243be181ee2SMartin Matuska uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
3244eda14cbcSMatt Macy zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
3245eda14cbcSMatt Macy
3246eda14cbcSMatt Macy return ((NSEC2MSEC(scan_time_ns) > mintime &&
3247a0b956f5SMartin Matuska (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
3248eda14cbcSMatt Macy txg_sync_waiting(scn->scn_dp) ||
3249eda14cbcSMatt Macy NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
3250eda14cbcSMatt Macy spa_shutting_down(scn->scn_dp->dp_spa));
3251eda14cbcSMatt Macy }
3252eda14cbcSMatt Macy
3253eda14cbcSMatt Macy /*
3254eda14cbcSMatt Macy * Given a list of scan_io_t's in io_list, this issues the I/Os out to
3255eda14cbcSMatt Macy * disk. This consumes the io_list and frees the scan_io_t's. This is
3256eda14cbcSMatt Macy * called when emptying queues, either when we're up against the memory
3257eda14cbcSMatt Macy * limit or when we have finished scanning. Returns B_TRUE if we stopped
3258eda14cbcSMatt Macy * processing the list before we finished. Any sios that were not issued
3259eda14cbcSMatt Macy * will remain in the io_list.
3260eda14cbcSMatt Macy */
3261eda14cbcSMatt Macy static boolean_t
scan_io_queue_issue(dsl_scan_io_queue_t * queue,list_t * io_list)3262eda14cbcSMatt Macy scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
3263eda14cbcSMatt Macy {
3264eda14cbcSMatt Macy dsl_scan_t *scn = queue->q_scn;
3265eda14cbcSMatt Macy scan_io_t *sio;
3266eda14cbcSMatt Macy boolean_t suspended = B_FALSE;
3267eda14cbcSMatt Macy
3268eda14cbcSMatt Macy while ((sio = list_head(io_list)) != NULL) {
3269eda14cbcSMatt Macy blkptr_t bp;
3270eda14cbcSMatt Macy
3271eda14cbcSMatt Macy if (scan_io_queue_check_suspend(scn)) {
3272eda14cbcSMatt Macy suspended = B_TRUE;
3273eda14cbcSMatt Macy break;
3274eda14cbcSMatt Macy }
3275eda14cbcSMatt Macy
3276eda14cbcSMatt Macy sio2bp(sio, &bp);
3277eda14cbcSMatt Macy scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
3278eda14cbcSMatt Macy &sio->sio_zb, queue);
3279eda14cbcSMatt Macy (void) list_remove_head(io_list);
3280eda14cbcSMatt Macy scan_io_queues_update_zio_stats(queue, &bp);
3281eda14cbcSMatt Macy sio_free(sio);
3282eda14cbcSMatt Macy }
3283eda14cbcSMatt Macy return (suspended);
3284eda14cbcSMatt Macy }
3285eda14cbcSMatt Macy
3286eda14cbcSMatt Macy /*
3287eda14cbcSMatt Macy * This function removes sios from an IO queue which reside within a given
3288b59a0cdeSMartin Matuska * zfs_range_seg_t and inserts them (in offset order) into a list. Note that
3289eda14cbcSMatt Macy * we only ever return a maximum of 32 sios at once. If there are more sios
3290eda14cbcSMatt Macy * to process within this segment that did not make it onto the list we
3291eda14cbcSMatt Macy * return B_TRUE and otherwise B_FALSE.
3292eda14cbcSMatt Macy */
3293eda14cbcSMatt Macy static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t * queue,zfs_range_seg_t * rs,list_t * list)3294b59a0cdeSMartin Matuska scan_io_queue_gather(dsl_scan_io_queue_t *queue, zfs_range_seg_t *rs,
3295b59a0cdeSMartin Matuska list_t *list)
3296eda14cbcSMatt Macy {
3297eda14cbcSMatt Macy scan_io_t *srch_sio, *sio, *next_sio;
3298eda14cbcSMatt Macy avl_index_t idx;
3299eda14cbcSMatt Macy uint_t num_sios = 0;
3300eda14cbcSMatt Macy int64_t bytes_issued = 0;
3301eda14cbcSMatt Macy
3302eda14cbcSMatt Macy ASSERT(rs != NULL);
3303eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
3304eda14cbcSMatt Macy
3305eda14cbcSMatt Macy srch_sio = sio_alloc(1);
3306eda14cbcSMatt Macy srch_sio->sio_nr_dvas = 1;
3307b59a0cdeSMartin Matuska SIO_SET_OFFSET(srch_sio, zfs_rs_get_start(rs, queue->q_exts_by_addr));
3308eda14cbcSMatt Macy
3309eda14cbcSMatt Macy /*
3310eda14cbcSMatt Macy * The exact start of the extent might not contain any matching zios,
3311eda14cbcSMatt Macy * so if that's the case, examine the next one in the tree.
3312eda14cbcSMatt Macy */
3313eda14cbcSMatt Macy sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
3314eda14cbcSMatt Macy sio_free(srch_sio);
3315eda14cbcSMatt Macy
3316eda14cbcSMatt Macy if (sio == NULL)
3317eda14cbcSMatt Macy sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
3318eda14cbcSMatt Macy
3319b59a0cdeSMartin Matuska while (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs,
3320eda14cbcSMatt Macy queue->q_exts_by_addr) && num_sios <= 32) {
3321b59a0cdeSMartin Matuska ASSERT3U(SIO_GET_OFFSET(sio), >=, zfs_rs_get_start(rs,
3322eda14cbcSMatt Macy queue->q_exts_by_addr));
3323b59a0cdeSMartin Matuska ASSERT3U(SIO_GET_END_OFFSET(sio), <=, zfs_rs_get_end(rs,
3324eda14cbcSMatt Macy queue->q_exts_by_addr));
3325eda14cbcSMatt Macy
3326eda14cbcSMatt Macy next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
3327eda14cbcSMatt Macy avl_remove(&queue->q_sios_by_addr, sio);
3328a0b956f5SMartin Matuska if (avl_is_empty(&queue->q_sios_by_addr))
3329a0b956f5SMartin Matuska atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
3330eda14cbcSMatt Macy queue->q_sio_memused -= SIO_GET_MUSED(sio);
3331eda14cbcSMatt Macy
3332eda14cbcSMatt Macy bytes_issued += SIO_GET_ASIZE(sio);
3333eda14cbcSMatt Macy num_sios++;
3334eda14cbcSMatt Macy list_insert_tail(list, sio);
3335eda14cbcSMatt Macy sio = next_sio;
3336eda14cbcSMatt Macy }
3337eda14cbcSMatt Macy
3338eda14cbcSMatt Macy /*
3339eda14cbcSMatt Macy * We limit the number of sios we process at once to 32 to avoid
3340eda14cbcSMatt Macy * biting off more than we can chew. If we didn't take everything
3341eda14cbcSMatt Macy * in the segment we update it to reflect the work we were able to
3342eda14cbcSMatt Macy * complete. Otherwise, we remove it from the range tree entirely.
3343eda14cbcSMatt Macy */
3344b59a0cdeSMartin Matuska if (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs,
3345eda14cbcSMatt Macy queue->q_exts_by_addr)) {
3346b59a0cdeSMartin Matuska zfs_range_tree_adjust_fill(queue->q_exts_by_addr, rs,
3347eda14cbcSMatt Macy -bytes_issued);
3348b59a0cdeSMartin Matuska zfs_range_tree_resize_segment(queue->q_exts_by_addr, rs,
3349b59a0cdeSMartin Matuska SIO_GET_OFFSET(sio), zfs_rs_get_end(rs,
3350eda14cbcSMatt Macy queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
3351a0b956f5SMartin Matuska queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
3352eda14cbcSMatt Macy return (B_TRUE);
3353eda14cbcSMatt Macy } else {
3354b59a0cdeSMartin Matuska uint64_t rstart = zfs_rs_get_start(rs, queue->q_exts_by_addr);
3355b59a0cdeSMartin Matuska uint64_t rend = zfs_rs_get_end(rs, queue->q_exts_by_addr);
3356b59a0cdeSMartin Matuska zfs_range_tree_remove(queue->q_exts_by_addr, rstart, rend -
3357b59a0cdeSMartin Matuska rstart);
3358a0b956f5SMartin Matuska queue->q_last_ext_addr = -1;
3359eda14cbcSMatt Macy return (B_FALSE);
3360eda14cbcSMatt Macy }
3361eda14cbcSMatt Macy }
3362eda14cbcSMatt Macy
3363eda14cbcSMatt Macy /*
3364eda14cbcSMatt Macy * This is called from the queue emptying thread and selects the next
3365eda14cbcSMatt Macy * extent from which we are to issue I/Os. The behavior of this function
3366eda14cbcSMatt Macy * depends on the state of the scan, the current memory consumption and
3367eda14cbcSMatt Macy * whether or not we are performing a scan shutdown.
3368eda14cbcSMatt Macy * 1) We select extents in an elevator algorithm (LBA-order) if the scan
3369eda14cbcSMatt Macy * needs to perform a checkpoint
3370eda14cbcSMatt Macy * 2) We select the largest available extent if we are up against the
3371eda14cbcSMatt Macy * memory limit.
3372eda14cbcSMatt Macy * 3) Otherwise we don't select any extents.
3373eda14cbcSMatt Macy */
3374b59a0cdeSMartin Matuska static zfs_range_seg_t *
scan_io_queue_fetch_ext(dsl_scan_io_queue_t * queue)3375eda14cbcSMatt Macy scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
3376eda14cbcSMatt Macy {
3377eda14cbcSMatt Macy dsl_scan_t *scn = queue->q_scn;
3378b59a0cdeSMartin Matuska zfs_range_tree_t *rt = queue->q_exts_by_addr;
3379eda14cbcSMatt Macy
3380eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
3381eda14cbcSMatt Macy ASSERT(scn->scn_is_sorted);
3382eda14cbcSMatt Macy
3383a0b956f5SMartin Matuska if (!scn->scn_checkpointing && !scn->scn_clearing)
3384eda14cbcSMatt Macy return (NULL);
3385eda14cbcSMatt Macy
3386eda14cbcSMatt Macy /*
3387eda14cbcSMatt Macy * During normal clearing, we want to issue our largest segments
3388eda14cbcSMatt Macy * first, keeping IO as sequential as possible, and leaving the
3389eda14cbcSMatt Macy * smaller extents for later with the hope that they might eventually
3390eda14cbcSMatt Macy * grow to larger sequential segments. However, when the scan is
3391eda14cbcSMatt Macy * checkpointing, no new extents will be added to the sorting queue,
3392eda14cbcSMatt Macy * so the way we are sorted now is as good as it will ever get.
3393eda14cbcSMatt Macy * In this case, we instead switch to issuing extents in LBA order.
3394eda14cbcSMatt Macy */
3395a0b956f5SMartin Matuska if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
3396a0b956f5SMartin Matuska zfs_scan_issue_strategy == 1)
3397b59a0cdeSMartin Matuska return (zfs_range_tree_first(rt));
3398a0b956f5SMartin Matuska
3399eda14cbcSMatt Macy /*
3400a0b956f5SMartin Matuska * Try to continue previous extent if it is not completed yet. After
3401a0b956f5SMartin Matuska * shrink in scan_io_queue_gather() it may no longer be the best, but
3402a0b956f5SMartin Matuska * otherwise we leave shorter remnant every txg.
3403eda14cbcSMatt Macy */
3404a0b956f5SMartin Matuska uint64_t start;
3405be181ee2SMartin Matuska uint64_t size = 1ULL << rt->rt_shift;
3406b59a0cdeSMartin Matuska zfs_range_seg_t *addr_rs;
3407a0b956f5SMartin Matuska if (queue->q_last_ext_addr != -1) {
3408a0b956f5SMartin Matuska start = queue->q_last_ext_addr;
3409b59a0cdeSMartin Matuska addr_rs = zfs_range_tree_find(rt, start, size);
3410a0b956f5SMartin Matuska if (addr_rs != NULL)
3411eda14cbcSMatt Macy return (addr_rs);
3412eda14cbcSMatt Macy }
3413a0b956f5SMartin Matuska
3414a0b956f5SMartin Matuska /*
3415a0b956f5SMartin Matuska * Nothing to continue, so find new best extent.
3416a0b956f5SMartin Matuska */
3417a0b956f5SMartin Matuska uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL);
3418a0b956f5SMartin Matuska if (v == NULL)
3419a0b956f5SMartin Matuska return (NULL);
3420a0b956f5SMartin Matuska queue->q_last_ext_addr = start = *v << rt->rt_shift;
3421a0b956f5SMartin Matuska
3422a0b956f5SMartin Matuska /*
3423a0b956f5SMartin Matuska * We need to get the original entry in the by_addr tree so we can
3424a0b956f5SMartin Matuska * modify it.
3425a0b956f5SMartin Matuska */
3426b59a0cdeSMartin Matuska addr_rs = zfs_range_tree_find(rt, start, size);
3427a0b956f5SMartin Matuska ASSERT3P(addr_rs, !=, NULL);
3428b59a0cdeSMartin Matuska ASSERT3U(zfs_rs_get_start(addr_rs, rt), ==, start);
3429b59a0cdeSMartin Matuska ASSERT3U(zfs_rs_get_end(addr_rs, rt), >, start);
3430a0b956f5SMartin Matuska return (addr_rs);
3431eda14cbcSMatt Macy }
3432eda14cbcSMatt Macy
3433eda14cbcSMatt Macy static void
scan_io_queues_run_one(void * arg)3434eda14cbcSMatt Macy scan_io_queues_run_one(void *arg)
3435eda14cbcSMatt Macy {
3436eda14cbcSMatt Macy dsl_scan_io_queue_t *queue = arg;
3437eda14cbcSMatt Macy kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
3438eda14cbcSMatt Macy boolean_t suspended = B_FALSE;
3439b59a0cdeSMartin Matuska zfs_range_seg_t *rs;
34401f1e2261SMartin Matuska scan_io_t *sio;
34411f1e2261SMartin Matuska zio_t *zio;
3442eda14cbcSMatt Macy list_t sio_list;
3443eda14cbcSMatt Macy
3444eda14cbcSMatt Macy ASSERT(queue->q_scn->scn_is_sorted);
3445eda14cbcSMatt Macy
3446eda14cbcSMatt Macy list_create(&sio_list, sizeof (scan_io_t),
3447eda14cbcSMatt Macy offsetof(scan_io_t, sio_nodes.sio_list_node));
34481f1e2261SMartin Matuska zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
34491f1e2261SMartin Matuska NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
3450eda14cbcSMatt Macy mutex_enter(q_lock);
34511f1e2261SMartin Matuska queue->q_zio = zio;
3452eda14cbcSMatt Macy
345316038816SMartin Matuska /* Calculate maximum in-flight bytes for this vdev. */
345416038816SMartin Matuska queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
345516038816SMartin Matuska (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
3456eda14cbcSMatt Macy
3457eda14cbcSMatt Macy /* reset per-queue scan statistics for this txg */
3458eda14cbcSMatt Macy queue->q_total_seg_size_this_txg = 0;
3459eda14cbcSMatt Macy queue->q_segs_this_txg = 0;
3460eda14cbcSMatt Macy queue->q_total_zio_size_this_txg = 0;
3461eda14cbcSMatt Macy queue->q_zios_this_txg = 0;
3462eda14cbcSMatt Macy
3463eda14cbcSMatt Macy /* loop until we run out of time or sios */
3464eda14cbcSMatt Macy while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
3465eda14cbcSMatt Macy uint64_t seg_start = 0, seg_end = 0;
3466a0b956f5SMartin Matuska boolean_t more_left;
3467eda14cbcSMatt Macy
3468eda14cbcSMatt Macy ASSERT(list_is_empty(&sio_list));
3469eda14cbcSMatt Macy
3470eda14cbcSMatt Macy /* loop while we still have sios left to process in this rs */
3471a0b956f5SMartin Matuska do {
3472eda14cbcSMatt Macy scan_io_t *first_sio, *last_sio;
3473eda14cbcSMatt Macy
3474eda14cbcSMatt Macy /*
3475eda14cbcSMatt Macy * We have selected which extent needs to be
3476eda14cbcSMatt Macy * processed next. Gather up the corresponding sios.
3477eda14cbcSMatt Macy */
3478eda14cbcSMatt Macy more_left = scan_io_queue_gather(queue, rs, &sio_list);
3479eda14cbcSMatt Macy ASSERT(!list_is_empty(&sio_list));
3480eda14cbcSMatt Macy first_sio = list_head(&sio_list);
3481eda14cbcSMatt Macy last_sio = list_tail(&sio_list);
3482eda14cbcSMatt Macy
3483eda14cbcSMatt Macy seg_end = SIO_GET_END_OFFSET(last_sio);
3484eda14cbcSMatt Macy if (seg_start == 0)
3485eda14cbcSMatt Macy seg_start = SIO_GET_OFFSET(first_sio);
3486eda14cbcSMatt Macy
3487eda14cbcSMatt Macy /*
3488eda14cbcSMatt Macy * Issuing sios can take a long time so drop the
3489eda14cbcSMatt Macy * queue lock. The sio queue won't be updated by
3490eda14cbcSMatt Macy * other threads since we're in syncing context so
3491eda14cbcSMatt Macy * we can be sure that our trees will remain exactly
3492eda14cbcSMatt Macy * as we left them.
3493eda14cbcSMatt Macy */
3494eda14cbcSMatt Macy mutex_exit(q_lock);
3495eda14cbcSMatt Macy suspended = scan_io_queue_issue(queue, &sio_list);
3496eda14cbcSMatt Macy mutex_enter(q_lock);
3497eda14cbcSMatt Macy
3498eda14cbcSMatt Macy if (suspended)
3499eda14cbcSMatt Macy break;
3500a0b956f5SMartin Matuska } while (more_left);
3501eda14cbcSMatt Macy
3502eda14cbcSMatt Macy /* update statistics for debugging purposes */
3503eda14cbcSMatt Macy scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
3504eda14cbcSMatt Macy
3505eda14cbcSMatt Macy if (suspended)
3506eda14cbcSMatt Macy break;
3507eda14cbcSMatt Macy }
3508eda14cbcSMatt Macy
3509eda14cbcSMatt Macy /*
3510eda14cbcSMatt Macy * If we were suspended in the middle of processing,
3511eda14cbcSMatt Macy * requeue any unfinished sios and exit.
3512eda14cbcSMatt Macy */
35134e8d558cSMartin Matuska while ((sio = list_remove_head(&sio_list)) != NULL)
3514eda14cbcSMatt Macy scan_io_queue_insert_impl(queue, sio);
3515eda14cbcSMatt Macy
35161f1e2261SMartin Matuska queue->q_zio = NULL;
3517eda14cbcSMatt Macy mutex_exit(q_lock);
35181f1e2261SMartin Matuska zio_nowait(zio);
3519eda14cbcSMatt Macy list_destroy(&sio_list);
3520eda14cbcSMatt Macy }
3521eda14cbcSMatt Macy
3522eda14cbcSMatt Macy /*
3523eda14cbcSMatt Macy * Performs an emptying run on all scan queues in the pool. This just
3524eda14cbcSMatt Macy * punches out one thread per top-level vdev, each of which processes
3525eda14cbcSMatt Macy * only that vdev's scan queue. We can parallelize the I/O here because
3526eda14cbcSMatt Macy * we know that each queue's I/Os only affect its own top-level vdev.
3527eda14cbcSMatt Macy *
3528eda14cbcSMatt Macy * This function waits for the queue runs to complete, and must be
3529eda14cbcSMatt Macy * called from dsl_scan_sync (or in general, syncing context).
3530eda14cbcSMatt Macy */
3531eda14cbcSMatt Macy static void
scan_io_queues_run(dsl_scan_t * scn)3532eda14cbcSMatt Macy scan_io_queues_run(dsl_scan_t *scn)
3533eda14cbcSMatt Macy {
3534eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
3535eda14cbcSMatt Macy
3536eda14cbcSMatt Macy ASSERT(scn->scn_is_sorted);
3537eda14cbcSMatt Macy ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3538eda14cbcSMatt Macy
3539a0b956f5SMartin Matuska if (scn->scn_queues_pending == 0)
3540eda14cbcSMatt Macy return;
3541eda14cbcSMatt Macy
3542eda14cbcSMatt Macy if (scn->scn_taskq == NULL) {
3543eda14cbcSMatt Macy int nthreads = spa->spa_root_vdev->vdev_children;
3544eda14cbcSMatt Macy
3545eda14cbcSMatt Macy /*
3546eda14cbcSMatt Macy * We need to make this taskq *always* execute as many
3547eda14cbcSMatt Macy * threads in parallel as we have top-level vdevs and no
3548eda14cbcSMatt Macy * less, otherwise strange serialization of the calls to
3549eda14cbcSMatt Macy * scan_io_queues_run_one can occur during spa_sync runs
3550eda14cbcSMatt Macy * and that significantly impacts performance.
3551eda14cbcSMatt Macy */
3552eda14cbcSMatt Macy scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
3553eda14cbcSMatt Macy minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
3554eda14cbcSMatt Macy }
3555eda14cbcSMatt Macy
3556eda14cbcSMatt Macy for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
3557eda14cbcSMatt Macy vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
3558eda14cbcSMatt Macy
3559eda14cbcSMatt Macy mutex_enter(&vd->vdev_scan_io_queue_lock);
3560eda14cbcSMatt Macy if (vd->vdev_scan_io_queue != NULL) {
3561eda14cbcSMatt Macy VERIFY(taskq_dispatch(scn->scn_taskq,
3562eda14cbcSMatt Macy scan_io_queues_run_one, vd->vdev_scan_io_queue,
3563eda14cbcSMatt Macy TQ_SLEEP) != TASKQID_INVALID);
3564eda14cbcSMatt Macy }
3565eda14cbcSMatt Macy mutex_exit(&vd->vdev_scan_io_queue_lock);
3566eda14cbcSMatt Macy }
3567eda14cbcSMatt Macy
3568eda14cbcSMatt Macy /*
3569eda14cbcSMatt Macy * Wait for the queues to finish issuing their IOs for this run
3570eda14cbcSMatt Macy * before we return. There may still be IOs in flight at this
3571eda14cbcSMatt Macy * point.
3572eda14cbcSMatt Macy */
3573eda14cbcSMatt Macy taskq_wait(scn->scn_taskq);
3574eda14cbcSMatt Macy }
3575eda14cbcSMatt Macy
3576eda14cbcSMatt Macy static boolean_t
dsl_scan_async_block_should_pause(dsl_scan_t * scn)3577eda14cbcSMatt Macy dsl_scan_async_block_should_pause(dsl_scan_t *scn)
3578eda14cbcSMatt Macy {
3579eda14cbcSMatt Macy uint64_t elapsed_nanosecs;
3580eda14cbcSMatt Macy
3581eda14cbcSMatt Macy if (zfs_recover)
3582eda14cbcSMatt Macy return (B_FALSE);
3583eda14cbcSMatt Macy
3584eda14cbcSMatt Macy if (zfs_async_block_max_blocks != 0 &&
3585eda14cbcSMatt Macy scn->scn_visited_this_txg >= zfs_async_block_max_blocks) {
3586eda14cbcSMatt Macy return (B_TRUE);
3587eda14cbcSMatt Macy }
3588eda14cbcSMatt Macy
3589eda14cbcSMatt Macy if (zfs_max_async_dedup_frees != 0 &&
3590eda14cbcSMatt Macy scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
3591eda14cbcSMatt Macy return (B_TRUE);
3592eda14cbcSMatt Macy }
3593eda14cbcSMatt Macy
3594eda14cbcSMatt Macy elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
3595eda14cbcSMatt Macy return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
3596eda14cbcSMatt Macy (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
3597eda14cbcSMatt Macy txg_sync_waiting(scn->scn_dp)) ||
3598eda14cbcSMatt Macy spa_shutting_down(scn->scn_dp->dp_spa));
3599eda14cbcSMatt Macy }
3600eda14cbcSMatt Macy
3601eda14cbcSMatt Macy static int
dsl_scan_free_block_cb(void * arg,const blkptr_t * bp,dmu_tx_t * tx)3602eda14cbcSMatt Macy dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3603eda14cbcSMatt Macy {
3604eda14cbcSMatt Macy dsl_scan_t *scn = arg;
3605eda14cbcSMatt Macy
3606eda14cbcSMatt Macy if (!scn->scn_is_bptree ||
3607eda14cbcSMatt Macy (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
3608eda14cbcSMatt Macy if (dsl_scan_async_block_should_pause(scn))
3609eda14cbcSMatt Macy return (SET_ERROR(ERESTART));
3610eda14cbcSMatt Macy }
3611eda14cbcSMatt Macy
3612eda14cbcSMatt Macy zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
3613eda14cbcSMatt Macy dmu_tx_get_txg(tx), bp, 0));
3614eda14cbcSMatt Macy dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
3615eda14cbcSMatt Macy -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
3616eda14cbcSMatt Macy -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
3617eda14cbcSMatt Macy scn->scn_visited_this_txg++;
3618eda14cbcSMatt Macy if (BP_GET_DEDUP(bp))
3619eda14cbcSMatt Macy scn->scn_dedup_frees_this_txg++;
3620eda14cbcSMatt Macy return (0);
3621eda14cbcSMatt Macy }
3622eda14cbcSMatt Macy
3623eda14cbcSMatt Macy static void
dsl_scan_update_stats(dsl_scan_t * scn)3624eda14cbcSMatt Macy dsl_scan_update_stats(dsl_scan_t *scn)
3625eda14cbcSMatt Macy {
3626eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
3627eda14cbcSMatt Macy uint64_t i;
3628eda14cbcSMatt Macy uint64_t seg_size_total = 0, zio_size_total = 0;
3629eda14cbcSMatt Macy uint64_t seg_count_total = 0, zio_count_total = 0;
3630eda14cbcSMatt Macy
3631eda14cbcSMatt Macy for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
3632eda14cbcSMatt Macy vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
3633eda14cbcSMatt Macy dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
3634eda14cbcSMatt Macy
3635eda14cbcSMatt Macy if (queue == NULL)
3636eda14cbcSMatt Macy continue;
3637eda14cbcSMatt Macy
3638eda14cbcSMatt Macy seg_size_total += queue->q_total_seg_size_this_txg;
3639eda14cbcSMatt Macy zio_size_total += queue->q_total_zio_size_this_txg;
3640eda14cbcSMatt Macy seg_count_total += queue->q_segs_this_txg;
3641eda14cbcSMatt Macy zio_count_total += queue->q_zios_this_txg;
3642eda14cbcSMatt Macy }
3643eda14cbcSMatt Macy
3644eda14cbcSMatt Macy if (seg_count_total == 0 || zio_count_total == 0) {
3645eda14cbcSMatt Macy scn->scn_avg_seg_size_this_txg = 0;
3646eda14cbcSMatt Macy scn->scn_avg_zio_size_this_txg = 0;
3647eda14cbcSMatt Macy scn->scn_segs_this_txg = 0;
3648eda14cbcSMatt Macy scn->scn_zios_this_txg = 0;
3649eda14cbcSMatt Macy return;
3650eda14cbcSMatt Macy }
3651eda14cbcSMatt Macy
3652eda14cbcSMatt Macy scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
3653eda14cbcSMatt Macy scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
3654eda14cbcSMatt Macy scn->scn_segs_this_txg = seg_count_total;
3655eda14cbcSMatt Macy scn->scn_zios_this_txg = zio_count_total;
3656eda14cbcSMatt Macy }
3657eda14cbcSMatt Macy
3658eda14cbcSMatt Macy static int
bpobj_dsl_scan_free_block_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)3659eda14cbcSMatt Macy bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
3660eda14cbcSMatt Macy dmu_tx_t *tx)
3661eda14cbcSMatt Macy {
3662eda14cbcSMatt Macy ASSERT(!bp_freed);
3663eda14cbcSMatt Macy return (dsl_scan_free_block_cb(arg, bp, tx));
3664eda14cbcSMatt Macy }
3665eda14cbcSMatt Macy
3666eda14cbcSMatt Macy static int
dsl_scan_obsolete_block_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)3667eda14cbcSMatt Macy dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
3668eda14cbcSMatt Macy dmu_tx_t *tx)
3669eda14cbcSMatt Macy {
3670eda14cbcSMatt Macy ASSERT(!bp_freed);
3671eda14cbcSMatt Macy dsl_scan_t *scn = arg;
3672eda14cbcSMatt Macy const dva_t *dva = &bp->blk_dva[0];
3673eda14cbcSMatt Macy
3674eda14cbcSMatt Macy if (dsl_scan_async_block_should_pause(scn))
3675eda14cbcSMatt Macy return (SET_ERROR(ERESTART));
3676eda14cbcSMatt Macy
3677eda14cbcSMatt Macy spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
3678eda14cbcSMatt Macy DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
3679eda14cbcSMatt Macy DVA_GET_ASIZE(dva), tx);
3680eda14cbcSMatt Macy scn->scn_visited_this_txg++;
3681eda14cbcSMatt Macy return (0);
3682eda14cbcSMatt Macy }
3683eda14cbcSMatt Macy
3684eda14cbcSMatt Macy boolean_t
dsl_scan_active(dsl_scan_t * scn)3685eda14cbcSMatt Macy dsl_scan_active(dsl_scan_t *scn)
3686eda14cbcSMatt Macy {
3687eda14cbcSMatt Macy spa_t *spa = scn->scn_dp->dp_spa;
3688eda14cbcSMatt Macy uint64_t used = 0, comp, uncomp;
3689eda14cbcSMatt Macy boolean_t clones_left;
3690eda14cbcSMatt Macy
3691eda14cbcSMatt Macy if (spa->spa_load_state != SPA_LOAD_NONE)
3692eda14cbcSMatt Macy return (B_FALSE);
3693eda14cbcSMatt Macy if (spa_shutting_down(spa))
3694eda14cbcSMatt Macy return (B_FALSE);
3695eda14cbcSMatt Macy if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
3696eda14cbcSMatt Macy (scn->scn_async_destroying && !scn->scn_async_stalled))
3697eda14cbcSMatt Macy return (B_TRUE);
3698eda14cbcSMatt Macy
3699eda14cbcSMatt Macy if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
3700eda14cbcSMatt Macy (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
3701eda14cbcSMatt Macy &used, &comp, &uncomp);
3702eda14cbcSMatt Macy }
3703eda14cbcSMatt Macy clones_left = spa_livelist_delete_check(spa);
3704eda14cbcSMatt Macy return ((used != 0) || (clones_left));
3705eda14cbcSMatt Macy }
3706eda14cbcSMatt Macy
3707c0a83fe0SMartin Matuska boolean_t
dsl_errorscrub_active(dsl_scan_t * scn)3708c0a83fe0SMartin Matuska dsl_errorscrub_active(dsl_scan_t *scn)
3709c0a83fe0SMartin Matuska {
3710c0a83fe0SMartin Matuska spa_t *spa = scn->scn_dp->dp_spa;
3711c0a83fe0SMartin Matuska if (spa->spa_load_state != SPA_LOAD_NONE)
3712c0a83fe0SMartin Matuska return (B_FALSE);
3713c0a83fe0SMartin Matuska if (spa_shutting_down(spa))
3714c0a83fe0SMartin Matuska return (B_FALSE);
3715c0a83fe0SMartin Matuska if (dsl_errorscrubbing(scn->scn_dp))
3716c0a83fe0SMartin Matuska return (B_TRUE);
3717c0a83fe0SMartin Matuska return (B_FALSE);
3718c0a83fe0SMartin Matuska }
3719c0a83fe0SMartin Matuska
3720eda14cbcSMatt Macy static boolean_t
dsl_scan_check_deferred(vdev_t * vd)3721eda14cbcSMatt Macy dsl_scan_check_deferred(vdev_t *vd)
3722eda14cbcSMatt Macy {
3723eda14cbcSMatt Macy boolean_t need_resilver = B_FALSE;
3724eda14cbcSMatt Macy
3725eda14cbcSMatt Macy for (int c = 0; c < vd->vdev_children; c++) {
3726eda14cbcSMatt Macy need_resilver |=
3727eda14cbcSMatt Macy dsl_scan_check_deferred(vd->vdev_child[c]);
3728eda14cbcSMatt Macy }
3729eda14cbcSMatt Macy
3730eda14cbcSMatt Macy if (!vdev_is_concrete(vd) || vd->vdev_aux ||
3731eda14cbcSMatt Macy !vd->vdev_ops->vdev_op_leaf)
3732eda14cbcSMatt Macy return (need_resilver);
3733eda14cbcSMatt Macy
3734eda14cbcSMatt Macy if (!vd->vdev_resilver_deferred)
3735eda14cbcSMatt Macy need_resilver = B_TRUE;
3736eda14cbcSMatt Macy
3737eda14cbcSMatt Macy return (need_resilver);
3738eda14cbcSMatt Macy }
3739eda14cbcSMatt Macy
3740eda14cbcSMatt Macy static boolean_t
dsl_scan_need_resilver(spa_t * spa,const dva_t * dva,size_t psize,uint64_t phys_birth)3741eda14cbcSMatt Macy dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
3742eda14cbcSMatt Macy uint64_t phys_birth)
3743eda14cbcSMatt Macy {
3744eda14cbcSMatt Macy vdev_t *vd;
3745eda14cbcSMatt Macy
3746eda14cbcSMatt Macy vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
3747eda14cbcSMatt Macy
3748eda14cbcSMatt Macy if (vd->vdev_ops == &vdev_indirect_ops) {
3749eda14cbcSMatt Macy /*
3750eda14cbcSMatt Macy * The indirect vdev can point to multiple
3751eda14cbcSMatt Macy * vdevs. For simplicity, always create
3752eda14cbcSMatt Macy * the resilver zio_t. zio_vdev_io_start()
3753eda14cbcSMatt Macy * will bypass the child resilver i/o's if
3754eda14cbcSMatt Macy * they are on vdevs that don't have DTL's.
3755eda14cbcSMatt Macy */
3756eda14cbcSMatt Macy return (B_TRUE);
3757eda14cbcSMatt Macy }
3758eda14cbcSMatt Macy
3759eda14cbcSMatt Macy if (DVA_GET_GANG(dva)) {
3760eda14cbcSMatt Macy /*
3761eda14cbcSMatt Macy * Gang members may be spread across multiple
3762eda14cbcSMatt Macy * vdevs, so the best estimate we have is the
3763eda14cbcSMatt Macy * scrub range, which has already been checked.
3764eda14cbcSMatt Macy * XXX -- it would be better to change our
3765eda14cbcSMatt Macy * allocation policy to ensure that all
3766eda14cbcSMatt Macy * gang members reside on the same vdev.
3767eda14cbcSMatt Macy */
3768eda14cbcSMatt Macy return (B_TRUE);
3769eda14cbcSMatt Macy }
3770eda14cbcSMatt Macy
3771eda14cbcSMatt Macy /*
3772eda14cbcSMatt Macy * Check if the top-level vdev must resilver this offset.
3773eda14cbcSMatt Macy * When the offset does not intersect with a dirty leaf DTL
3774eda14cbcSMatt Macy * then it may be possible to skip the resilver IO. The psize
3775eda14cbcSMatt Macy * is provided instead of asize to simplify the check for RAIDZ.
3776eda14cbcSMatt Macy */
37777877fdebSMatt Macy if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
3778eda14cbcSMatt Macy return (B_FALSE);
3779eda14cbcSMatt Macy
3780eda14cbcSMatt Macy /*
3781eda14cbcSMatt Macy * Check that this top-level vdev has a device under it which
3782eda14cbcSMatt Macy * is resilvering and is not deferred.
3783eda14cbcSMatt Macy */
3784eda14cbcSMatt Macy if (!dsl_scan_check_deferred(vd))
3785eda14cbcSMatt Macy return (B_FALSE);
3786eda14cbcSMatt Macy
3787eda14cbcSMatt Macy return (B_TRUE);
3788eda14cbcSMatt Macy }
3789eda14cbcSMatt Macy
3790eda14cbcSMatt Macy static int
dsl_process_async_destroys(dsl_pool_t * dp,dmu_tx_t * tx)3791eda14cbcSMatt Macy dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
3792eda14cbcSMatt Macy {
3793eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
3794eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
3795eda14cbcSMatt Macy int err = 0;
3796eda14cbcSMatt Macy
3797eda14cbcSMatt Macy if (spa_suspend_async_destroy(spa))
3798eda14cbcSMatt Macy return (0);
3799eda14cbcSMatt Macy
3800eda14cbcSMatt Macy if (zfs_free_bpobj_enabled &&
3801eda14cbcSMatt Macy spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3802eda14cbcSMatt Macy scn->scn_is_bptree = B_FALSE;
3803eda14cbcSMatt Macy scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
3804eda14cbcSMatt Macy scn->scn_zio_root = zio_root(spa, NULL,
3805eda14cbcSMatt Macy NULL, ZIO_FLAG_MUSTSUCCEED);
3806eda14cbcSMatt Macy err = bpobj_iterate(&dp->dp_free_bpobj,
3807eda14cbcSMatt Macy bpobj_dsl_scan_free_block_cb, scn, tx);
3808eda14cbcSMatt Macy VERIFY0(zio_wait(scn->scn_zio_root));
3809eda14cbcSMatt Macy scn->scn_zio_root = NULL;
3810eda14cbcSMatt Macy
3811eda14cbcSMatt Macy if (err != 0 && err != ERESTART)
3812eda14cbcSMatt Macy zfs_panic_recover("error %u from bpobj_iterate()", err);
3813eda14cbcSMatt Macy }
3814eda14cbcSMatt Macy
3815eda14cbcSMatt Macy if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
3816eda14cbcSMatt Macy ASSERT(scn->scn_async_destroying);
3817eda14cbcSMatt Macy scn->scn_is_bptree = B_TRUE;
3818eda14cbcSMatt Macy scn->scn_zio_root = zio_root(spa, NULL,
3819eda14cbcSMatt Macy NULL, ZIO_FLAG_MUSTSUCCEED);
3820eda14cbcSMatt Macy err = bptree_iterate(dp->dp_meta_objset,
3821eda14cbcSMatt Macy dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
3822eda14cbcSMatt Macy VERIFY0(zio_wait(scn->scn_zio_root));
3823eda14cbcSMatt Macy scn->scn_zio_root = NULL;
3824eda14cbcSMatt Macy
3825eda14cbcSMatt Macy if (err == EIO || err == ECKSUM) {
3826eda14cbcSMatt Macy err = 0;
3827eda14cbcSMatt Macy } else if (err != 0 && err != ERESTART) {
3828eda14cbcSMatt Macy zfs_panic_recover("error %u from "
3829eda14cbcSMatt Macy "traverse_dataset_destroyed()", err);
3830eda14cbcSMatt Macy }
3831eda14cbcSMatt Macy
3832eda14cbcSMatt Macy if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
3833eda14cbcSMatt Macy /* finished; deactivate async destroy feature */
3834eda14cbcSMatt Macy spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
3835eda14cbcSMatt Macy ASSERT(!spa_feature_is_active(spa,
3836eda14cbcSMatt Macy SPA_FEATURE_ASYNC_DESTROY));
3837eda14cbcSMatt Macy VERIFY0(zap_remove(dp->dp_meta_objset,
3838eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT,
3839eda14cbcSMatt Macy DMU_POOL_BPTREE_OBJ, tx));
3840eda14cbcSMatt Macy VERIFY0(bptree_free(dp->dp_meta_objset,
3841eda14cbcSMatt Macy dp->dp_bptree_obj, tx));
3842eda14cbcSMatt Macy dp->dp_bptree_obj = 0;
3843eda14cbcSMatt Macy scn->scn_async_destroying = B_FALSE;
3844eda14cbcSMatt Macy scn->scn_async_stalled = B_FALSE;
3845eda14cbcSMatt Macy } else {
3846eda14cbcSMatt Macy /*
3847eda14cbcSMatt Macy * If we didn't make progress, mark the async
3848eda14cbcSMatt Macy * destroy as stalled, so that we will not initiate
3849eda14cbcSMatt Macy * a spa_sync() on its behalf. Note that we only
3850eda14cbcSMatt Macy * check this if we are not finished, because if the
3851eda14cbcSMatt Macy * bptree had no blocks for us to visit, we can
3852eda14cbcSMatt Macy * finish without "making progress".
3853eda14cbcSMatt Macy */
3854eda14cbcSMatt Macy scn->scn_async_stalled =
3855eda14cbcSMatt Macy (scn->scn_visited_this_txg == 0);
3856eda14cbcSMatt Macy }
3857eda14cbcSMatt Macy }
3858eda14cbcSMatt Macy if (scn->scn_visited_this_txg) {
3859eda14cbcSMatt Macy zfs_dbgmsg("freed %llu blocks in %llums from "
386081b22a98SMartin Matuska "free_bpobj/bptree on %s in txg %llu; err=%u",
3861eda14cbcSMatt Macy (longlong_t)scn->scn_visited_this_txg,
3862eda14cbcSMatt Macy (longlong_t)
3863eda14cbcSMatt Macy NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
386481b22a98SMartin Matuska spa->spa_name, (longlong_t)tx->tx_txg, err);
3865eda14cbcSMatt Macy scn->scn_visited_this_txg = 0;
3866eda14cbcSMatt Macy scn->scn_dedup_frees_this_txg = 0;
3867eda14cbcSMatt Macy
3868eda14cbcSMatt Macy /*
38692a58b312SMartin Matuska * Write out changes to the DDT and the BRT that may be required
38702a58b312SMartin Matuska * as a result of the blocks freed. This ensures that the DDT
38712a58b312SMartin Matuska * and the BRT are clean when a scrub/resilver runs.
3872eda14cbcSMatt Macy */
3873eda14cbcSMatt Macy ddt_sync(spa, tx->tx_txg);
38742a58b312SMartin Matuska brt_sync(spa, tx->tx_txg);
3875eda14cbcSMatt Macy }
3876eda14cbcSMatt Macy if (err != 0)
3877eda14cbcSMatt Macy return (err);
3878eda14cbcSMatt Macy if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
3879eda14cbcSMatt Macy zfs_free_leak_on_eio &&
3880eda14cbcSMatt Macy (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
3881eda14cbcSMatt Macy dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
3882eda14cbcSMatt Macy dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
3883eda14cbcSMatt Macy /*
3884eda14cbcSMatt Macy * We have finished background destroying, but there is still
3885eda14cbcSMatt Macy * some space left in the dp_free_dir. Transfer this leaked
3886eda14cbcSMatt Macy * space to the dp_leak_dir.
3887eda14cbcSMatt Macy */
3888eda14cbcSMatt Macy if (dp->dp_leak_dir == NULL) {
3889eda14cbcSMatt Macy rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
3890eda14cbcSMatt Macy (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
3891eda14cbcSMatt Macy LEAK_DIR_NAME, tx);
3892eda14cbcSMatt Macy VERIFY0(dsl_pool_open_special_dir(dp,
3893eda14cbcSMatt Macy LEAK_DIR_NAME, &dp->dp_leak_dir));
3894eda14cbcSMatt Macy rrw_exit(&dp->dp_config_rwlock, FTAG);
3895eda14cbcSMatt Macy }
3896eda14cbcSMatt Macy dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
3897eda14cbcSMatt Macy dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
3898eda14cbcSMatt Macy dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
3899eda14cbcSMatt Macy dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
3900eda14cbcSMatt Macy dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
3901eda14cbcSMatt Macy -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
3902eda14cbcSMatt Macy -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
3903eda14cbcSMatt Macy -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
3904eda14cbcSMatt Macy }
3905eda14cbcSMatt Macy
3906eda14cbcSMatt Macy if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
3907eda14cbcSMatt Macy !spa_livelist_delete_check(spa)) {
3908eda14cbcSMatt Macy /* finished; verify that space accounting went to zero */
3909eda14cbcSMatt Macy ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
3910eda14cbcSMatt Macy ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
3911eda14cbcSMatt Macy ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
3912eda14cbcSMatt Macy }
3913eda14cbcSMatt Macy
3914eda14cbcSMatt Macy spa_notify_waiters(spa);
3915eda14cbcSMatt Macy
3916eda14cbcSMatt Macy EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
3917eda14cbcSMatt Macy 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3918eda14cbcSMatt Macy DMU_POOL_OBSOLETE_BPOBJ));
3919eda14cbcSMatt Macy if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3920eda14cbcSMatt Macy ASSERT(spa_feature_is_active(dp->dp_spa,
3921eda14cbcSMatt Macy SPA_FEATURE_OBSOLETE_COUNTS));
3922eda14cbcSMatt Macy
3923eda14cbcSMatt Macy scn->scn_is_bptree = B_FALSE;
3924eda14cbcSMatt Macy scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
3925eda14cbcSMatt Macy err = bpobj_iterate(&dp->dp_obsolete_bpobj,
3926eda14cbcSMatt Macy dsl_scan_obsolete_block_cb, scn, tx);
3927eda14cbcSMatt Macy if (err != 0 && err != ERESTART)
3928eda14cbcSMatt Macy zfs_panic_recover("error %u from bpobj_iterate()", err);
3929eda14cbcSMatt Macy
3930eda14cbcSMatt Macy if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
3931eda14cbcSMatt Macy dsl_pool_destroy_obsolete_bpobj(dp, tx);
3932eda14cbcSMatt Macy }
3933eda14cbcSMatt Macy return (0);
3934eda14cbcSMatt Macy }
3935eda14cbcSMatt Macy
3936c0a83fe0SMartin Matuska static void
name_to_bookmark(char * buf,zbookmark_phys_t * zb)3937c0a83fe0SMartin Matuska name_to_bookmark(char *buf, zbookmark_phys_t *zb)
3938c0a83fe0SMartin Matuska {
3939c0a83fe0SMartin Matuska zb->zb_objset = zfs_strtonum(buf, &buf);
3940c0a83fe0SMartin Matuska ASSERT(*buf == ':');
3941c0a83fe0SMartin Matuska zb->zb_object = zfs_strtonum(buf + 1, &buf);
3942c0a83fe0SMartin Matuska ASSERT(*buf == ':');
3943c0a83fe0SMartin Matuska zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
3944c0a83fe0SMartin Matuska ASSERT(*buf == ':');
3945c0a83fe0SMartin Matuska zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
3946c0a83fe0SMartin Matuska ASSERT(*buf == '\0');
3947c0a83fe0SMartin Matuska }
3948c0a83fe0SMartin Matuska
3949c0a83fe0SMartin Matuska static void
name_to_object(char * buf,uint64_t * obj)3950c0a83fe0SMartin Matuska name_to_object(char *buf, uint64_t *obj)
3951c0a83fe0SMartin Matuska {
3952c0a83fe0SMartin Matuska *obj = zfs_strtonum(buf, &buf);
3953c0a83fe0SMartin Matuska ASSERT(*buf == '\0');
3954c0a83fe0SMartin Matuska }
3955c0a83fe0SMartin Matuska
3956c0a83fe0SMartin Matuska static void
read_by_block_level(dsl_scan_t * scn,zbookmark_phys_t zb)3957c0a83fe0SMartin Matuska read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
3958c0a83fe0SMartin Matuska {
3959c0a83fe0SMartin Matuska dsl_pool_t *dp = scn->scn_dp;
3960c0a83fe0SMartin Matuska dsl_dataset_t *ds;
3961c0a83fe0SMartin Matuska objset_t *os;
3962c0a83fe0SMartin Matuska if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
3963c0a83fe0SMartin Matuska return;
3964c0a83fe0SMartin Matuska
3965c0a83fe0SMartin Matuska if (dmu_objset_from_ds(ds, &os) != 0) {
3966c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
3967c0a83fe0SMartin Matuska return;
3968c0a83fe0SMartin Matuska }
3969c0a83fe0SMartin Matuska
3970c0a83fe0SMartin Matuska /*
3971c0a83fe0SMartin Matuska * If the key is not loaded dbuf_dnode_findbp() will error out with
3972c0a83fe0SMartin Matuska * EACCES. However in that case dnode_hold() will eventually call
3973c0a83fe0SMartin Matuska * dbuf_read()->zio_wait() which may call spa_log_error(). This will
3974c0a83fe0SMartin Matuska * lead to a deadlock due to us holding the mutex spa_errlist_lock.
3975c0a83fe0SMartin Matuska * Avoid this by checking here if the keys are loaded, if not return.
3976c0a83fe0SMartin Matuska * If the keys are not loaded the head_errlog feature is meaningless
3977c0a83fe0SMartin Matuska * as we cannot figure out the birth txg of the block pointer.
3978c0a83fe0SMartin Matuska */
3979c0a83fe0SMartin Matuska if (dsl_dataset_get_keystatus(ds->ds_dir) ==
3980c0a83fe0SMartin Matuska ZFS_KEYSTATUS_UNAVAILABLE) {
3981c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
3982c0a83fe0SMartin Matuska return;
3983c0a83fe0SMartin Matuska }
3984c0a83fe0SMartin Matuska
3985c0a83fe0SMartin Matuska dnode_t *dn;
3986c0a83fe0SMartin Matuska blkptr_t bp;
3987c0a83fe0SMartin Matuska
3988c0a83fe0SMartin Matuska if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) {
3989c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
3990c0a83fe0SMartin Matuska return;
3991c0a83fe0SMartin Matuska }
3992c0a83fe0SMartin Matuska
3993c0a83fe0SMartin Matuska rw_enter(&dn->dn_struct_rwlock, RW_READER);
3994c0a83fe0SMartin Matuska int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
3995c0a83fe0SMartin Matuska NULL);
3996c0a83fe0SMartin Matuska
3997c0a83fe0SMartin Matuska if (error) {
3998c0a83fe0SMartin Matuska rw_exit(&dn->dn_struct_rwlock);
3999c0a83fe0SMartin Matuska dnode_rele(dn, FTAG);
4000c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4001c0a83fe0SMartin Matuska return;
4002c0a83fe0SMartin Matuska }
4003c0a83fe0SMartin Matuska
4004c0a83fe0SMartin Matuska if (!error && BP_IS_HOLE(&bp)) {
4005c0a83fe0SMartin Matuska rw_exit(&dn->dn_struct_rwlock);
4006c0a83fe0SMartin Matuska dnode_rele(dn, FTAG);
4007c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4008c0a83fe0SMartin Matuska return;
4009c0a83fe0SMartin Matuska }
4010c0a83fe0SMartin Matuska
4011c0a83fe0SMartin Matuska int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
4012c0a83fe0SMartin Matuska ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
4013c0a83fe0SMartin Matuska
4014c0a83fe0SMartin Matuska /* If it's an intent log block, failure is expected. */
4015c0a83fe0SMartin Matuska if (zb.zb_level == ZB_ZIL_LEVEL)
4016c0a83fe0SMartin Matuska zio_flags |= ZIO_FLAG_SPECULATIVE;
4017c0a83fe0SMartin Matuska
4018c0a83fe0SMartin Matuska ASSERT(!BP_IS_EMBEDDED(&bp));
4019c0a83fe0SMartin Matuska scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
4020c0a83fe0SMartin Matuska rw_exit(&dn->dn_struct_rwlock);
4021c0a83fe0SMartin Matuska dnode_rele(dn, FTAG);
4022c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4023c0a83fe0SMartin Matuska }
4024c0a83fe0SMartin Matuska
4025c0a83fe0SMartin Matuska /*
4026c0a83fe0SMartin Matuska * We keep track of the scrubbed error blocks in "count". This will be used
4027c0a83fe0SMartin Matuska * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This
4028c0a83fe0SMartin Matuska * function is modelled after check_filesystem().
4029c0a83fe0SMartin Matuska */
4030c0a83fe0SMartin Matuska static int
scrub_filesystem(spa_t * spa,uint64_t fs,zbookmark_err_phys_t * zep,int * count)4031c0a83fe0SMartin Matuska scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
4032c0a83fe0SMartin Matuska int *count)
4033c0a83fe0SMartin Matuska {
4034c0a83fe0SMartin Matuska dsl_dataset_t *ds;
4035c0a83fe0SMartin Matuska dsl_pool_t *dp = spa->spa_dsl_pool;
4036c0a83fe0SMartin Matuska dsl_scan_t *scn = dp->dp_scan;
4037c0a83fe0SMartin Matuska
4038c0a83fe0SMartin Matuska int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
4039c0a83fe0SMartin Matuska if (error != 0)
4040c0a83fe0SMartin Matuska return (error);
4041c0a83fe0SMartin Matuska
4042c0a83fe0SMartin Matuska uint64_t latest_txg;
4043c0a83fe0SMartin Matuska uint64_t txg_to_consider = spa->spa_syncing_txg;
4044c0a83fe0SMartin Matuska boolean_t check_snapshot = B_TRUE;
4045c0a83fe0SMartin Matuska
4046c0a83fe0SMartin Matuska error = find_birth_txg(ds, zep, &latest_txg);
4047c0a83fe0SMartin Matuska
4048c0a83fe0SMartin Matuska /*
4049c0a83fe0SMartin Matuska * If find_birth_txg() errors out, then err on the side of caution and
4050c0a83fe0SMartin Matuska * proceed. In worst case scenario scrub all objects. If zep->zb_birth
4051c0a83fe0SMartin Matuska * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
4052c0a83fe0SMartin Matuska * scrub all objects.
4053c0a83fe0SMartin Matuska */
4054c0a83fe0SMartin Matuska if (error == 0 && zep->zb_birth == latest_txg) {
4055c0a83fe0SMartin Matuska /* Block neither free nor re written. */
4056c0a83fe0SMartin Matuska zbookmark_phys_t zb;
4057c0a83fe0SMartin Matuska zep_to_zb(fs, zep, &zb);
4058c0a83fe0SMartin Matuska scn->scn_zio_root = zio_root(spa, NULL, NULL,
4059c0a83fe0SMartin Matuska ZIO_FLAG_CANFAIL);
4060c0a83fe0SMartin Matuska /* We have already acquired the config lock for spa */
4061c0a83fe0SMartin Matuska read_by_block_level(scn, zb);
4062c0a83fe0SMartin Matuska
4063c0a83fe0SMartin Matuska (void) zio_wait(scn->scn_zio_root);
4064c0a83fe0SMartin Matuska scn->scn_zio_root = NULL;
4065c0a83fe0SMartin Matuska
4066c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_examined++;
4067c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_to_examine--;
4068c0a83fe0SMartin Matuska (*count)++;
4069c0a83fe0SMartin Matuska if ((*count) == zfs_scrub_error_blocks_per_txg ||
4070c0a83fe0SMartin Matuska dsl_error_scrub_check_suspend(scn, &zb)) {
4071c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4072c0a83fe0SMartin Matuska return (SET_ERROR(EFAULT));
4073c0a83fe0SMartin Matuska }
4074c0a83fe0SMartin Matuska
4075c0a83fe0SMartin Matuska check_snapshot = B_FALSE;
4076c0a83fe0SMartin Matuska } else if (error == 0) {
4077c0a83fe0SMartin Matuska txg_to_consider = latest_txg;
4078c0a83fe0SMartin Matuska }
4079c0a83fe0SMartin Matuska
4080c0a83fe0SMartin Matuska /*
4081c0a83fe0SMartin Matuska * Retrieve the number of snapshots if the dataset is not a snapshot.
4082c0a83fe0SMartin Matuska */
4083c0a83fe0SMartin Matuska uint64_t snap_count = 0;
4084c0a83fe0SMartin Matuska if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
4085c0a83fe0SMartin Matuska
4086c0a83fe0SMartin Matuska error = zap_count(spa->spa_meta_objset,
4087c0a83fe0SMartin Matuska dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
4088c0a83fe0SMartin Matuska
4089c0a83fe0SMartin Matuska if (error != 0) {
4090c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4091c0a83fe0SMartin Matuska return (error);
4092c0a83fe0SMartin Matuska }
4093c0a83fe0SMartin Matuska }
4094c0a83fe0SMartin Matuska
4095c0a83fe0SMartin Matuska if (snap_count == 0) {
4096c0a83fe0SMartin Matuska /* Filesystem without snapshots. */
4097c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4098c0a83fe0SMartin Matuska return (0);
4099c0a83fe0SMartin Matuska }
4100c0a83fe0SMartin Matuska
4101c0a83fe0SMartin Matuska uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
4102c0a83fe0SMartin Matuska uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
4103c0a83fe0SMartin Matuska
4104c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4105c0a83fe0SMartin Matuska
4106c0a83fe0SMartin Matuska /* Check only snapshots created from this file system. */
4107c0a83fe0SMartin Matuska while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
4108c0a83fe0SMartin Matuska snap_obj_txg <= txg_to_consider) {
4109c0a83fe0SMartin Matuska
4110c0a83fe0SMartin Matuska error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
4111c0a83fe0SMartin Matuska if (error != 0)
4112c0a83fe0SMartin Matuska return (error);
4113c0a83fe0SMartin Matuska
4114c0a83fe0SMartin Matuska if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
4115c0a83fe0SMartin Matuska snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
4116c0a83fe0SMartin Matuska snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
4117c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4118c0a83fe0SMartin Matuska continue;
4119c0a83fe0SMartin Matuska }
4120c0a83fe0SMartin Matuska
4121c0a83fe0SMartin Matuska boolean_t affected = B_TRUE;
4122c0a83fe0SMartin Matuska if (check_snapshot) {
4123c0a83fe0SMartin Matuska uint64_t blk_txg;
4124c0a83fe0SMartin Matuska error = find_birth_txg(ds, zep, &blk_txg);
4125c0a83fe0SMartin Matuska
4126c0a83fe0SMartin Matuska /*
4127c0a83fe0SMartin Matuska * Scrub the snapshot also when zb_birth == 0 or when
4128c0a83fe0SMartin Matuska * find_birth_txg() returns an error.
4129c0a83fe0SMartin Matuska */
4130c0a83fe0SMartin Matuska affected = (error == 0 && zep->zb_birth == blk_txg) ||
4131c0a83fe0SMartin Matuska (error != 0) || (zep->zb_birth == 0);
4132c0a83fe0SMartin Matuska }
4133c0a83fe0SMartin Matuska
4134c0a83fe0SMartin Matuska /* Scrub snapshots. */
4135c0a83fe0SMartin Matuska if (affected) {
4136c0a83fe0SMartin Matuska zbookmark_phys_t zb;
4137c0a83fe0SMartin Matuska zep_to_zb(snap_obj, zep, &zb);
4138c0a83fe0SMartin Matuska scn->scn_zio_root = zio_root(spa, NULL, NULL,
4139c0a83fe0SMartin Matuska ZIO_FLAG_CANFAIL);
4140c0a83fe0SMartin Matuska /* We have already acquired the config lock for spa */
4141c0a83fe0SMartin Matuska read_by_block_level(scn, zb);
4142c0a83fe0SMartin Matuska
4143c0a83fe0SMartin Matuska (void) zio_wait(scn->scn_zio_root);
4144c0a83fe0SMartin Matuska scn->scn_zio_root = NULL;
4145c0a83fe0SMartin Matuska
4146c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_examined++;
4147c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_to_examine--;
4148c0a83fe0SMartin Matuska (*count)++;
4149c0a83fe0SMartin Matuska if ((*count) == zfs_scrub_error_blocks_per_txg ||
4150c0a83fe0SMartin Matuska dsl_error_scrub_check_suspend(scn, &zb)) {
4151c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4152c0a83fe0SMartin Matuska return (EFAULT);
4153c0a83fe0SMartin Matuska }
4154c0a83fe0SMartin Matuska }
4155c0a83fe0SMartin Matuska snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
4156c0a83fe0SMartin Matuska snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
4157c0a83fe0SMartin Matuska dsl_dataset_rele(ds, FTAG);
4158c0a83fe0SMartin Matuska }
4159c0a83fe0SMartin Matuska return (0);
4160c0a83fe0SMartin Matuska }
4161c0a83fe0SMartin Matuska
4162c0a83fe0SMartin Matuska void
dsl_errorscrub_sync(dsl_pool_t * dp,dmu_tx_t * tx)4163c0a83fe0SMartin Matuska dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
4164c0a83fe0SMartin Matuska {
4165c0a83fe0SMartin Matuska spa_t *spa = dp->dp_spa;
4166c0a83fe0SMartin Matuska dsl_scan_t *scn = dp->dp_scan;
4167c0a83fe0SMartin Matuska
4168c0a83fe0SMartin Matuska /*
4169c0a83fe0SMartin Matuska * Only process scans in sync pass 1.
4170c0a83fe0SMartin Matuska */
4171c0a83fe0SMartin Matuska
4172c0a83fe0SMartin Matuska if (spa_sync_pass(spa) > 1)
4173c0a83fe0SMartin Matuska return;
4174c0a83fe0SMartin Matuska
4175c0a83fe0SMartin Matuska /*
4176c0a83fe0SMartin Matuska * If the spa is shutting down, then stop scanning. This will
4177c0a83fe0SMartin Matuska * ensure that the scan does not dirty any new data during the
4178c0a83fe0SMartin Matuska * shutdown phase.
4179c0a83fe0SMartin Matuska */
4180c0a83fe0SMartin Matuska if (spa_shutting_down(spa))
4181c0a83fe0SMartin Matuska return;
4182c0a83fe0SMartin Matuska
4183c0a83fe0SMartin Matuska if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
4184c0a83fe0SMartin Matuska return;
4185c0a83fe0SMartin Matuska }
4186c0a83fe0SMartin Matuska
4187c0a83fe0SMartin Matuska if (dsl_scan_resilvering(scn->scn_dp)) {
4188c0a83fe0SMartin Matuska /* cancel the error scrub if resilver started */
4189c0a83fe0SMartin Matuska dsl_scan_cancel(scn->scn_dp);
4190c0a83fe0SMartin Matuska return;
4191c0a83fe0SMartin Matuska }
4192c0a83fe0SMartin Matuska
4193c0a83fe0SMartin Matuska spa->spa_scrub_active = B_TRUE;
4194c0a83fe0SMartin Matuska scn->scn_sync_start_time = gethrtime();
4195c0a83fe0SMartin Matuska
4196c0a83fe0SMartin Matuska /*
4197c0a83fe0SMartin Matuska * zfs_scan_suspend_progress can be set to disable scrub progress.
4198c0a83fe0SMartin Matuska * See more detailed comment in dsl_scan_sync().
4199c0a83fe0SMartin Matuska */
4200c0a83fe0SMartin Matuska if (zfs_scan_suspend_progress) {
4201c0a83fe0SMartin Matuska uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
4202c0a83fe0SMartin Matuska int mintime = zfs_scrub_min_time_ms;
4203c0a83fe0SMartin Matuska
4204c0a83fe0SMartin Matuska while (zfs_scan_suspend_progress &&
4205c0a83fe0SMartin Matuska !txg_sync_waiting(scn->scn_dp) &&
4206c0a83fe0SMartin Matuska !spa_shutting_down(scn->scn_dp->dp_spa) &&
4207c0a83fe0SMartin Matuska NSEC2MSEC(scan_time_ns) < mintime) {
4208c0a83fe0SMartin Matuska delay(hz);
4209c0a83fe0SMartin Matuska scan_time_ns = gethrtime() - scn->scn_sync_start_time;
4210c0a83fe0SMartin Matuska }
4211c0a83fe0SMartin Matuska return;
4212c0a83fe0SMartin Matuska }
4213c0a83fe0SMartin Matuska
4214c0a83fe0SMartin Matuska int i = 0;
4215c0a83fe0SMartin Matuska zap_attribute_t *za;
4216c0a83fe0SMartin Matuska zbookmark_phys_t *zb;
4217c0a83fe0SMartin Matuska boolean_t limit_exceeded = B_FALSE;
4218c0a83fe0SMartin Matuska
42197a7741afSMartin Matuska za = zap_attribute_alloc();
4220c0a83fe0SMartin Matuska zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
4221c0a83fe0SMartin Matuska
4222c0a83fe0SMartin Matuska if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
4223c0a83fe0SMartin Matuska for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
4224c0a83fe0SMartin Matuska zap_cursor_advance(&scn->errorscrub_cursor)) {
4225c0a83fe0SMartin Matuska name_to_bookmark(za->za_name, zb);
4226c0a83fe0SMartin Matuska
4227c0a83fe0SMartin Matuska scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
4228c0a83fe0SMartin Matuska NULL, ZIO_FLAG_CANFAIL);
4229c0a83fe0SMartin Matuska dsl_pool_config_enter(dp, FTAG);
4230c0a83fe0SMartin Matuska read_by_block_level(scn, *zb);
4231c0a83fe0SMartin Matuska dsl_pool_config_exit(dp, FTAG);
4232c0a83fe0SMartin Matuska
4233c0a83fe0SMartin Matuska (void) zio_wait(scn->scn_zio_root);
4234c0a83fe0SMartin Matuska scn->scn_zio_root = NULL;
4235c0a83fe0SMartin Matuska
4236c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_examined += 1;
4237c0a83fe0SMartin Matuska scn->errorscrub_phys.dep_to_examine -= 1;
4238c0a83fe0SMartin Matuska i++;
4239c0a83fe0SMartin Matuska if (i == zfs_scrub_error_blocks_per_txg ||
4240c0a83fe0SMartin Matuska dsl_error_scrub_check_suspend(scn, zb)) {
4241c0a83fe0SMartin Matuska limit_exceeded = B_TRUE;
4242c0a83fe0SMartin Matuska break;
4243c0a83fe0SMartin Matuska }
4244c0a83fe0SMartin Matuska }
4245c0a83fe0SMartin Matuska
4246c0a83fe0SMartin Matuska if (!limit_exceeded)
4247c0a83fe0SMartin Matuska dsl_errorscrub_done(scn, B_TRUE, tx);
4248c0a83fe0SMartin Matuska
4249c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
42507a7741afSMartin Matuska zap_attribute_free(za);
4251c0a83fe0SMartin Matuska kmem_free(zb, sizeof (*zb));
4252c0a83fe0SMartin Matuska return;
4253c0a83fe0SMartin Matuska }
4254c0a83fe0SMartin Matuska
4255c0a83fe0SMartin Matuska int error = 0;
4256c0a83fe0SMartin Matuska for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
4257c0a83fe0SMartin Matuska zap_cursor_advance(&scn->errorscrub_cursor)) {
4258c0a83fe0SMartin Matuska
4259c0a83fe0SMartin Matuska zap_cursor_t *head_ds_cursor;
4260c0a83fe0SMartin Matuska zap_attribute_t *head_ds_attr;
4261c0a83fe0SMartin Matuska zbookmark_err_phys_t head_ds_block;
4262c0a83fe0SMartin Matuska
4263c0a83fe0SMartin Matuska head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
42647a7741afSMartin Matuska head_ds_attr = zap_attribute_alloc();
4265c0a83fe0SMartin Matuska
4266c0a83fe0SMartin Matuska uint64_t head_ds_err_obj = za->za_first_integer;
4267c0a83fe0SMartin Matuska uint64_t head_ds;
4268c0a83fe0SMartin Matuska name_to_object(za->za_name, &head_ds);
4269c0a83fe0SMartin Matuska boolean_t config_held = B_FALSE;
4270c0a83fe0SMartin Matuska uint64_t top_affected_fs;
4271c0a83fe0SMartin Matuska
4272c0a83fe0SMartin Matuska for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
4273c0a83fe0SMartin Matuska head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
4274c0a83fe0SMartin Matuska head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
4275c0a83fe0SMartin Matuska
4276c0a83fe0SMartin Matuska name_to_errphys(head_ds_attr->za_name, &head_ds_block);
4277c0a83fe0SMartin Matuska
4278c0a83fe0SMartin Matuska /*
4279c0a83fe0SMartin Matuska * In case we are called from spa_sync the pool
4280c0a83fe0SMartin Matuska * config is already held.
4281c0a83fe0SMartin Matuska */
4282c0a83fe0SMartin Matuska if (!dsl_pool_config_held(dp)) {
4283c0a83fe0SMartin Matuska dsl_pool_config_enter(dp, FTAG);
4284c0a83fe0SMartin Matuska config_held = B_TRUE;
4285c0a83fe0SMartin Matuska }
4286c0a83fe0SMartin Matuska
4287c0a83fe0SMartin Matuska error = find_top_affected_fs(spa,
4288c0a83fe0SMartin Matuska head_ds, &head_ds_block, &top_affected_fs);
4289c0a83fe0SMartin Matuska if (error)
4290c0a83fe0SMartin Matuska break;
4291c0a83fe0SMartin Matuska
4292c0a83fe0SMartin Matuska error = scrub_filesystem(spa, top_affected_fs,
4293c0a83fe0SMartin Matuska &head_ds_block, &i);
4294c0a83fe0SMartin Matuska
4295c0a83fe0SMartin Matuska if (error == SET_ERROR(EFAULT)) {
4296c0a83fe0SMartin Matuska limit_exceeded = B_TRUE;
4297c0a83fe0SMartin Matuska break;
4298c0a83fe0SMartin Matuska }
4299c0a83fe0SMartin Matuska }
4300c0a83fe0SMartin Matuska
4301c0a83fe0SMartin Matuska zap_cursor_fini(head_ds_cursor);
4302c0a83fe0SMartin Matuska kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
43037a7741afSMartin Matuska zap_attribute_free(head_ds_attr);
4304c0a83fe0SMartin Matuska
4305c0a83fe0SMartin Matuska if (config_held)
4306c0a83fe0SMartin Matuska dsl_pool_config_exit(dp, FTAG);
4307c0a83fe0SMartin Matuska }
4308c0a83fe0SMartin Matuska
43097a7741afSMartin Matuska zap_attribute_free(za);
4310c0a83fe0SMartin Matuska kmem_free(zb, sizeof (*zb));
4311c0a83fe0SMartin Matuska if (!limit_exceeded)
4312c0a83fe0SMartin Matuska dsl_errorscrub_done(scn, B_TRUE, tx);
4313c0a83fe0SMartin Matuska
4314c0a83fe0SMartin Matuska dsl_errorscrub_sync_state(scn, tx);
4315c0a83fe0SMartin Matuska }
4316c0a83fe0SMartin Matuska
4317eda14cbcSMatt Macy /*
4318eda14cbcSMatt Macy * This is the primary entry point for scans that is called from syncing
4319eda14cbcSMatt Macy * context. Scans must happen entirely during syncing context so that we
4320eda14cbcSMatt Macy * can guarantee that blocks we are currently scanning will not change out
4321eda14cbcSMatt Macy * from under us. While a scan is active, this function controls how quickly
4322eda14cbcSMatt Macy * transaction groups proceed, instead of the normal handling provided by
4323eda14cbcSMatt Macy * txg_sync_thread().
4324eda14cbcSMatt Macy */
4325eda14cbcSMatt Macy void
dsl_scan_sync(dsl_pool_t * dp,dmu_tx_t * tx)4326eda14cbcSMatt Macy dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
4327eda14cbcSMatt Macy {
4328eda14cbcSMatt Macy int err = 0;
4329eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
4330eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
4331eda14cbcSMatt Macy state_sync_type_t sync_type = SYNC_OPTIONAL;
43327a7741afSMartin Matuska int restart_early = 0;
4333eda14cbcSMatt Macy
43347a7741afSMartin Matuska if (spa->spa_resilver_deferred) {
43357a7741afSMartin Matuska uint64_t to_issue, issued;
43367a7741afSMartin Matuska
43377a7741afSMartin Matuska if (!spa_feature_is_active(dp->dp_spa,
43387a7741afSMartin Matuska SPA_FEATURE_RESILVER_DEFER))
4339eda14cbcSMatt Macy spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
4340eda14cbcSMatt Macy
4341eda14cbcSMatt Macy /*
43427a7741afSMartin Matuska * See print_scan_scrub_resilver_status() issued/total_i
43437a7741afSMartin Matuska * @ cmd/zpool/zpool_main.c
4344eda14cbcSMatt Macy */
43457a7741afSMartin Matuska to_issue =
43467a7741afSMartin Matuska scn->scn_phys.scn_to_examine - scn->scn_phys.scn_skipped;
43477a7741afSMartin Matuska issued =
43487a7741afSMartin Matuska scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
43497a7741afSMartin Matuska restart_early =
43507a7741afSMartin Matuska zfs_resilver_disable_defer ||
43517a7741afSMartin Matuska (issued < (to_issue * zfs_resilver_defer_percent / 100));
4352eda14cbcSMatt Macy }
4353eda14cbcSMatt Macy
4354eda14cbcSMatt Macy /*
4355eda14cbcSMatt Macy * Only process scans in sync pass 1.
4356eda14cbcSMatt Macy */
4357eda14cbcSMatt Macy if (spa_sync_pass(spa) > 1)
4358eda14cbcSMatt Macy return;
4359eda14cbcSMatt Macy
43607a7741afSMartin Matuska
43617a7741afSMartin Matuska /*
43627a7741afSMartin Matuska * Check for scn_restart_txg before checking spa_load_state, so
43637a7741afSMartin Matuska * that we can restart an old-style scan while the pool is being
43647a7741afSMartin Matuska * imported (see dsl_scan_init). We also restart scans if there
43657a7741afSMartin Matuska * is a deferred resilver and the user has manually disabled
43667a7741afSMartin Matuska * deferred resilvers via zfs_resilver_disable_defer, or if the
43677a7741afSMartin Matuska * current scan progress is below zfs_resilver_defer_percent.
43687a7741afSMartin Matuska */
43697a7741afSMartin Matuska if (dsl_scan_restarting(scn, tx) || restart_early) {
437017aab35aSMartin Matuska setup_sync_arg_t setup_sync_arg = {
437117aab35aSMartin Matuska .func = POOL_SCAN_SCRUB,
437217aab35aSMartin Matuska .txgstart = 0,
437317aab35aSMartin Matuska .txgend = 0,
437417aab35aSMartin Matuska };
43757a7741afSMartin Matuska dsl_scan_done(scn, B_FALSE, tx);
43767a7741afSMartin Matuska if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
437717aab35aSMartin Matuska setup_sync_arg.func = POOL_SCAN_RESILVER;
43787a7741afSMartin Matuska zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d",
437917aab35aSMartin Matuska setup_sync_arg.func, dp->dp_spa->spa_name,
438017aab35aSMartin Matuska (longlong_t)tx->tx_txg, restart_early);
438117aab35aSMartin Matuska dsl_scan_setup_sync(&setup_sync_arg, tx);
43827a7741afSMartin Matuska }
43837a7741afSMartin Matuska
4384eda14cbcSMatt Macy /*
4385eda14cbcSMatt Macy * If the spa is shutting down, then stop scanning. This will
4386eda14cbcSMatt Macy * ensure that the scan does not dirty any new data during the
4387eda14cbcSMatt Macy * shutdown phase.
4388eda14cbcSMatt Macy */
4389eda14cbcSMatt Macy if (spa_shutting_down(spa))
4390eda14cbcSMatt Macy return;
4391eda14cbcSMatt Macy
4392eda14cbcSMatt Macy /*
4393eda14cbcSMatt Macy * If the scan is inactive due to a stalled async destroy, try again.
4394eda14cbcSMatt Macy */
4395eda14cbcSMatt Macy if (!scn->scn_async_stalled && !dsl_scan_active(scn))
4396eda14cbcSMatt Macy return;
4397eda14cbcSMatt Macy
4398eda14cbcSMatt Macy /* reset scan statistics */
4399eda14cbcSMatt Macy scn->scn_visited_this_txg = 0;
4400eda14cbcSMatt Macy scn->scn_dedup_frees_this_txg = 0;
4401eda14cbcSMatt Macy scn->scn_holes_this_txg = 0;
4402eda14cbcSMatt Macy scn->scn_lt_min_this_txg = 0;
4403eda14cbcSMatt Macy scn->scn_gt_max_this_txg = 0;
4404eda14cbcSMatt Macy scn->scn_ddt_contained_this_txg = 0;
4405eda14cbcSMatt Macy scn->scn_objsets_visited_this_txg = 0;
4406eda14cbcSMatt Macy scn->scn_avg_seg_size_this_txg = 0;
4407eda14cbcSMatt Macy scn->scn_segs_this_txg = 0;
4408eda14cbcSMatt Macy scn->scn_avg_zio_size_this_txg = 0;
4409eda14cbcSMatt Macy scn->scn_zios_this_txg = 0;
4410eda14cbcSMatt Macy scn->scn_suspending = B_FALSE;
4411eda14cbcSMatt Macy scn->scn_sync_start_time = gethrtime();
4412eda14cbcSMatt Macy spa->spa_scrub_active = B_TRUE;
4413eda14cbcSMatt Macy
4414eda14cbcSMatt Macy /*
4415eda14cbcSMatt Macy * First process the async destroys. If we suspend, don't do
4416eda14cbcSMatt Macy * any scrubbing or resilvering. This ensures that there are no
4417eda14cbcSMatt Macy * async destroys while we are scanning, so the scan code doesn't
4418eda14cbcSMatt Macy * have to worry about traversing it. It is also faster to free the
4419eda14cbcSMatt Macy * blocks than to scrub them.
4420eda14cbcSMatt Macy */
4421eda14cbcSMatt Macy err = dsl_process_async_destroys(dp, tx);
4422eda14cbcSMatt Macy if (err != 0)
4423eda14cbcSMatt Macy return;
4424eda14cbcSMatt Macy
4425eda14cbcSMatt Macy if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
4426eda14cbcSMatt Macy return;
4427eda14cbcSMatt Macy
4428eda14cbcSMatt Macy /*
4429eda14cbcSMatt Macy * Wait a few txgs after importing to begin scanning so that
4430eda14cbcSMatt Macy * we can get the pool imported quickly.
4431eda14cbcSMatt Macy */
4432eda14cbcSMatt Macy if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
4433eda14cbcSMatt Macy return;
4434eda14cbcSMatt Macy
4435eda14cbcSMatt Macy /*
4436eda14cbcSMatt Macy * zfs_scan_suspend_progress can be set to disable scan progress.
4437eda14cbcSMatt Macy * We don't want to spin the txg_sync thread, so we add a delay
4438eda14cbcSMatt Macy * here to simulate the time spent doing a scan. This is mostly
4439eda14cbcSMatt Macy * useful for testing and debugging.
4440eda14cbcSMatt Macy */
4441eda14cbcSMatt Macy if (zfs_scan_suspend_progress) {
4442eda14cbcSMatt Macy uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
4443be181ee2SMartin Matuska uint_t mintime = (scn->scn_phys.scn_func ==
4444be181ee2SMartin Matuska POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
4445be181ee2SMartin Matuska zfs_scrub_min_time_ms;
4446eda14cbcSMatt Macy
4447eda14cbcSMatt Macy while (zfs_scan_suspend_progress &&
4448eda14cbcSMatt Macy !txg_sync_waiting(scn->scn_dp) &&
4449eda14cbcSMatt Macy !spa_shutting_down(scn->scn_dp->dp_spa) &&
4450eda14cbcSMatt Macy NSEC2MSEC(scan_time_ns) < mintime) {
4451eda14cbcSMatt Macy delay(hz);
4452eda14cbcSMatt Macy scan_time_ns = gethrtime() - scn->scn_sync_start_time;
4453eda14cbcSMatt Macy }
4454eda14cbcSMatt Macy return;
4455eda14cbcSMatt Macy }
4456eda14cbcSMatt Macy
4457eda14cbcSMatt Macy /*
4458c9539b89SMartin Matuska * Disabled by default, set zfs_scan_report_txgs to report
4459c9539b89SMartin Matuska * average performance over the last zfs_scan_report_txgs TXGs.
4460c9539b89SMartin Matuska */
44610a97523dSMartin Matuska if (zfs_scan_report_txgs != 0 &&
4462c9539b89SMartin Matuska tx->tx_txg % zfs_scan_report_txgs == 0) {
4463c9539b89SMartin Matuska scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
4464c9539b89SMartin Matuska spa_scan_stat_init(spa);
4465c9539b89SMartin Matuska }
4466c9539b89SMartin Matuska
4467c9539b89SMartin Matuska /*
4468eda14cbcSMatt Macy * It is possible to switch from unsorted to sorted at any time,
4469eda14cbcSMatt Macy * but afterwards the scan will remain sorted unless reloaded from
4470eda14cbcSMatt Macy * a checkpoint after a reboot.
4471eda14cbcSMatt Macy */
4472eda14cbcSMatt Macy if (!zfs_scan_legacy) {
4473eda14cbcSMatt Macy scn->scn_is_sorted = B_TRUE;
4474eda14cbcSMatt Macy if (scn->scn_last_checkpoint == 0)
4475eda14cbcSMatt Macy scn->scn_last_checkpoint = ddi_get_lbolt();
4476eda14cbcSMatt Macy }
4477eda14cbcSMatt Macy
4478eda14cbcSMatt Macy /*
4479eda14cbcSMatt Macy * For sorted scans, determine what kind of work we will be doing
4480eda14cbcSMatt Macy * this txg based on our memory limitations and whether or not we
4481eda14cbcSMatt Macy * need to perform a checkpoint.
4482eda14cbcSMatt Macy */
4483eda14cbcSMatt Macy if (scn->scn_is_sorted) {
4484eda14cbcSMatt Macy /*
4485eda14cbcSMatt Macy * If we are over our checkpoint interval, set scn_clearing
4486eda14cbcSMatt Macy * so that we can begin checkpointing immediately. The
4487eda14cbcSMatt Macy * checkpoint allows us to save a consistent bookmark
4488eda14cbcSMatt Macy * representing how much data we have scrubbed so far.
4489eda14cbcSMatt Macy * Otherwise, use the memory limit to determine if we should
4490eda14cbcSMatt Macy * scan for metadata or start issue scrub IOs. We accumulate
4491eda14cbcSMatt Macy * metadata until we hit our hard memory limit at which point
4492eda14cbcSMatt Macy * we issue scrub IOs until we are at our soft memory limit.
4493eda14cbcSMatt Macy */
4494eda14cbcSMatt Macy if (scn->scn_checkpointing ||
4495eda14cbcSMatt Macy ddi_get_lbolt() - scn->scn_last_checkpoint >
4496eda14cbcSMatt Macy SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
4497eda14cbcSMatt Macy if (!scn->scn_checkpointing)
449881b22a98SMartin Matuska zfs_dbgmsg("begin scan checkpoint for %s",
449981b22a98SMartin Matuska spa->spa_name);
4500eda14cbcSMatt Macy
4501eda14cbcSMatt Macy scn->scn_checkpointing = B_TRUE;
4502eda14cbcSMatt Macy scn->scn_clearing = B_TRUE;
4503eda14cbcSMatt Macy } else {
4504eda14cbcSMatt Macy boolean_t should_clear = dsl_scan_should_clear(scn);
4505eda14cbcSMatt Macy if (should_clear && !scn->scn_clearing) {
450681b22a98SMartin Matuska zfs_dbgmsg("begin scan clearing for %s",
450781b22a98SMartin Matuska spa->spa_name);
4508eda14cbcSMatt Macy scn->scn_clearing = B_TRUE;
4509eda14cbcSMatt Macy } else if (!should_clear && scn->scn_clearing) {
451081b22a98SMartin Matuska zfs_dbgmsg("finish scan clearing for %s",
451181b22a98SMartin Matuska spa->spa_name);
4512eda14cbcSMatt Macy scn->scn_clearing = B_FALSE;
4513eda14cbcSMatt Macy }
4514eda14cbcSMatt Macy }
4515eda14cbcSMatt Macy } else {
4516eda14cbcSMatt Macy ASSERT0(scn->scn_checkpointing);
4517eda14cbcSMatt Macy ASSERT0(scn->scn_clearing);
4518eda14cbcSMatt Macy }
4519eda14cbcSMatt Macy
4520eda14cbcSMatt Macy if (!scn->scn_clearing && scn->scn_done_txg == 0) {
4521eda14cbcSMatt Macy /* Need to scan metadata for more blocks to scrub */
4522eda14cbcSMatt Macy dsl_scan_phys_t *scnp = &scn->scn_phys;
4523eda14cbcSMatt Macy taskqid_t prefetch_tqid;
4524eda14cbcSMatt Macy
4525eda14cbcSMatt Macy /*
4526c9539b89SMartin Matuska * Calculate the max number of in-flight bytes for pool-wide
4527c9539b89SMartin Matuska * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
4528c9539b89SMartin Matuska * Limits for the issuing phase are done per top-level vdev and
4529c9539b89SMartin Matuska * are handled separately.
4530eda14cbcSMatt Macy */
4531c9539b89SMartin Matuska scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
4532c9539b89SMartin Matuska zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
4533eda14cbcSMatt Macy
4534eda14cbcSMatt Macy if (scnp->scn_ddt_bookmark.ddb_class <=
4535eda14cbcSMatt Macy scnp->scn_ddt_class_max) {
4536eda14cbcSMatt Macy ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
453781b22a98SMartin Matuska zfs_dbgmsg("doing scan sync for %s txg %llu; "
4538eda14cbcSMatt Macy "ddt bm=%llu/%llu/%llu/%llx",
453981b22a98SMartin Matuska spa->spa_name,
4540eda14cbcSMatt Macy (longlong_t)tx->tx_txg,
4541eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
4542eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
4543eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
4544eda14cbcSMatt Macy (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
4545eda14cbcSMatt Macy } else {
454681b22a98SMartin Matuska zfs_dbgmsg("doing scan sync for %s txg %llu; "
4547eda14cbcSMatt Macy "bm=%llu/%llu/%llu/%llu",
454881b22a98SMartin Matuska spa->spa_name,
4549eda14cbcSMatt Macy (longlong_t)tx->tx_txg,
4550eda14cbcSMatt Macy (longlong_t)scnp->scn_bookmark.zb_objset,
4551eda14cbcSMatt Macy (longlong_t)scnp->scn_bookmark.zb_object,
4552eda14cbcSMatt Macy (longlong_t)scnp->scn_bookmark.zb_level,
4553eda14cbcSMatt Macy (longlong_t)scnp->scn_bookmark.zb_blkid);
4554eda14cbcSMatt Macy }
4555eda14cbcSMatt Macy
4556eda14cbcSMatt Macy scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
4557eda14cbcSMatt Macy NULL, ZIO_FLAG_CANFAIL);
4558eda14cbcSMatt Macy
4559eda14cbcSMatt Macy scn->scn_prefetch_stop = B_FALSE;
4560eda14cbcSMatt Macy prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
4561eda14cbcSMatt Macy dsl_scan_prefetch_thread, scn, TQ_SLEEP);
4562eda14cbcSMatt Macy ASSERT(prefetch_tqid != TASKQID_INVALID);
4563eda14cbcSMatt Macy
4564eda14cbcSMatt Macy dsl_pool_config_enter(dp, FTAG);
4565eda14cbcSMatt Macy dsl_scan_visit(scn, tx);
4566eda14cbcSMatt Macy dsl_pool_config_exit(dp, FTAG);
4567eda14cbcSMatt Macy
4568eda14cbcSMatt Macy mutex_enter(&dp->dp_spa->spa_scrub_lock);
4569eda14cbcSMatt Macy scn->scn_prefetch_stop = B_TRUE;
4570eda14cbcSMatt Macy cv_broadcast(&spa->spa_scrub_io_cv);
4571eda14cbcSMatt Macy mutex_exit(&dp->dp_spa->spa_scrub_lock);
4572eda14cbcSMatt Macy
4573eda14cbcSMatt Macy taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
4574eda14cbcSMatt Macy (void) zio_wait(scn->scn_zio_root);
4575eda14cbcSMatt Macy scn->scn_zio_root = NULL;
4576eda14cbcSMatt Macy
457781b22a98SMartin Matuska zfs_dbgmsg("scan visited %llu blocks of %s in %llums "
4578eda14cbcSMatt Macy "(%llu os's, %llu holes, %llu < mintxg, "
4579eda14cbcSMatt Macy "%llu in ddt, %llu > maxtxg)",
4580eda14cbcSMatt Macy (longlong_t)scn->scn_visited_this_txg,
458181b22a98SMartin Matuska spa->spa_name,
4582eda14cbcSMatt Macy (longlong_t)NSEC2MSEC(gethrtime() -
4583eda14cbcSMatt Macy scn->scn_sync_start_time),
4584eda14cbcSMatt Macy (longlong_t)scn->scn_objsets_visited_this_txg,
4585eda14cbcSMatt Macy (longlong_t)scn->scn_holes_this_txg,
4586eda14cbcSMatt Macy (longlong_t)scn->scn_lt_min_this_txg,
4587eda14cbcSMatt Macy (longlong_t)scn->scn_ddt_contained_this_txg,
4588eda14cbcSMatt Macy (longlong_t)scn->scn_gt_max_this_txg);
4589eda14cbcSMatt Macy
4590eda14cbcSMatt Macy if (!scn->scn_suspending) {
4591eda14cbcSMatt Macy ASSERT0(avl_numnodes(&scn->scn_queue));
4592eda14cbcSMatt Macy scn->scn_done_txg = tx->tx_txg + 1;
4593eda14cbcSMatt Macy if (scn->scn_is_sorted) {
4594eda14cbcSMatt Macy scn->scn_checkpointing = B_TRUE;
4595eda14cbcSMatt Macy scn->scn_clearing = B_TRUE;
4596c9539b89SMartin Matuska scn->scn_issued_before_pass +=
4597c9539b89SMartin Matuska spa->spa_scan_pass_issued;
4598c9539b89SMartin Matuska spa_scan_stat_init(spa);
4599eda14cbcSMatt Macy }
460081b22a98SMartin Matuska zfs_dbgmsg("scan complete for %s txg %llu",
460181b22a98SMartin Matuska spa->spa_name,
4602eda14cbcSMatt Macy (longlong_t)tx->tx_txg);
4603eda14cbcSMatt Macy }
4604a0b956f5SMartin Matuska } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
4605eda14cbcSMatt Macy ASSERT(scn->scn_clearing);
4606eda14cbcSMatt Macy
4607eda14cbcSMatt Macy /* need to issue scrubbing IOs from per-vdev queues */
4608eda14cbcSMatt Macy scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
4609eda14cbcSMatt Macy NULL, ZIO_FLAG_CANFAIL);
4610eda14cbcSMatt Macy scan_io_queues_run(scn);
4611eda14cbcSMatt Macy (void) zio_wait(scn->scn_zio_root);
4612eda14cbcSMatt Macy scn->scn_zio_root = NULL;
4613eda14cbcSMatt Macy
4614eda14cbcSMatt Macy /* calculate and dprintf the current memory usage */
4615eda14cbcSMatt Macy (void) dsl_scan_should_clear(scn);
4616eda14cbcSMatt Macy dsl_scan_update_stats(scn);
4617eda14cbcSMatt Macy
461881b22a98SMartin Matuska zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) "
461981b22a98SMartin Matuska "in %llums (avg_block_size = %llu, avg_seg_size = %llu)",
4620eda14cbcSMatt Macy (longlong_t)scn->scn_zios_this_txg,
462181b22a98SMartin Matuska spa->spa_name,
4622eda14cbcSMatt Macy (longlong_t)scn->scn_segs_this_txg,
4623eda14cbcSMatt Macy (longlong_t)NSEC2MSEC(gethrtime() -
4624eda14cbcSMatt Macy scn->scn_sync_start_time),
4625eda14cbcSMatt Macy (longlong_t)scn->scn_avg_zio_size_this_txg,
4626eda14cbcSMatt Macy (longlong_t)scn->scn_avg_seg_size_this_txg);
4627eda14cbcSMatt Macy } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
4628eda14cbcSMatt Macy /* Finished with everything. Mark the scrub as complete */
462981b22a98SMartin Matuska zfs_dbgmsg("scan issuing complete txg %llu for %s",
463081b22a98SMartin Matuska (longlong_t)tx->tx_txg,
463181b22a98SMartin Matuska spa->spa_name);
4632eda14cbcSMatt Macy ASSERT3U(scn->scn_done_txg, !=, 0);
4633eda14cbcSMatt Macy ASSERT0(spa->spa_scrub_inflight);
4634a0b956f5SMartin Matuska ASSERT0(scn->scn_queues_pending);
4635eda14cbcSMatt Macy dsl_scan_done(scn, B_TRUE, tx);
4636eda14cbcSMatt Macy sync_type = SYNC_MANDATORY;
4637eda14cbcSMatt Macy }
4638eda14cbcSMatt Macy
4639eda14cbcSMatt Macy dsl_scan_sync_state(scn, tx, sync_type);
4640eda14cbcSMatt Macy }
4641eda14cbcSMatt Macy
4642eda14cbcSMatt Macy static void
count_block_issued(spa_t * spa,const blkptr_t * bp,boolean_t all)4643a0b956f5SMartin Matuska count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
4644eda14cbcSMatt Macy {
4645eda14cbcSMatt Macy /*
4646eda14cbcSMatt Macy * Don't count embedded bp's, since we already did the work of
4647eda14cbcSMatt Macy * scanning these when we scanned the containing block.
4648eda14cbcSMatt Macy */
4649eda14cbcSMatt Macy if (BP_IS_EMBEDDED(bp))
4650eda14cbcSMatt Macy return;
4651eda14cbcSMatt Macy
4652eda14cbcSMatt Macy /*
4653eda14cbcSMatt Macy * Update the spa's stats on how many bytes we have issued.
4654eda14cbcSMatt Macy * Sequential scrubs create a zio for each DVA of the bp. Each
4655eda14cbcSMatt Macy * of these will include all DVAs for repair purposes, but the
4656eda14cbcSMatt Macy * zio code will only try the first one unless there is an issue.
4657eda14cbcSMatt Macy * Therefore, we should only count the first DVA for these IOs.
4658eda14cbcSMatt Macy */
4659eda14cbcSMatt Macy atomic_add_64(&spa->spa_scan_pass_issued,
4660a0b956f5SMartin Matuska all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
4661eda14cbcSMatt Macy }
4662eda14cbcSMatt Macy
4663a0b956f5SMartin Matuska static void
count_block_skipped(dsl_scan_t * scn,const blkptr_t * bp,boolean_t all)46640a97523dSMartin Matuska count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
46650a97523dSMartin Matuska {
46660a97523dSMartin Matuska if (BP_IS_EMBEDDED(bp))
46670a97523dSMartin Matuska return;
46680a97523dSMartin Matuska atomic_add_64(&scn->scn_phys.scn_skipped,
46690a97523dSMartin Matuska all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
46700a97523dSMartin Matuska }
46710a97523dSMartin Matuska
46720a97523dSMartin Matuska static void
count_block(zfs_all_blkstats_t * zab,const blkptr_t * bp)4673a0b956f5SMartin Matuska count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
4674a0b956f5SMartin Matuska {
4675eda14cbcSMatt Macy /*
4676eda14cbcSMatt Macy * If we resume after a reboot, zab will be NULL; don't record
4677eda14cbcSMatt Macy * incomplete stats in that case.
4678eda14cbcSMatt Macy */
4679eda14cbcSMatt Macy if (zab == NULL)
4680eda14cbcSMatt Macy return;
4681eda14cbcSMatt Macy
4682a0b956f5SMartin Matuska for (int i = 0; i < 4; i++) {
4683eda14cbcSMatt Macy int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
4684eda14cbcSMatt Macy int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
4685eda14cbcSMatt Macy
4686eda14cbcSMatt Macy if (t & DMU_OT_NEWTYPE)
4687eda14cbcSMatt Macy t = DMU_OT_OTHER;
4688eda14cbcSMatt Macy zfs_blkstat_t *zb = &zab->zab_type[l][t];
4689eda14cbcSMatt Macy int equal;
4690eda14cbcSMatt Macy
4691eda14cbcSMatt Macy zb->zb_count++;
4692eda14cbcSMatt Macy zb->zb_asize += BP_GET_ASIZE(bp);
4693eda14cbcSMatt Macy zb->zb_lsize += BP_GET_LSIZE(bp);
4694eda14cbcSMatt Macy zb->zb_psize += BP_GET_PSIZE(bp);
4695eda14cbcSMatt Macy zb->zb_gangs += BP_COUNT_GANG(bp);
4696eda14cbcSMatt Macy
4697eda14cbcSMatt Macy switch (BP_GET_NDVAS(bp)) {
4698eda14cbcSMatt Macy case 2:
4699eda14cbcSMatt Macy if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
4700eda14cbcSMatt Macy DVA_GET_VDEV(&bp->blk_dva[1]))
4701eda14cbcSMatt Macy zb->zb_ditto_2_of_2_samevdev++;
4702eda14cbcSMatt Macy break;
4703eda14cbcSMatt Macy case 3:
4704eda14cbcSMatt Macy equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
4705eda14cbcSMatt Macy DVA_GET_VDEV(&bp->blk_dva[1])) +
4706eda14cbcSMatt Macy (DVA_GET_VDEV(&bp->blk_dva[0]) ==
4707eda14cbcSMatt Macy DVA_GET_VDEV(&bp->blk_dva[2])) +
4708eda14cbcSMatt Macy (DVA_GET_VDEV(&bp->blk_dva[1]) ==
4709eda14cbcSMatt Macy DVA_GET_VDEV(&bp->blk_dva[2]));
4710eda14cbcSMatt Macy if (equal == 1)
4711eda14cbcSMatt Macy zb->zb_ditto_2_of_3_samevdev++;
4712eda14cbcSMatt Macy else if (equal == 3)
4713eda14cbcSMatt Macy zb->zb_ditto_3_of_3_samevdev++;
4714eda14cbcSMatt Macy break;
4715eda14cbcSMatt Macy }
4716eda14cbcSMatt Macy }
4717eda14cbcSMatt Macy }
4718eda14cbcSMatt Macy
4719eda14cbcSMatt Macy static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t * queue,scan_io_t * sio)4720eda14cbcSMatt Macy scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
4721eda14cbcSMatt Macy {
4722eda14cbcSMatt Macy avl_index_t idx;
4723eda14cbcSMatt Macy dsl_scan_t *scn = queue->q_scn;
4724eda14cbcSMatt Macy
4725eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
4726eda14cbcSMatt Macy
4727a0b956f5SMartin Matuska if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
4728a0b956f5SMartin Matuska atomic_add_64(&scn->scn_queues_pending, 1);
4729eda14cbcSMatt Macy if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
4730eda14cbcSMatt Macy /* block is already scheduled for reading */
4731eda14cbcSMatt Macy sio_free(sio);
4732eda14cbcSMatt Macy return;
4733eda14cbcSMatt Macy }
4734eda14cbcSMatt Macy avl_insert(&queue->q_sios_by_addr, sio, idx);
4735eda14cbcSMatt Macy queue->q_sio_memused += SIO_GET_MUSED(sio);
4736b59a0cdeSMartin Matuska zfs_range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
4737a0b956f5SMartin Matuska SIO_GET_ASIZE(sio));
4738eda14cbcSMatt Macy }
4739eda14cbcSMatt Macy
4740eda14cbcSMatt Macy /*
4741eda14cbcSMatt Macy * Given all the info we got from our metadata scanning process, we
4742eda14cbcSMatt Macy * construct a scan_io_t and insert it into the scan sorting queue. The
4743eda14cbcSMatt Macy * I/O must already be suitable for us to process. This is controlled
4744eda14cbcSMatt Macy * by dsl_scan_enqueue().
4745eda14cbcSMatt Macy */
4746eda14cbcSMatt Macy static void
scan_io_queue_insert(dsl_scan_io_queue_t * queue,const blkptr_t * bp,int dva_i,int zio_flags,const zbookmark_phys_t * zb)4747eda14cbcSMatt Macy scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
4748eda14cbcSMatt Macy int zio_flags, const zbookmark_phys_t *zb)
4749eda14cbcSMatt Macy {
4750eda14cbcSMatt Macy scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
4751eda14cbcSMatt Macy
4752eda14cbcSMatt Macy ASSERT0(BP_IS_GANG(bp));
4753eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
4754eda14cbcSMatt Macy
4755eda14cbcSMatt Macy bp2sio(bp, sio, dva_i);
4756eda14cbcSMatt Macy sio->sio_flags = zio_flags;
4757eda14cbcSMatt Macy sio->sio_zb = *zb;
4758eda14cbcSMatt Macy
4759a0b956f5SMartin Matuska queue->q_last_ext_addr = -1;
4760eda14cbcSMatt Macy scan_io_queue_insert_impl(queue, sio);
4761eda14cbcSMatt Macy }
4762eda14cbcSMatt Macy
4763eda14cbcSMatt Macy /*
4764eda14cbcSMatt Macy * Given a set of I/O parameters as discovered by the metadata traversal
4765eda14cbcSMatt Macy * process, attempts to place the I/O into the sorted queues (if allowed),
4766eda14cbcSMatt Macy * or immediately executes the I/O.
4767eda14cbcSMatt Macy */
4768eda14cbcSMatt Macy static void
dsl_scan_enqueue(dsl_pool_t * dp,const blkptr_t * bp,int zio_flags,const zbookmark_phys_t * zb)4769eda14cbcSMatt Macy dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
4770eda14cbcSMatt Macy const zbookmark_phys_t *zb)
4771eda14cbcSMatt Macy {
4772eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
4773eda14cbcSMatt Macy
4774eda14cbcSMatt Macy ASSERT(!BP_IS_EMBEDDED(bp));
4775eda14cbcSMatt Macy
4776eda14cbcSMatt Macy /*
4777eda14cbcSMatt Macy * Gang blocks are hard to issue sequentially, so we just issue them
4778eda14cbcSMatt Macy * here immediately instead of queuing them.
4779eda14cbcSMatt Macy */
4780eda14cbcSMatt Macy if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
4781eda14cbcSMatt Macy scan_exec_io(dp, bp, zio_flags, zb, NULL);
4782eda14cbcSMatt Macy return;
4783eda14cbcSMatt Macy }
4784eda14cbcSMatt Macy
4785eda14cbcSMatt Macy for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4786eda14cbcSMatt Macy dva_t dva;
4787eda14cbcSMatt Macy vdev_t *vdev;
4788eda14cbcSMatt Macy
4789eda14cbcSMatt Macy dva = bp->blk_dva[i];
4790eda14cbcSMatt Macy vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
4791eda14cbcSMatt Macy ASSERT(vdev != NULL);
4792eda14cbcSMatt Macy
4793eda14cbcSMatt Macy mutex_enter(&vdev->vdev_scan_io_queue_lock);
4794eda14cbcSMatt Macy if (vdev->vdev_scan_io_queue == NULL)
4795eda14cbcSMatt Macy vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
4796eda14cbcSMatt Macy ASSERT(dp->dp_scan != NULL);
4797eda14cbcSMatt Macy scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
4798eda14cbcSMatt Macy i, zio_flags, zb);
4799eda14cbcSMatt Macy mutex_exit(&vdev->vdev_scan_io_queue_lock);
4800eda14cbcSMatt Macy }
4801eda14cbcSMatt Macy }
4802eda14cbcSMatt Macy
4803eda14cbcSMatt Macy static int
dsl_scan_scrub_cb(dsl_pool_t * dp,const blkptr_t * bp,const zbookmark_phys_t * zb)4804eda14cbcSMatt Macy dsl_scan_scrub_cb(dsl_pool_t *dp,
4805eda14cbcSMatt Macy const blkptr_t *bp, const zbookmark_phys_t *zb)
4806eda14cbcSMatt Macy {
4807eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
4808eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
4809783d3ff6SMartin Matuska uint64_t phys_birth = BP_GET_BIRTH(bp);
4810eda14cbcSMatt Macy size_t psize = BP_GET_PSIZE(bp);
4811eda14cbcSMatt Macy boolean_t needs_io = B_FALSE;
4812eda14cbcSMatt Macy int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
4813eda14cbcSMatt Macy
4814a0b956f5SMartin Matuska count_block(dp->dp_blkstats, bp);
4815eda14cbcSMatt Macy if (phys_birth <= scn->scn_phys.scn_min_txg ||
4816eda14cbcSMatt Macy phys_birth >= scn->scn_phys.scn_max_txg) {
48170a97523dSMartin Matuska count_block_skipped(scn, bp, B_TRUE);
4818eda14cbcSMatt Macy return (0);
4819eda14cbcSMatt Macy }
4820eda14cbcSMatt Macy
4821eda14cbcSMatt Macy /* Embedded BP's have phys_birth==0, so we reject them above. */
4822eda14cbcSMatt Macy ASSERT(!BP_IS_EMBEDDED(bp));
4823eda14cbcSMatt Macy
4824eda14cbcSMatt Macy ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
4825eda14cbcSMatt Macy if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
4826eda14cbcSMatt Macy zio_flags |= ZIO_FLAG_SCRUB;
4827eda14cbcSMatt Macy needs_io = B_TRUE;
4828eda14cbcSMatt Macy } else {
4829eda14cbcSMatt Macy ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
4830eda14cbcSMatt Macy zio_flags |= ZIO_FLAG_RESILVER;
4831eda14cbcSMatt Macy needs_io = B_FALSE;
4832eda14cbcSMatt Macy }
4833eda14cbcSMatt Macy
4834eda14cbcSMatt Macy /* If it's an intent log block, failure is expected. */
4835eda14cbcSMatt Macy if (zb->zb_level == ZB_ZIL_LEVEL)
4836eda14cbcSMatt Macy zio_flags |= ZIO_FLAG_SPECULATIVE;
4837eda14cbcSMatt Macy
4838eda14cbcSMatt Macy for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
4839eda14cbcSMatt Macy const dva_t *dva = &bp->blk_dva[d];
4840eda14cbcSMatt Macy
4841eda14cbcSMatt Macy /*
4842eda14cbcSMatt Macy * Keep track of how much data we've examined so that
48437877fdebSMatt Macy * zpool(8) status can make useful progress reports.
4844eda14cbcSMatt Macy */
4845a0b956f5SMartin Matuska uint64_t asize = DVA_GET_ASIZE(dva);
4846a0b956f5SMartin Matuska scn->scn_phys.scn_examined += asize;
4847a0b956f5SMartin Matuska spa->spa_scan_pass_exam += asize;
4848eda14cbcSMatt Macy
4849eda14cbcSMatt Macy /* if it's a resilver, this may not be in the target range */
4850eda14cbcSMatt Macy if (!needs_io)
4851eda14cbcSMatt Macy needs_io = dsl_scan_need_resilver(spa, dva, psize,
4852eda14cbcSMatt Macy phys_birth);
4853eda14cbcSMatt Macy }
4854eda14cbcSMatt Macy
4855eda14cbcSMatt Macy if (needs_io && !zfs_no_scrub_io) {
4856eda14cbcSMatt Macy dsl_scan_enqueue(dp, bp, zio_flags, zb);
4857eda14cbcSMatt Macy } else {
48580a97523dSMartin Matuska count_block_skipped(scn, bp, B_TRUE);
4859eda14cbcSMatt Macy }
4860eda14cbcSMatt Macy
4861eda14cbcSMatt Macy /* do not relocate this block */
4862eda14cbcSMatt Macy return (0);
4863eda14cbcSMatt Macy }
4864eda14cbcSMatt Macy
4865eda14cbcSMatt Macy static void
dsl_scan_scrub_done(zio_t * zio)4866eda14cbcSMatt Macy dsl_scan_scrub_done(zio_t *zio)
4867eda14cbcSMatt Macy {
4868eda14cbcSMatt Macy spa_t *spa = zio->io_spa;
4869eda14cbcSMatt Macy blkptr_t *bp = zio->io_bp;
4870eda14cbcSMatt Macy dsl_scan_io_queue_t *queue = zio->io_private;
4871eda14cbcSMatt Macy
4872eda14cbcSMatt Macy abd_free(zio->io_abd);
4873eda14cbcSMatt Macy
4874eda14cbcSMatt Macy if (queue == NULL) {
4875eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
4876eda14cbcSMatt Macy ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
4877eda14cbcSMatt Macy spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
4878eda14cbcSMatt Macy cv_broadcast(&spa->spa_scrub_io_cv);
4879eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
4880eda14cbcSMatt Macy } else {
4881eda14cbcSMatt Macy mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
4882eda14cbcSMatt Macy ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
4883eda14cbcSMatt Macy queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
4884eda14cbcSMatt Macy cv_broadcast(&queue->q_zio_cv);
4885eda14cbcSMatt Macy mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
4886eda14cbcSMatt Macy }
4887eda14cbcSMatt Macy
4888eda14cbcSMatt Macy if (zio->io_error && (zio->io_error != ECKSUM ||
4889eda14cbcSMatt Macy !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
4890c0a83fe0SMartin Matuska if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
4891c0a83fe0SMartin Matuska !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
4892c0a83fe0SMartin Matuska atomic_inc_64(&spa->spa_dsl_pool->dp_scan
4893c0a83fe0SMartin Matuska ->errorscrub_phys.dep_errors);
4894c0a83fe0SMartin Matuska } else {
4895c0a83fe0SMartin Matuska atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
4896c0a83fe0SMartin Matuska .scn_errors);
4897c0a83fe0SMartin Matuska }
4898eda14cbcSMatt Macy }
4899eda14cbcSMatt Macy }
4900eda14cbcSMatt Macy
4901eda14cbcSMatt Macy /*
4902eda14cbcSMatt Macy * Given a scanning zio's information, executes the zio. The zio need
4903eda14cbcSMatt Macy * not necessarily be only sortable, this function simply executes the
4904eda14cbcSMatt Macy * zio, no matter what it is. The optional queue argument allows the
4905eda14cbcSMatt Macy * caller to specify that they want per top level vdev IO rate limiting
4906eda14cbcSMatt Macy * instead of the legacy global limiting.
4907eda14cbcSMatt Macy */
4908eda14cbcSMatt Macy static void
scan_exec_io(dsl_pool_t * dp,const blkptr_t * bp,int zio_flags,const zbookmark_phys_t * zb,dsl_scan_io_queue_t * queue)4909eda14cbcSMatt Macy scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
4910eda14cbcSMatt Macy const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
4911eda14cbcSMatt Macy {
4912eda14cbcSMatt Macy spa_t *spa = dp->dp_spa;
4913eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
4914eda14cbcSMatt Macy size_t size = BP_GET_PSIZE(bp);
4915eda14cbcSMatt Macy abd_t *data = abd_alloc_for_io(size, B_FALSE);
49161f1e2261SMartin Matuska zio_t *pio;
4917eda14cbcSMatt Macy
4918eda14cbcSMatt Macy if (queue == NULL) {
491916038816SMartin Matuska ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
4920eda14cbcSMatt Macy mutex_enter(&spa->spa_scrub_lock);
4921eda14cbcSMatt Macy while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
4922eda14cbcSMatt Macy cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
4923eda14cbcSMatt Macy spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
4924eda14cbcSMatt Macy mutex_exit(&spa->spa_scrub_lock);
49251f1e2261SMartin Matuska pio = scn->scn_zio_root;
4926eda14cbcSMatt Macy } else {
4927eda14cbcSMatt Macy kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
4928eda14cbcSMatt Macy
492916038816SMartin Matuska ASSERT3U(queue->q_maxinflight_bytes, >, 0);
4930eda14cbcSMatt Macy mutex_enter(q_lock);
4931eda14cbcSMatt Macy while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
4932eda14cbcSMatt Macy cv_wait(&queue->q_zio_cv, q_lock);
4933eda14cbcSMatt Macy queue->q_inflight_bytes += BP_GET_PSIZE(bp);
49341f1e2261SMartin Matuska pio = queue->q_zio;
4935eda14cbcSMatt Macy mutex_exit(q_lock);
4936eda14cbcSMatt Macy }
4937eda14cbcSMatt Macy
49381f1e2261SMartin Matuska ASSERT(pio != NULL);
4939a0b956f5SMartin Matuska count_block_issued(spa, bp, queue == NULL);
49401f1e2261SMartin Matuska zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
49411f1e2261SMartin Matuska queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
4942eda14cbcSMatt Macy }
4943eda14cbcSMatt Macy
4944eda14cbcSMatt Macy /*
4945eda14cbcSMatt Macy * This is the primary extent sorting algorithm. We balance two parameters:
4946eda14cbcSMatt Macy * 1) how many bytes of I/O are in an extent
4947eda14cbcSMatt Macy * 2) how well the extent is filled with I/O (as a fraction of its total size)
4948eda14cbcSMatt Macy * Since we allow extents to have gaps between their constituent I/Os, it's
4949eda14cbcSMatt Macy * possible to have a fairly large extent that contains the same amount of
4950eda14cbcSMatt Macy * I/O bytes than a much smaller extent, which just packs the I/O more tightly.
4951eda14cbcSMatt Macy * The algorithm sorts based on a score calculated from the extent's size,
4952eda14cbcSMatt Macy * the relative fill volume (in %) and a "fill weight" parameter that controls
4953eda14cbcSMatt Macy * the split between whether we prefer larger extents or more well populated
4954eda14cbcSMatt Macy * extents:
4955eda14cbcSMatt Macy *
4956eda14cbcSMatt Macy * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
4957eda14cbcSMatt Macy *
4958eda14cbcSMatt Macy * Example:
4959eda14cbcSMatt Macy * 1) assume extsz = 64 MiB
4960eda14cbcSMatt Macy * 2) assume fill = 32 MiB (extent is half full)
4961eda14cbcSMatt Macy * 3) assume fill_weight = 3
4962eda14cbcSMatt Macy * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
4963eda14cbcSMatt Macy * SCORE = 32M + (50 * 3 * 32M) / 100
4964eda14cbcSMatt Macy * SCORE = 32M + (4800M / 100)
4965eda14cbcSMatt Macy * SCORE = 32M + 48M
4966eda14cbcSMatt Macy * ^ ^
4967eda14cbcSMatt Macy * | +--- final total relative fill-based score
4968eda14cbcSMatt Macy * +--------- final total fill-based score
4969eda14cbcSMatt Macy * SCORE = 80M
4970eda14cbcSMatt Macy *
4971eda14cbcSMatt Macy * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards
4972eda14cbcSMatt Macy * extents that are more completely filled (in a 3:2 ratio) vs just larger.
4973eda14cbcSMatt Macy * Note that as an optimization, we replace multiplication and division by
4974eda14cbcSMatt Macy * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
4975a0b956f5SMartin Matuska *
4976a0b956f5SMartin Matuska * Since we do not care if one extent is only few percent better than another,
4977a0b956f5SMartin Matuska * compress the score into 6 bits via binary logarithm AKA highbit64() and
4978a0b956f5SMartin Matuska * put into otherwise unused due to ashift high bits of offset. This allows
4979a0b956f5SMartin Matuska * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them
4980a0b956f5SMartin Matuska * with single operation. Plus it makes scrubs more sequential and reduces
4981a0b956f5SMartin Matuska * chances that minor extent change move it within the B-tree.
4982eda14cbcSMatt Macy */
49834e8d558cSMartin Matuska __attribute__((always_inline)) inline
4984eda14cbcSMatt Macy static int
ext_size_compare(const void * x,const void * y)4985eda14cbcSMatt Macy ext_size_compare(const void *x, const void *y)
4986eda14cbcSMatt Macy {
4987a0b956f5SMartin Matuska const uint64_t *a = x, *b = y;
4988eda14cbcSMatt Macy
4989a0b956f5SMartin Matuska return (TREE_CMP(*a, *b));
4990eda14cbcSMatt Macy }
4991a0b956f5SMartin Matuska
ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf,uint64_t,ext_size_compare)49924e8d558cSMartin Matuska ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
49934e8d558cSMartin Matuska ext_size_compare)
49944e8d558cSMartin Matuska
4995a0b956f5SMartin Matuska static void
4996b59a0cdeSMartin Matuska ext_size_create(zfs_range_tree_t *rt, void *arg)
4997a0b956f5SMartin Matuska {
4998a0b956f5SMartin Matuska (void) rt;
4999a0b956f5SMartin Matuska zfs_btree_t *size_tree = arg;
5000a0b956f5SMartin Matuska
50014e8d558cSMartin Matuska zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
50024e8d558cSMartin Matuska sizeof (uint64_t));
5003eda14cbcSMatt Macy }
5004eda14cbcSMatt Macy
5005a0b956f5SMartin Matuska static void
ext_size_destroy(zfs_range_tree_t * rt,void * arg)5006b59a0cdeSMartin Matuska ext_size_destroy(zfs_range_tree_t *rt, void *arg)
5007a0b956f5SMartin Matuska {
5008a0b956f5SMartin Matuska (void) rt;
5009a0b956f5SMartin Matuska zfs_btree_t *size_tree = arg;
5010a0b956f5SMartin Matuska ASSERT0(zfs_btree_numnodes(size_tree));
5011a0b956f5SMartin Matuska
5012a0b956f5SMartin Matuska zfs_btree_destroy(size_tree);
5013a0b956f5SMartin Matuska }
5014a0b956f5SMartin Matuska
5015a0b956f5SMartin Matuska static uint64_t
ext_size_value(zfs_range_tree_t * rt,zfs_range_seg_gap_t * rsg)5016b59a0cdeSMartin Matuska ext_size_value(zfs_range_tree_t *rt, zfs_range_seg_gap_t *rsg)
5017a0b956f5SMartin Matuska {
5018a0b956f5SMartin Matuska (void) rt;
5019a0b956f5SMartin Matuska uint64_t size = rsg->rs_end - rsg->rs_start;
5020a0b956f5SMartin Matuska uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
5021a0b956f5SMartin Matuska fill_weight * rsg->rs_fill) >> 7);
5022a0b956f5SMartin Matuska ASSERT3U(rt->rt_shift, >=, 8);
5023a0b956f5SMartin Matuska return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
5024a0b956f5SMartin Matuska }
5025a0b956f5SMartin Matuska
5026a0b956f5SMartin Matuska static void
ext_size_add(zfs_range_tree_t * rt,zfs_range_seg_t * rs,void * arg)5027b59a0cdeSMartin Matuska ext_size_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
5028a0b956f5SMartin Matuska {
5029a0b956f5SMartin Matuska zfs_btree_t *size_tree = arg;
5030b59a0cdeSMartin Matuska ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP);
5031b59a0cdeSMartin Matuska uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs);
5032a0b956f5SMartin Matuska zfs_btree_add(size_tree, &v);
5033a0b956f5SMartin Matuska }
5034a0b956f5SMartin Matuska
5035a0b956f5SMartin Matuska static void
ext_size_remove(zfs_range_tree_t * rt,zfs_range_seg_t * rs,void * arg)5036b59a0cdeSMartin Matuska ext_size_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
5037a0b956f5SMartin Matuska {
5038a0b956f5SMartin Matuska zfs_btree_t *size_tree = arg;
5039b59a0cdeSMartin Matuska ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP);
5040b59a0cdeSMartin Matuska uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs);
5041a0b956f5SMartin Matuska zfs_btree_remove(size_tree, &v);
5042a0b956f5SMartin Matuska }
5043a0b956f5SMartin Matuska
5044a0b956f5SMartin Matuska static void
ext_size_vacate(zfs_range_tree_t * rt,void * arg)5045b59a0cdeSMartin Matuska ext_size_vacate(zfs_range_tree_t *rt, void *arg)
5046a0b956f5SMartin Matuska {
5047a0b956f5SMartin Matuska zfs_btree_t *size_tree = arg;
5048a0b956f5SMartin Matuska zfs_btree_clear(size_tree);
5049a0b956f5SMartin Matuska zfs_btree_destroy(size_tree);
5050a0b956f5SMartin Matuska
5051a0b956f5SMartin Matuska ext_size_create(rt, arg);
5052a0b956f5SMartin Matuska }
5053a0b956f5SMartin Matuska
5054b59a0cdeSMartin Matuska static const zfs_range_tree_ops_t ext_size_ops = {
5055a0b956f5SMartin Matuska .rtop_create = ext_size_create,
5056a0b956f5SMartin Matuska .rtop_destroy = ext_size_destroy,
5057a0b956f5SMartin Matuska .rtop_add = ext_size_add,
5058a0b956f5SMartin Matuska .rtop_remove = ext_size_remove,
5059a0b956f5SMartin Matuska .rtop_vacate = ext_size_vacate
5060a0b956f5SMartin Matuska };
5061a0b956f5SMartin Matuska
5062eda14cbcSMatt Macy /*
5063eda14cbcSMatt Macy * Comparator for the q_sios_by_addr tree. Sorting is simply performed
5064eda14cbcSMatt Macy * based on LBA-order (from lowest to highest).
5065eda14cbcSMatt Macy */
5066eda14cbcSMatt Macy static int
sio_addr_compare(const void * x,const void * y)5067eda14cbcSMatt Macy sio_addr_compare(const void *x, const void *y)
5068eda14cbcSMatt Macy {
5069eda14cbcSMatt Macy const scan_io_t *a = x, *b = y;
5070eda14cbcSMatt Macy
5071eda14cbcSMatt Macy return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
5072eda14cbcSMatt Macy }
5073eda14cbcSMatt Macy
5074eda14cbcSMatt Macy /* IO queues are created on demand when they are needed. */
5075eda14cbcSMatt Macy static dsl_scan_io_queue_t *
scan_io_queue_create(vdev_t * vd)5076eda14cbcSMatt Macy scan_io_queue_create(vdev_t *vd)
5077eda14cbcSMatt Macy {
5078eda14cbcSMatt Macy dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
5079eda14cbcSMatt Macy dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
5080eda14cbcSMatt Macy
5081eda14cbcSMatt Macy q->q_scn = scn;
5082eda14cbcSMatt Macy q->q_vd = vd;
5083eda14cbcSMatt Macy q->q_sio_memused = 0;
5084a0b956f5SMartin Matuska q->q_last_ext_addr = -1;
5085eda14cbcSMatt Macy cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
5086b59a0cdeSMartin Matuska q->q_exts_by_addr = zfs_range_tree_create_gap(&ext_size_ops,
5087b59a0cdeSMartin Matuska ZFS_RANGE_SEG_GAP, &q->q_exts_by_size, 0, vd->vdev_ashift,
5088b59a0cdeSMartin Matuska zfs_scan_max_ext_gap);
5089eda14cbcSMatt Macy avl_create(&q->q_sios_by_addr, sio_addr_compare,
5090eda14cbcSMatt Macy sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
5091eda14cbcSMatt Macy
5092eda14cbcSMatt Macy return (q);
5093eda14cbcSMatt Macy }
5094eda14cbcSMatt Macy
5095eda14cbcSMatt Macy /*
5096eda14cbcSMatt Macy * Destroys a scan queue and all segments and scan_io_t's contained in it.
5097eda14cbcSMatt Macy * No further execution of I/O occurs, anything pending in the queue is
5098eda14cbcSMatt Macy * simply freed without being executed.
5099eda14cbcSMatt Macy */
5100eda14cbcSMatt Macy void
dsl_scan_io_queue_destroy(dsl_scan_io_queue_t * queue)5101eda14cbcSMatt Macy dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
5102eda14cbcSMatt Macy {
5103eda14cbcSMatt Macy dsl_scan_t *scn = queue->q_scn;
5104eda14cbcSMatt Macy scan_io_t *sio;
5105eda14cbcSMatt Macy void *cookie = NULL;
5106eda14cbcSMatt Macy
5107eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
5108eda14cbcSMatt Macy
5109a0b956f5SMartin Matuska if (!avl_is_empty(&queue->q_sios_by_addr))
5110a0b956f5SMartin Matuska atomic_add_64(&scn->scn_queues_pending, -1);
5111eda14cbcSMatt Macy while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
5112eda14cbcSMatt Macy NULL) {
5113b59a0cdeSMartin Matuska ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr,
5114eda14cbcSMatt Macy SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
5115eda14cbcSMatt Macy queue->q_sio_memused -= SIO_GET_MUSED(sio);
5116eda14cbcSMatt Macy sio_free(sio);
5117eda14cbcSMatt Macy }
5118eda14cbcSMatt Macy
5119eda14cbcSMatt Macy ASSERT0(queue->q_sio_memused);
5120b59a0cdeSMartin Matuska zfs_range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
5121b59a0cdeSMartin Matuska zfs_range_tree_destroy(queue->q_exts_by_addr);
5122eda14cbcSMatt Macy avl_destroy(&queue->q_sios_by_addr);
5123eda14cbcSMatt Macy cv_destroy(&queue->q_zio_cv);
5124eda14cbcSMatt Macy
5125eda14cbcSMatt Macy kmem_free(queue, sizeof (*queue));
5126eda14cbcSMatt Macy }
5127eda14cbcSMatt Macy
5128eda14cbcSMatt Macy /*
5129eda14cbcSMatt Macy * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is
5130eda14cbcSMatt Macy * called on behalf of vdev_top_transfer when creating or destroying
5131eda14cbcSMatt Macy * a mirror vdev due to zpool attach/detach.
5132eda14cbcSMatt Macy */
5133eda14cbcSMatt Macy void
dsl_scan_io_queue_vdev_xfer(vdev_t * svd,vdev_t * tvd)5134eda14cbcSMatt Macy dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
5135eda14cbcSMatt Macy {
5136eda14cbcSMatt Macy mutex_enter(&svd->vdev_scan_io_queue_lock);
5137eda14cbcSMatt Macy mutex_enter(&tvd->vdev_scan_io_queue_lock);
5138eda14cbcSMatt Macy
5139eda14cbcSMatt Macy VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
5140eda14cbcSMatt Macy tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
5141eda14cbcSMatt Macy svd->vdev_scan_io_queue = NULL;
5142eda14cbcSMatt Macy if (tvd->vdev_scan_io_queue != NULL)
5143eda14cbcSMatt Macy tvd->vdev_scan_io_queue->q_vd = tvd;
5144eda14cbcSMatt Macy
5145eda14cbcSMatt Macy mutex_exit(&tvd->vdev_scan_io_queue_lock);
5146eda14cbcSMatt Macy mutex_exit(&svd->vdev_scan_io_queue_lock);
5147eda14cbcSMatt Macy }
5148eda14cbcSMatt Macy
5149eda14cbcSMatt Macy static void
scan_io_queues_destroy(dsl_scan_t * scn)5150eda14cbcSMatt Macy scan_io_queues_destroy(dsl_scan_t *scn)
5151eda14cbcSMatt Macy {
5152eda14cbcSMatt Macy vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
5153eda14cbcSMatt Macy
5154eda14cbcSMatt Macy for (uint64_t i = 0; i < rvd->vdev_children; i++) {
5155eda14cbcSMatt Macy vdev_t *tvd = rvd->vdev_child[i];
5156eda14cbcSMatt Macy
5157eda14cbcSMatt Macy mutex_enter(&tvd->vdev_scan_io_queue_lock);
5158eda14cbcSMatt Macy if (tvd->vdev_scan_io_queue != NULL)
5159eda14cbcSMatt Macy dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
5160eda14cbcSMatt Macy tvd->vdev_scan_io_queue = NULL;
5161eda14cbcSMatt Macy mutex_exit(&tvd->vdev_scan_io_queue_lock);
5162eda14cbcSMatt Macy }
5163eda14cbcSMatt Macy }
5164eda14cbcSMatt Macy
5165eda14cbcSMatt Macy static void
dsl_scan_freed_dva(spa_t * spa,const blkptr_t * bp,int dva_i)5166eda14cbcSMatt Macy dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
5167eda14cbcSMatt Macy {
5168eda14cbcSMatt Macy dsl_pool_t *dp = spa->spa_dsl_pool;
5169eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
5170eda14cbcSMatt Macy vdev_t *vdev;
5171eda14cbcSMatt Macy kmutex_t *q_lock;
5172eda14cbcSMatt Macy dsl_scan_io_queue_t *queue;
5173eda14cbcSMatt Macy scan_io_t *srch_sio, *sio;
5174eda14cbcSMatt Macy avl_index_t idx;
5175eda14cbcSMatt Macy uint64_t start, size;
5176eda14cbcSMatt Macy
5177eda14cbcSMatt Macy vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
5178eda14cbcSMatt Macy ASSERT(vdev != NULL);
5179eda14cbcSMatt Macy q_lock = &vdev->vdev_scan_io_queue_lock;
5180eda14cbcSMatt Macy queue = vdev->vdev_scan_io_queue;
5181eda14cbcSMatt Macy
5182eda14cbcSMatt Macy mutex_enter(q_lock);
5183eda14cbcSMatt Macy if (queue == NULL) {
5184eda14cbcSMatt Macy mutex_exit(q_lock);
5185eda14cbcSMatt Macy return;
5186eda14cbcSMatt Macy }
5187eda14cbcSMatt Macy
5188eda14cbcSMatt Macy srch_sio = sio_alloc(BP_GET_NDVAS(bp));
5189eda14cbcSMatt Macy bp2sio(bp, srch_sio, dva_i);
5190eda14cbcSMatt Macy start = SIO_GET_OFFSET(srch_sio);
5191eda14cbcSMatt Macy size = SIO_GET_ASIZE(srch_sio);
5192eda14cbcSMatt Macy
5193eda14cbcSMatt Macy /*
5194eda14cbcSMatt Macy * We can find the zio in two states:
5195eda14cbcSMatt Macy * 1) Cold, just sitting in the queue of zio's to be issued at
5196eda14cbcSMatt Macy * some point in the future. In this case, all we do is
5197eda14cbcSMatt Macy * remove the zio from the q_sios_by_addr tree, decrement
5198b59a0cdeSMartin Matuska * its data volume from the containing zfs_range_seg_t and
5199eda14cbcSMatt Macy * resort the q_exts_by_size tree to reflect that the
5200b59a0cdeSMartin Matuska * zfs_range_seg_t has lost some of its 'fill'. We don't shorten
5201b59a0cdeSMartin Matuska * the zfs_range_seg_t - this is usually rare enough not to be
5202eda14cbcSMatt Macy * worth the extra hassle of trying keep track of precise
5203eda14cbcSMatt Macy * extent boundaries.
5204eda14cbcSMatt Macy * 2) Hot, where the zio is currently in-flight in
5205eda14cbcSMatt Macy * dsl_scan_issue_ios. In this case, we can't simply
5206eda14cbcSMatt Macy * reach in and stop the in-flight zio's, so we instead
5207eda14cbcSMatt Macy * block the caller. Eventually, dsl_scan_issue_ios will
5208eda14cbcSMatt Macy * be done with issuing the zio's it gathered and will
5209eda14cbcSMatt Macy * signal us.
5210eda14cbcSMatt Macy */
5211eda14cbcSMatt Macy sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
5212eda14cbcSMatt Macy sio_free(srch_sio);
5213eda14cbcSMatt Macy
5214eda14cbcSMatt Macy if (sio != NULL) {
5215eda14cbcSMatt Macy blkptr_t tmpbp;
5216eda14cbcSMatt Macy
5217eda14cbcSMatt Macy /* Got it while it was cold in the queue */
5218eda14cbcSMatt Macy ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
5219a0b956f5SMartin Matuska ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
5220eda14cbcSMatt Macy avl_remove(&queue->q_sios_by_addr, sio);
5221a0b956f5SMartin Matuska if (avl_is_empty(&queue->q_sios_by_addr))
5222a0b956f5SMartin Matuska atomic_add_64(&scn->scn_queues_pending, -1);
5223eda14cbcSMatt Macy queue->q_sio_memused -= SIO_GET_MUSED(sio);
5224eda14cbcSMatt Macy
5225b59a0cdeSMartin Matuska ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, start,
5226b59a0cdeSMartin Matuska size));
5227b59a0cdeSMartin Matuska zfs_range_tree_remove_fill(queue->q_exts_by_addr, start, size);
5228eda14cbcSMatt Macy
52290a97523dSMartin Matuska /* count the block as though we skipped it */
5230eda14cbcSMatt Macy sio2bp(sio, &tmpbp);
52310a97523dSMartin Matuska count_block_skipped(scn, &tmpbp, B_FALSE);
5232eda14cbcSMatt Macy
5233eda14cbcSMatt Macy sio_free(sio);
5234eda14cbcSMatt Macy }
5235eda14cbcSMatt Macy mutex_exit(q_lock);
5236eda14cbcSMatt Macy }
5237eda14cbcSMatt Macy
5238eda14cbcSMatt Macy /*
5239eda14cbcSMatt Macy * Callback invoked when a zio_free() zio is executing. This needs to be
5240eda14cbcSMatt Macy * intercepted to prevent the zio from deallocating a particular portion
5241eda14cbcSMatt Macy * of disk space and it then getting reallocated and written to, while we
5242eda14cbcSMatt Macy * still have it queued up for processing.
5243eda14cbcSMatt Macy */
5244eda14cbcSMatt Macy void
dsl_scan_freed(spa_t * spa,const blkptr_t * bp)5245eda14cbcSMatt Macy dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
5246eda14cbcSMatt Macy {
5247eda14cbcSMatt Macy dsl_pool_t *dp = spa->spa_dsl_pool;
5248eda14cbcSMatt Macy dsl_scan_t *scn = dp->dp_scan;
5249eda14cbcSMatt Macy
5250eda14cbcSMatt Macy ASSERT(!BP_IS_EMBEDDED(bp));
5251eda14cbcSMatt Macy ASSERT(scn != NULL);
5252eda14cbcSMatt Macy if (!dsl_scan_is_running(scn))
5253eda14cbcSMatt Macy return;
5254eda14cbcSMatt Macy
5255eda14cbcSMatt Macy for (int i = 0; i < BP_GET_NDVAS(bp); i++)
5256eda14cbcSMatt Macy dsl_scan_freed_dva(spa, bp, i);
5257eda14cbcSMatt Macy }
5258eda14cbcSMatt Macy
5259eda14cbcSMatt Macy /*
5260eda14cbcSMatt Macy * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has
5261eda14cbcSMatt Macy * not started, start it. Otherwise, only restart if max txg in DTL range is
5262eda14cbcSMatt Macy * greater than the max txg in the current scan. If the DTL max is less than
5263eda14cbcSMatt Macy * the scan max, then the vdev has not missed any new data since the resilver
5264eda14cbcSMatt Macy * started, so a restart is not needed.
5265eda14cbcSMatt Macy */
5266eda14cbcSMatt Macy void
dsl_scan_assess_vdev(dsl_pool_t * dp,vdev_t * vd)5267eda14cbcSMatt Macy dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
5268eda14cbcSMatt Macy {
5269eda14cbcSMatt Macy uint64_t min, max;
5270eda14cbcSMatt Macy
5271eda14cbcSMatt Macy if (!vdev_resilver_needed(vd, &min, &max))
5272eda14cbcSMatt Macy return;
5273eda14cbcSMatt Macy
5274eda14cbcSMatt Macy if (!dsl_scan_resilvering(dp)) {
5275eda14cbcSMatt Macy spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
5276eda14cbcSMatt Macy return;
5277eda14cbcSMatt Macy }
5278eda14cbcSMatt Macy
5279eda14cbcSMatt Macy if (max <= dp->dp_scan->scn_phys.scn_max_txg)
5280eda14cbcSMatt Macy return;
5281eda14cbcSMatt Macy
5282eda14cbcSMatt Macy /* restart is needed, check if it can be deferred */
5283eda14cbcSMatt Macy if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
5284eda14cbcSMatt Macy vdev_defer_resilver(vd);
5285eda14cbcSMatt Macy else
5286eda14cbcSMatt Macy spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
5287eda14cbcSMatt Macy }
5288eda14cbcSMatt Macy
5289dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW,
5290eda14cbcSMatt Macy "Max bytes in flight per leaf vdev for scrubs and resilvers");
5291eda14cbcSMatt Macy
5292be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW,
5293eda14cbcSMatt Macy "Min millisecs to scrub per txg");
5294eda14cbcSMatt Macy
5295be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW,
5296eda14cbcSMatt Macy "Min millisecs to obsolete per txg");
5297eda14cbcSMatt Macy
5298be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW,
5299eda14cbcSMatt Macy "Min millisecs to free per txg");
5300eda14cbcSMatt Macy
5301be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW,
5302eda14cbcSMatt Macy "Min millisecs to resilver per txg");
5303eda14cbcSMatt Macy
5304eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
5305eda14cbcSMatt Macy "Set to prevent scans from progressing");
5306eda14cbcSMatt Macy
5307eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
5308eda14cbcSMatt Macy "Set to disable scrub I/O");
5309eda14cbcSMatt Macy
5310eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
5311eda14cbcSMatt Macy "Set to disable scrub prefetching");
5312eda14cbcSMatt Macy
5313dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
5314eda14cbcSMatt Macy "Max number of blocks freed in one txg");
5315eda14cbcSMatt Macy
5316dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
5317eda14cbcSMatt Macy "Max number of dedup blocks freed in one txg");
5318eda14cbcSMatt Macy
5319eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
5320eda14cbcSMatt Macy "Enable processing of the free_bpobj");
5321eda14cbcSMatt Macy
5322a0b956f5SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW,
5323a0b956f5SMartin Matuska "Enable block statistics calculation during scrub");
5324a0b956f5SMartin Matuska
5325be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW,
5326eda14cbcSMatt Macy "Fraction of RAM for scan hard limit");
5327eda14cbcSMatt Macy
5328be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
5329c03c5b1cSMartin Matuska "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
5330eda14cbcSMatt Macy
5331eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
5332eda14cbcSMatt Macy "Scrub using legacy non-sequential method");
5333eda14cbcSMatt Macy
5334be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
5335eda14cbcSMatt Macy "Scan progress on-disk checkpointing interval");
5336eda14cbcSMatt Macy
5337dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW,
5338eda14cbcSMatt Macy "Max gap in bytes between sequential scrub / resilver I/Os");
5339eda14cbcSMatt Macy
5340be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW,
5341eda14cbcSMatt Macy "Fraction of hard limit used as soft limit");
5342eda14cbcSMatt Macy
5343eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
5344eda14cbcSMatt Macy "Tunable to attempt to reduce lock contention");
5345eda14cbcSMatt Macy
5346be181ee2SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
5347eda14cbcSMatt Macy "Tunable to adjust bias towards more filled segments during scans");
5348eda14cbcSMatt Macy
5349c9539b89SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
5350c9539b89SMartin Matuska "Tunable to report resilver performance over the last N txgs");
5351c9539b89SMartin Matuska
5352eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
5353eda14cbcSMatt Macy "Process all resilvers immediately");
5354c0a83fe0SMartin Matuska
53557a7741afSMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW,
53567a7741afSMartin Matuska "Issued IO percent complete after which resilvers are deferred");
53577a7741afSMartin Matuska
535848f52d91SJohn Baldwin ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
5359c0a83fe0SMartin Matuska "Error blocks to be scrubbed in one txg");
5360