/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2023 RackTop Systems, Inc.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * TRIM is a feature which is used to notify an SSD that some previously
 * written space is no longer allocated by the pool. This is useful because
 * writes to an SSD must be performed to blocks which have first been erased.
 * Ensuring the SSD always has a supply of erased blocks for new writes
 * helps prevent its performance from deteriorating.
 *
 * There are two supported TRIM methods: manual and automatic.
 *
 * Manual TRIM:
 *
 * A manual TRIM is initiated by running the 'zpool trim' command. A single
 * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
 * managing that vdev's TRIM process. This involves iterating over all the
 * metaslabs, calculating the unallocated space ranges, and then issuing the
 * required TRIM I/Os.
 *
 * While a metaslab is being actively trimmed it is not eligible to perform
 * new allocations. After traversing all of the metaslabs the thread is
 * terminated. Finally, both the requested options and current progress of
 * the TRIM are regularly written to the pool. This allows the TRIM to be
 * suspended and resumed as needed.
 *
 * Automatic TRIM:
 *
 * An automatic TRIM is enabled by setting the 'autotrim' pool property
 * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each
 * top-level (not leaf) vdev in the pool. These threads perform the same
 * core TRIM process as a manual TRIM, but with a few key differences.
 *
 * 1) Automatic TRIM happens continuously in the background and operates
 *    solely on recently freed blocks (ms_trim not ms_allocatable).
 *
 * 2) Each thread is associated with a top-level (not leaf) vdev. This has
 *    the benefit of simplifying the threading model, it makes it easier
 *    to coordinate administrative commands, and it ensures only a single
 *    metaslab is disabled at a time. Unlike manual TRIM, this means each
 *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
 *    children.
 *
 * 3) There is no automatic TRIM progress information stored on disk, nor
 *    is it reported by 'zpool status'.
 *
 * While the automatic TRIM process is highly effective it is more likely
 * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
 * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
 * TRIM and are skipped. This means small amounts of freed space may not
 * be automatically trimmed.
 *
 * Furthermore, devices with attached hot spares and devices being actively
 * replaced are skipped. This is done to avoid adding additional stress to
 * a potentially unhealthy device and to minimize the required rebuild time.
 *
 * For this reason it may be beneficial to occasionally manually TRIM a pool
 * even when automatic TRIM is enabled.
 */
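
/*
 * For example, a manual TRIM of a pool is started with 'zpool trim', and
 * automatic TRIM is enabled with the 'autotrim' property (the pool name
 * 'tank' is illustrative):
 *
 *	# zpool trim tank
 *	# zpool set autotrim=on tank
 */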

/*
 * Maximum size of TRIM I/O, ranges will be chunked into 128MiB lengths.
 */
unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;

/*
 * Minimum size of TRIM I/O, extents smaller than 32KiB will be skipped.
 */
unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
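
/*
 * For example (illustrative arithmetic): with the defaults above, a single
 * 300MiB free extent is issued as three TRIM I/Os of 128MiB, 128MiB, and
 * 44MiB, while a 16KiB extent falls below zfs_trim_extent_bytes_min and
 * is skipped entirely.
 */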

/*
 * Skip uninitialized metaslabs during the TRIM process. This option is
 * useful for pools constructed from large thinly-provisioned devices where
 * TRIM operations are slow. As a pool ages, an increasing fraction of
 * the pool's metaslabs will be initialized, progressively degrading the
 * usefulness of this option. This setting is stored when starting a
 * manual TRIM and will persist for the duration of the requested TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0;

/*
 * Maximum number of queued TRIM I/Os per leaf vdev. The number of
 * concurrent TRIM I/Os issued to the device is controlled by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
unsigned int zfs_trim_queue_limit = 10;

/*
 * The minimum number of transaction groups between automatic trims of a
 * metaslab. This setting represents a trade-off between issuing more
 * efficient TRIM operations, by allowing them to be aggregated longer,
 * and issuing them promptly so the trimmed space is available. Note
 * that this value is a minimum; metaslabs can be trimmed less frequently
 * when there are a large number of ranges which need to be trimmed.
 *
 * Increasing this value will allow frees to be aggregated for a longer
 * time. This can result in larger TRIM operations, and increased memory
 * usage in order to track the ranges to be trimmed. Decreasing this value
 * has the opposite effect. The default value of 32 was determined through
 * testing to be a reasonable compromise.
 */
unsigned int zfs_trim_txg_batch = 32;
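
/*
 * The tunables above are plain global variables and may be adjusted like
 * any other ZFS module tunable. A hypothetical example, assuming the
 * usual illumos /etc/system mechanism applies to this module:
 *
 *	set zfs:zfs_trim_extent_bytes_min = 0x4000
 *	set zfs:zfs_trim_txg_batch = 64
 */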

/*
 * trim_args is a control structure which describes how a leaf vdev
 * should be trimmed. The core elements are the vdev, the metaslab being
 * trimmed and a range tree containing the extents to TRIM. All provided
 * ranges must be within the metaslab.
 */
typedef struct trim_args {
	/*
	 * These fields are set by the caller of vdev_trim_ranges().
	 */
	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
	metaslab_t	*trim_msp;		/* Disabled metaslab */
	range_tree_t	*trim_tree;		/* TRIM ranges (in metaslab) */
	trim_type_t	trim_type;		/* Manual or auto TRIM */
	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
	enum trim_flag	trim_flags;		/* TRIM flags (secure) */

	/*
	 * These fields are updated by vdev_trim_ranges().
	 */
	hrtime_t	trim_start_time;	/* Start time */
	uint64_t	trim_bytes_done;	/* Bytes trimmed */
} trim_args_t;
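
/*
 * A minimal sketch of how a caller fills in trim_args before invoking
 * vdev_trim_ranges(), mirroring what vdev_trim_thread() does below
 * (illustrative only; the values shown are assumptions):
 *
 *	trim_args_t ta = { 0 };
 *	ta.trim_vdev = vd;			(leaf vdev to TRIM)
 *	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 *	ta.trim_type = TRIM_TYPE_MANUAL;
 *	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 *	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
 *	ta.trim_flags = 0;			(or ZIO_TRIM_SECURE)
 */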

/*
 * Determines whether a vdev_trim_thread() should be stopped.
 */
static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
	return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

/*
 * Determines whether a vdev_autotrim_thread() should be stopped.
 */
static boolean_t
vdev_autotrim_should_stop(vdev_t *tvd)
{
	return (tvd->vdev_autotrim_exit_wanted ||
	    !vdev_writeable(tvd) || tvd->vdev_removing ||
	    spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}

/*
 * The sync task for updating the on-disk state of a manual TRIM. This
 * is scheduled by vdev_trim_change_state().
 */
static void
vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the trimming thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
	vd->vdev_trim_offset[txg & TXG_MASK] = 0;

	VERIFY3U(vd->vdev_leaf_zap, !=, 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {

		if (vd->vdev_trim_last_offset == UINT64_MAX)
			last_offset = 0;

		vd->vdev_trim_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}

	if (vd->vdev_trim_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	if (vd->vdev_trim_rate > 0) {
		uint64_t rate = (uint64_t)vd->vdev_trim_rate;

		if (rate == UINT64_MAX)
			rate = 0;

		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
	}

	uint64_t partial = vd->vdev_trim_partial;
	if (partial == UINT64_MAX)
		partial = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
	    sizeof (partial), 1, &partial, tx));

	uint64_t secure = vd->vdev_trim_secure;
	if (secure == UINT64_MAX)
		secure = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
	    sizeof (secure), 1, &secure, tx));

	uint64_t trim_state = vd->vdev_trim_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
	    sizeof (trim_state), 1, &trim_state, tx));
}

/*
 * Update the on-disk state of a manual TRIM. This is called to request
 * that a TRIM be started/suspended/canceled, or to change one of the
 * TRIM options (partial, secure, rate).
 */
static void
vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
    uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_trim_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
		vd->vdev_trim_action_time = gethrestime_sec();
	}

	/*
	 * If we're activating, then preserve the requested rate and trim
	 * method. Setting the last offset and rate to UINT64_MAX is used
	 * as a sentinel to indicate they should be reset to default values.
	 */
	if (new_state == VDEV_TRIM_ACTIVE) {
		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
			vd->vdev_trim_last_offset = UINT64_MAX;
			vd->vdev_trim_rate = UINT64_MAX;
			vd->vdev_trim_partial = UINT64_MAX;
			vd->vdev_trim_secure = UINT64_MAX;
		}

		if (rate != 0)
			vd->vdev_trim_rate = rate;

		if (partial != 0)
			vd->vdev_trim_partial = partial;

		if (secure != 0)
			vd->vdev_trim_secure = secure;
	}

	boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
	vd->vdev_trim_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	switch (new_state) {
	case VDEV_TRIM_ACTIVE:
		spa_event_notify(spa, vd, NULL,
		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_TRIM_SUSPENDED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_TRIM_CANCELED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_TRIM_COMPLETE:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);
}

/*
 * The zio_done_func_t done callback for each manual TRIM issued. It is
 * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
 * and limiting the number of in-flight TRIM I/Os.
 */
static void
vdev_trim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *offset =
		    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
		*offset = MIN(*offset, zio->io_offset);
	} else {
		if (zio->io_error != 0) {
			vd->vdev_stat.vs_trim_errors++;
			/*
			 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			 * 0, 0, 0, 0, 1, zio->io_orig_size);
			 */
		} else {
			/*
			 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			 * 1, zio->io_orig_size, 0, 0, 0, 0);
			 */
		}

		vd->vdev_trim_bytes_done += zio->io_orig_size;
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each automatic TRIM issued. It
 * is responsible for updating the TRIM stats and limiting the number of
 * in-flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
 * never reissued on failure.
 */
static void
vdev_autotrim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);

	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		/*
		 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		 * 0, 0, 0, 0, 1, zio->io_orig_size);
		 */
	} else {
		/*
		 * spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		 * 1, zio->io_orig_size, 0, 0, 0, 0);
		 */

		vd->vdev_autotrim_bytes_done += zio->io_orig_size;
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
 */
static uint64_t
vdev_trim_calculate_rate(trim_args_t *ta)
{
	return (ta->trim_bytes_done * 1000 /
	    (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
}
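
/*
 * For example: if 64MiB has been trimmed 500ms after trim_start_time,
 * the computed rate is 64MiB * 1000 / 501, or roughly 128MiB/sec (the
 * "+ 1" simply guards against division by zero).
 */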

/*
 * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
 * and number of concurrent TRIM I/Os.
 */
static int
vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
	vdev_t *vd = ta->trim_vdev;
	spa_t *spa = vd->vdev_spa;

	mutex_enter(&vd->vdev_trim_io_lock);

	/*
	 * Limit manual TRIM I/Os to the requested rate. This does not
	 * apply to automatic TRIM since no per vdev rate can be specified.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
		    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
			cv_timedwait_sig(&vd->vdev_trim_io_cv,
			    &vd->vdev_trim_io_lock, ddi_get_lbolt() +
			    MSEC_TO_TICK(10));
		}
	}
	ta->trim_bytes_done += size;

	/* Limit in-flight trimming I/Os */
	while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >=
	    zfs_trim_queue_limit) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	vd->vdev_trim_inflight[ta->trim_type]++;
	mutex_exit(&vd->vdev_trim_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL &&
	    vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_trim_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev_t will still be around since all consumers of
	 * vdev_free must stop the trimming first.
	 */
	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
	    vdev_trim_should_stop(vd)) ||
	    (ta->trim_type == TRIM_TYPE_AUTO &&
	    vdev_autotrim_should_stop(vd->vdev_top))) {
		mutex_enter(&vd->vdev_trim_io_lock);
		vd->vdev_trim_inflight[ta->trim_type]--;
		mutex_exit(&vd->vdev_trim_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_trim_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL)
		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;

	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
	    start, size, ta->trim_type == TRIM_TYPE_MANUAL ?
	    vdev_trim_cb : vdev_autotrim_cb, NULL,
	    ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags));
	/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
 * Additional parameters describing how the TRIM should be performed must
 * be set in the trim_args structure. See the trim_args definition for
 * additional information.
 */
static int
vdev_trim_ranges(trim_args_t *ta)
{
	vdev_t *vd = ta->trim_vdev;
	zfs_btree_t *t = &ta->trim_tree->rt_root;
	zfs_btree_index_t idx;
	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	ta->trim_start_time = gethrtime();
	ta->trim_bytes_done = 0;

	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
		    ta->trim_tree);

		if (extent_bytes_min && size < extent_bytes_min) {
			/*
			 * spa_iostats_trim_add(spa, ta->trim_type,
			 * 0, 0, 1, size, 0, 0);
			 */
			continue;
		}

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
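		/*
		 * writes_required is a ceiling division; e.g. with the
		 * default 128MiB extent_bytes_max, a 300MiB range requires
		 * ((300MiB - 1) / 128MiB) + 1 = 3 TRIM I/Os.
		 */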

		for (uint64_t w = 0; w < writes_required; w++) {
			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
			    rs_get_start(rs, ta->trim_tree) +
			    (w * extent_bytes_max), MIN(size -
			    (w * extent_bytes_max), extent_bytes_max));
			if (error != 0) {
				goto done;
			}
		}
	}

done:
	/*
	 * Make sure all TRIMs for this metaslab have completed before
	 * returning. TRIM zios have lower priority than regular or syncing
	 * zios, so all TRIM zios for this metaslab must complete before the
	 * metaslab is re-enabled. Otherwise it's possible write zios to
	 * this metaslab could cut ahead of still queued TRIM zios for this
	 * metaslab causing corruption if the ranges overlap.
	 */
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	return (error);
}

/*
 * Calculates the completion percentage of a manual TRIM.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_trim_bytes_est = 0;
	vd->vdev_trim_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
			vd->vdev_trim_bytes_done += ms_free;
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of trimming this
		 * metaslab. Load it and walk the free tree for more
		 * accurate progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		range_tree_t *rt = msp->ms_allocatable;
		zfs_btree_t *bt = &rt->rt_root;
		zfs_btree_index_t idx;
		for (range_seg_t *rs = zfs_btree_first(bt, &idx);
		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_trim_bytes_est += size;
			if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
				vd->vdev_trim_bytes_done += size;
			} else if (vd->vdev_trim_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_trim_last_offset <=
			    physical_rs.rs_end) {
				vd->vdev_trim_bytes_done +=
				    vd->vdev_trim_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}
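
/*
 * For example (assumed values): with vdev_trim_last_offset at 10GiB, a
 * metaslab whose physical range ends at 8GiB is credited entirely as done,
 * one starting at 12GiB counts only toward the estimate, and one spanning
 * 8GiB-16GiB is loaded so that just its free segments below 10GiB are
 * credited as done.
 */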

/*
 * Load from disk the vdev's manual TRIM information. This includes the
 * state, progress, and options provided when initiating the manual TRIM.
 */
static int
vdev_trim_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
	    vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (vd->vdev_trim_last_offset), 1,
		    &vd->vdev_trim_last_offset);
		if (err == ENOENT) {
			vd->vdev_trim_last_offset = 0;
			err = 0;
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
			    sizeof (vd->vdev_trim_rate), 1,
			    &vd->vdev_trim_rate);
			if (err == ENOENT) {
				vd->vdev_trim_rate = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
			    sizeof (vd->vdev_trim_partial), 1,
			    &vd->vdev_trim_partial);
			if (err == ENOENT) {
				vd->vdev_trim_partial = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
			    sizeof (vd->vdev_trim_secure), 1,
			    &vd->vdev_trim_secure);
			if (err == ENOENT) {
				vd->vdev_trim_secure = 0;
				err = 0;
			}
		}
	}

	vdev_trim_calculate_progress(vd);

	return (err);
}

/*
 * Convert the logical range into a physical range and add it to the
 * range tree passed in the trim_args_t.
 */
static void
vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;
	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	/*
	 * Every range to be trimmed must be part of ms_allocatable.
	 * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
	 * is always the case.
	 */
	if (zfs_flags & ZFS_DEBUG_TRIM) {
		metaslab_t *msp = ta->trim_msp;
		VERIFY0(metaslab_load(msp));
		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
		VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
	}

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/*
	 * Only a manual trim will be traversing the vdev sequentially.
	 * For an auto trim all valid ranges should be added.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {

		/* Only add segments that we have not visited yet */
		if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
			return;

		/* Pick up where we left off mid-range. */
		if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
			ASSERT3U(physical_rs.rs_end, >,
			    vd->vdev_trim_last_offset);
			physical_rs.rs_start = vd->vdev_trim_last_offset;
		}
	}

	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev if it
	 * has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(ta->trim_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}

/*
 * Each manual TRIM thread is responsible for trimming the unallocated
 * space for each leaf vdev. This is accomplished by sequentially iterating
 * over its top-level metaslabs and issuing TRIM I/O for the space described
 * by its ms_allocatable. While a metaslab is undergoing trimming it is
 * not eligible for new allocations.
 */
static void
vdev_trim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	trim_args_t ta;
	int error = 0;

	/*
	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
	 * vdev_trim(). Wait for the updated values to be reflected
	 * in the zap in order to start with the requested settings.
	 */
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	VERIFY0(vdev_trim_load(vd));

	ta.trim_vdev = vd;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_flags = 0;

	/*
	 * When a secure TRIM has been requested infer that the intent
	 * is that everything must be trimmed. Override the default
	 * minimum TRIM size to prevent ranges from being skipped.
	 */
	if (vd->vdev_trim_secure) {
		ta.trim_flags |= ZIO_TRIM_SECURE;
		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	}

	uint64_t ms_count = 0;
	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_trim_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		/*
		 * If a partial TRIM was requested skip metaslabs which have
		 * never been initialized and thus have never been written.
		 */
		if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vdev_trim_calculate_progress(vd);
			continue;
		}

		ta.trim_msp = msp;
		range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
		range_tree_vacate(msp->ms_trim, NULL, NULL);
		mutex_exit(&msp->ms_lock);

		error = vdev_trim_ranges(&ta);
		metaslab_enable(msp, B_TRUE, B_FALSE);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(ta.trim_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
		    vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and must
	 * check to see if it needs to restart a trim. That thread will be
	 * holding the spa_config_lock which would prevent the txg_wait_synced
	 * from completing.
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	vd->vdev_trim_thread = NULL;
	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);
}

/*
 * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock,
 * the vdev_t must be a leaf and cannot already be manually trimming.
 */
void
vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_trim_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
	vd->vdev_trim_thread = thread_create(NULL, 0,
	    vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
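
/*
 * A sketch of the expected calling convention, with a hypothetical caller
 * requesting a full, non-secure TRIM at the default rate (the real entry
 * points live elsewhere in the SPA code):
 *
 *	mutex_enter(&vd->vdev_trim_lock);
 *	vdev_trim(vd, 0, B_FALSE, B_FALSE);
 *	mutex_exit(&vd->vdev_trim_lock);
 */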

/*
 * Wait for the trimming thread to be terminated (canceled or stopped).
 */
static void
vdev_trim_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));

	while (vd->vdev_trim_thread != NULL)
		cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);

	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	vd->vdev_trim_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev trim threads which were listed to cleanly exit.
 */
void
vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_trim_lock);
	}
}

/*
 * Stop trimming a device, with the resultant trimming state being tgt_state.
 * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
 * provided the stopping vdev is inserted into the list. Callers are then
 * required to call vdev_trim_stop_wait() to block for all the trim threads
 * to exit. The caller must hold vdev_trim_lock and must not be writing to
 * the spa config, as the trimming thread may try to enter the config as a
 * reader before exiting.
 */
void
vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the trim thread has
	 * stopped.
	 */
	if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
		return;

	vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
	vd->vdev_trim_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_trim_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}
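
/*
 * A minimal sketch of the non-blocking stop pattern described above,
 * mirroring what vdev_trim_stop_all() does below (illustrative only):
 *
 *	list_t vd_list;
 *	list_create(&vd_list, sizeof (vdev_t),
 *	    offsetof(vdev_t, vdev_trim_node));
 *	mutex_enter(&vd->vdev_trim_lock);
 *	vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, &vd_list);
 *	mutex_exit(&vd->vdev_trim_lock);
 *	vdev_trim_stop_wait(spa, &vd_list);
 *	list_destroy(&vd_list);
 */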

/*
 * Requests that all listed vdevs stop trimming.
 */
static void
vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_trim_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop trimming of a vdev tree and set all trim
 * thread pointers to NULL.
 */
void
vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_trim_node));

	vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_trim_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

/*
 * Conditionally restarts a manual TRIM given its on-disk state.
 */
void
vdev_trim_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_trim_lock);
		uint64_t trim_state = VDEV_TRIM_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
		    sizeof (trim_state), 1, &trim_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_state = trim_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_action_time = (time_t)timestamp;

		if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_trim_load(vd));
		} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
		    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
		    vd->vdev_trim_thread == NULL) {
			VERIFY0(vdev_trim_load(vd));
			vdev_trim(vd, vd->vdev_trim_rate,
			    vd->vdev_trim_partial, vd->vdev_trim_secure);
		}

		mutex_exit(&vd->vdev_trim_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_restart(vd->vdev_child[i]);
	}
}

/*
 * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
 * every TRIM range is contained within ms_allocatable.
 */
static void
vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	metaslab_t *msp = ta->trim_msp;

	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
	VERIFY3U(msp->ms_disabled, >, 0);
	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}

/*
 * Each automatic TRIM thread is responsible for managing the trimming of a
 * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
 *
 * N.B. This behavior is different from a manual TRIM where a thread
 * is created for each leaf vdev, instead of each top-level vdev.
 */
static void
vdev_autotrim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int shift = 0;

	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT3P(vd->vdev_top, ==, vd);
	ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
	mutex_exit(&vd->vdev_autotrim_lock);
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
	uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;

	while (!vdev_autotrim_should_stop(vd)) {
		int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
		boolean_t issued_trim = B_FALSE;

		/*
		 * All of the metaslabs are divided into groups of size
		 * num_metaslabs / zfs_trim_txg_batch. Each of these groups
		 * is composed of metaslabs which are spread evenly over the
		 * device.
		 *
		 * For example, when zfs_trim_txg_batch = 32 (default) then
		 * group 0 will contain metaslabs 0, 32, 64, ...;
		 * group 1 will contain metaslabs 1, 33, 65, ...;
		 * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
		 *
		 * On each pass through the while() loop one of these groups
		 * is selected. This is accomplished by using a shift value
		 * to select the starting metaslab, then striding over the
		 * metaslabs using the zfs_trim_txg_batch size. This is
		 * done to accomplish two things.
		 *
		 * 1) By dividing the metaslabs into groups, and making sure
		 *    that each group takes a minimum of one txg to process,
		 *    zfs_trim_txg_batch controls the minimum number of
		 *    txgs which must occur before a metaslab is revisited.
		 *
		 * 2) Selecting non-consecutive metaslabs distributes the
		 *    TRIM commands for a group evenly over the entire device.
		 *    This can be advantageous for certain types of devices.
		 */
		for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
		    i += txgs_per_trim) {
			metaslab_t *msp = vd->vdev_ms[i];
			range_tree_t *trim_tree;

			spa_config_exit(spa, SCL_CONFIG, FTAG);
			metaslab_disable(msp);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

			mutex_enter(&msp->ms_lock);

			/*
			 * Skip the metaslab when it has never been allocated
			 * or when there are no recent frees to trim.
			 */
			if (msp->ms_sm == NULL ||
			    range_tree_is_empty(msp->ms_trim)) {
				mutex_exit(&msp->ms_lock);
				metaslab_enable(msp, B_FALSE, B_FALSE);
				continue;
			}

			/*
			 * Skip the metaslab when it has already been disabled.
			 * This may happen when a manual TRIM or initialize
			 * operation is running concurrently. In the case
			 * of a manual TRIM, the ms_trim tree will have been
			 * vacated. Only ranges added after the manual TRIM
			 * disabled the metaslab will be included in the tree.
			 * These will be processed when the automatic TRIM
			 * next revisits this metaslab.
			 */
			if (msp->ms_disabled > 1) {
				mutex_exit(&msp->ms_lock);
				metaslab_enable(msp, B_FALSE, B_FALSE);
				continue;
			}

			/*
			 * Allocate an empty range tree which is swapped in
			 * for the existing ms_trim tree while it is processed.
			 */
			trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
			    0, 0);
			range_tree_swap(&msp->ms_trim, &trim_tree);
			ASSERT(range_tree_is_empty(msp->ms_trim));

			/*
			 * There are two cases when constructing the per-vdev
			 * trim trees for a metaslab. If the top-level vdev
			 * has no children then it is also a leaf and should
			 * be trimmed. Otherwise our children are the leaves
			 * and a trim tree should be constructed for each.
			 */
			trim_args_t *tap;
			uint64_t children = vd->vdev_children;
			if (children == 0) {
				children = 1;
				tap = kmem_zalloc(sizeof (trim_args_t) *
				    children, KM_SLEEP);
				tap[0].trim_vdev = vd;
			} else {
				tap = kmem_zalloc(sizeof (trim_args_t) *
				    children, KM_SLEEP);

				for (uint64_t c = 0; c < children; c++) {
					tap[c].trim_vdev = vd->vdev_child[c];
				}
			}

			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];
				vdev_t *cvd = ta->trim_vdev;

				ta->trim_msp = msp;
				ta->trim_extent_bytes_max = extent_bytes_max;
				ta->trim_extent_bytes_min = extent_bytes_min;
				ta->trim_type = TRIM_TYPE_AUTO;
				ta->trim_flags = 0;

				if (cvd->vdev_detached ||
				    !vdev_writeable(cvd) ||
				    !cvd->vdev_has_trim ||
				    cvd->vdev_trim_thread != NULL) {
					continue;
				}

				/*
				 * When a device has an attached hot spare, or
				 * is being replaced, it will not be trimmed.
				 * This is done to avoid adding additional
				 * stress to a potentially unhealthy device,
				 * and to minimize the required rebuild time.
				 */
				if (!cvd->vdev_ops->vdev_op_leaf)
					continue;

				ta->trim_tree = range_tree_create(NULL,
				    RANGE_SEG64, NULL, 0, 0);
				range_tree_walk(trim_tree,
				    vdev_trim_range_add, ta);
			}

			mutex_exit(&msp->ms_lock);
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * Issue the TRIM I/Os for all ranges covered by the
			 * TRIM trees. These ranges are safe to TRIM because
			 * no new allocations will be performed until the call
			 * to metaslab_enable() below.
			 */
			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];

				/*
				 * Always yield to a manual TRIM if one has
				 * been started for the child vdev.
				 */
				if (ta->trim_tree == NULL ||
				    ta->trim_vdev->vdev_trim_thread != NULL) {
					continue;
				}

				/*
				 * After this point metaslab_enable() must be
				 * called with the sync flag set. This is done
				 * here because vdev_trim_ranges() is allowed
				 * to be interrupted (EINTR) before issuing all
				 * of the required TRIM I/Os.
				 */
				issued_trim = B_TRUE;

				int error = vdev_trim_ranges(ta);
				if (error)
					break;
			}
			}

			/*
			 * Verify every range which was trimmed is still
			 * contained within the ms_allocatable tree.
			 */
			if (zfs_flags & ZFS_DEBUG_TRIM) {
				mutex_enter(&msp->ms_lock);
				VERIFY0(metaslab_load(msp));
				VERIFY3P(tap[0].trim_msp, ==, msp);
				range_tree_walk(trim_tree,
				    vdev_trim_range_verify, &tap[0]);
				mutex_exit(&msp->ms_lock);
			}

			range_tree_vacate(trim_tree, NULL, NULL);
			range_tree_destroy(trim_tree);

			metaslab_enable(msp, issued_trim, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];

				if (ta->trim_tree == NULL)
					continue;

				range_tree_vacate(ta->trim_tree, NULL, NULL);
				range_tree_destroy(ta->trim_tree);
			}

			kmem_free(tap, sizeof (trim_args_t) * children);
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		/*
		 * After completing the group of metaslabs wait for the next
		 * open txg. This is done to make sure that a minimum of
		 * zfs_trim_txg_batch txgs will occur before these metaslabs
		 * are trimmed again.
		 */
		txg_wait_open(spa_get_dsl(spa), 0, issued_trim);

		shift++;
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	}

	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		mutex_enter(&cvd->vdev_trim_io_lock);

		while (cvd->vdev_trim_inflight[1] > 0) {
			cv_wait(&cvd->vdev_trim_io_cv,
			    &cvd->vdev_trim_io_lock);
		}
		mutex_exit(&cvd->vdev_trim_io_lock);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * When exiting because the autotrim property was set to off, then
	 * abandon any unprocessed ms_trim ranges to reclaim the memory.
	 */
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
		for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
			metaslab_t *msp = vd->vdev_ms[i];

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_trim, NULL, NULL);
			mutex_exit(&msp->ms_lock);
		}
	}

	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT(vd->vdev_autotrim_thread != NULL);
	vd->vdev_autotrim_thread = NULL;
	cv_broadcast(&vd->vdev_autotrim_cv);
	mutex_exit(&vd->vdev_autotrim_lock);
}

/*
 * Starts an autotrim thread, if needed, for each top-level vdev which can be
 * trimmed. A top-level vdev which has been evacuated will never be trimmed.
 */
void
vdev_autotrim(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		vdev_t *tvd = root_vd->vdev_child[i];

		mutex_enter(&tvd->vdev_autotrim_lock);
		if (vdev_writeable(tvd) && !tvd->vdev_removing &&
		    tvd->vdev_autotrim_thread == NULL) {
			ASSERT3P(tvd->vdev_top, ==, tvd);

			tvd->vdev_autotrim_thread = thread_create(NULL, 0,
			    vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
			    maxclsyspri);
			ASSERT(tvd->vdev_autotrim_thread != NULL);
		}
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for the vdev_autotrim_thread associated with the passed top-level
 * vdev to be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_wait(vdev_t *tvd)
{
	mutex_enter(&tvd->vdev_autotrim_lock);
	if (tvd->vdev_autotrim_thread != NULL) {
		tvd->vdev_autotrim_exit_wanted = B_TRUE;

		while (tvd->vdev_autotrim_thread != NULL) {
			cv_wait(&tvd->vdev_autotrim_cv,
			    &tvd->vdev_autotrim_lock);
		}

		ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
		tvd->vdev_autotrim_exit_wanted = B_FALSE;
	}
	mutex_exit(&tvd->vdev_autotrim_lock);
}

/*
 * Wait for all of the vdev_autotrim_threads associated with the pool to
 * be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_all(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++)
		vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
}

/*
 * Conditionally restart all of the vdev_autotrim_threads for the pool.
 */
void
vdev_autotrim_restart(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (spa->spa_autotrim)
		vdev_autotrim(spa);
}