/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc_impl.h>

/*
 * TRIM is a feature which is used to notify an SSD that some previously
 * written space is no longer allocated by the pool. This is useful because
 * writes to an SSD must be performed to blocks which have first been erased.
 * Ensuring the SSD always has a supply of erased blocks for new writes
 * helps prevent its performance from deteriorating.
 *
 * There are two supported TRIM methods: manual and automatic.
 *
 * Manual TRIM:
 *
 * A manual TRIM is initiated by running the 'zpool trim' command. A single
 * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
 * managing that vdev's TRIM process. This involves iterating over all the
 * metaslabs, calculating the unallocated space ranges, and then issuing the
 * required TRIM I/Os.
 *
 * While a metaslab is being actively trimmed it is not eligible to perform
 * new allocations. After traversing all of the metaslabs the thread is
 * terminated. Finally, both the requested options and current progress of
 * the TRIM are regularly written to the pool. This allows the TRIM to be
 * suspended and resumed as needed.
 *
 * Automatic TRIM:
 *
 * An automatic TRIM is enabled by setting the 'autotrim' pool property
 * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each
 * top-level (not leaf) vdev in the pool. These threads perform the same
 * core TRIM process as a manual TRIM, but with a few key differences.
 *
 * 1) Automatic TRIM happens continuously in the background and operates
 *    solely on recently freed blocks (ms_trim not ms_allocatable).
 *
 * 2) Each thread is associated with a top-level (not leaf) vdev. This has
 *    the benefit of simplifying the threading model, it makes it easier
 *    to coordinate administrative commands, and it ensures only a single
 *    metaslab is disabled at a time. Unlike manual TRIM, this means each
 *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
 *    children.
 *
 * 3) There is no automatic TRIM progress information stored on disk, nor
 *    is it reported by 'zpool status'.
 *
 * While the automatic TRIM process is highly effective it is more likely
 * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
 * 'zfs_trim_extent_bytes_min' (32 KiB) are considered too small to
 * efficiently TRIM and are skipped. This means small amounts of freed
 * space may not be automatically trimmed.
 *
 * Furthermore, devices with attached hot spares and devices being actively
 * replaced are skipped. This is done to avoid adding additional stress to
 * a potentially unhealthy device and to minimize the required rebuild time.
 *
 * For this reason it may be beneficial to occasionally manually TRIM a pool
 * even when automatic TRIM is enabled.
 */
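
/*
 * As a usage sketch only (see zpool(8) for the authoritative syntax),
 * the administrative commands which drive these two code paths are:
 *
 *	zpool trim <pool>		# start a manual TRIM
 *	zpool trim -s <pool>		# suspend an in-progress manual TRIM
 *	zpool trim -c <pool>		# cancel an in-progress manual TRIM
 *	zpool set autotrim=on <pool>	# enable automatic TRIM
 */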

/*
 * Maximum size of TRIM I/O, ranges will be chunked into 128 MiB lengths.
 */
unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;

/*
 * Minimum size of TRIM I/O, extents smaller than 32 KiB will be skipped.
 */
unsigned int zfs_trim_extent_bytes_min = 32 * 1024;

/*
 * Skip uninitialized metaslabs during the TRIM process. This option is
 * useful for pools constructed from large thinly-provisioned devices where
 * TRIM operations are slow. As a pool ages an increasing fraction of
 * the pool's metaslabs will be initialized, progressively degrading the
 * usefulness of this option. This setting is stored when starting a
 * manual TRIM and will persist for the duration of the requested TRIM.
 */
unsigned int zfs_trim_metaslab_skip = 0;

/*
 * Maximum number of queued TRIM I/Os per leaf vdev. The number of
 * concurrent TRIM I/Os issued to the device is controlled by the
 * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
 */
unsigned int zfs_trim_queue_limit = 10;

/*
 * The minimum number of transaction groups between automatic trims of a
 * metaslab. This setting represents a trade-off between issuing more
 * efficient TRIM operations, by allowing them to be aggregated longer,
 * and issuing them promptly so the trimmed space is available. Note
 * that this value is a minimum; metaslabs can be trimmed less frequently
 * when there are a large number of ranges which need to be trimmed.
 *
 * Increasing this value will allow frees to be aggregated for a longer
 * time. This can result in larger TRIM operations, and increased memory
 * usage in order to track the ranges to be trimmed. Decreasing this value
 * has the opposite effect. The default value of 32 was determined through
 * testing to be a reasonable compromise.
 */
unsigned int zfs_trim_txg_batch = 32;

/*
 * The trim_args is a control structure which describes how a leaf vdev
 * should be trimmed. The core elements are the vdev, the metaslab being
 * trimmed and a range tree containing the extents to TRIM. All provided
 * ranges must be within the metaslab.
 */
typedef struct trim_args {
	/*
	 * These fields are set by the caller of vdev_trim_ranges().
	 */
	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
	metaslab_t	*trim_msp;		/* Disabled metaslab */
	range_tree_t	*trim_tree;		/* TRIM ranges (in metaslab) */
	trim_type_t	trim_type;		/* Manual or auto TRIM */
	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
	enum trim_flag	trim_flags;		/* TRIM flags (secure) */

	/*
	 * These fields are updated by vdev_trim_ranges().
	 */
	hrtime_t	trim_start_time;	/* Start time */
	uint64_t	trim_bytes_done;	/* Bytes trimmed */
} trim_args_t;
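
/*
 * A minimal sketch of how these fields get populated, mirroring what
 * vdev_trim_thread() below actually does for a manual TRIM:
 *
 *	trim_args_t ta;
 *
 *	ta.trim_vdev = vd;
 *	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 *	ta.trim_type = TRIM_TYPE_MANUAL;
 *	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 *	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
 *	ta.trim_flags = 0;
 *
 * The per-metaslab trim_msp field is filled in, and trim_tree populated,
 * as each metaslab is visited and handed to vdev_trim_ranges().
 */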

/*
 * Determines whether a vdev_trim_thread() should be stopped.
 */
static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
	return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

/*
 * Determines whether a vdev_autotrim_thread() should be stopped.
 */
static boolean_t
vdev_autotrim_should_stop(vdev_t *tvd)
{
	return (tvd->vdev_autotrim_exit_wanted ||
	    !vdev_writeable(tvd) || tvd->vdev_removing ||
	    spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}

/*
 * The sync task for updating the on-disk state of a manual TRIM. This
 * is scheduled by vdev_trim_change_state().
 */
static void
vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the trimming thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
	vd->vdev_trim_offset[txg & TXG_MASK] = 0;

	VERIFY3U(vd->vdev_leaf_zap, !=, 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {

		if (vd->vdev_trim_last_offset == UINT64_MAX)
			last_offset = 0;

		vd->vdev_trim_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}

	if (vd->vdev_trim_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_trim_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	if (vd->vdev_trim_rate > 0) {
		uint64_t rate = (uint64_t)vd->vdev_trim_rate;

		if (rate == UINT64_MAX)
			rate = 0;

		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
	}

	uint64_t partial = vd->vdev_trim_partial;
	if (partial == UINT64_MAX)
		partial = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
	    sizeof (partial), 1, &partial, tx));

	uint64_t secure = vd->vdev_trim_secure;
	if (secure == UINT64_MAX)
		secure = 0;

	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
	    sizeof (secure), 1, &secure, tx));

	uint64_t trim_state = vd->vdev_trim_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
	    sizeof (trim_state), 1, &trim_state, tx));
}

/*
 * Update the on-disk state of a manual TRIM. This is called to request
 * that a TRIM be started/suspended/canceled, or to change one of the
 * TRIM options (partial, secure, rate).
 */
static void
vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
    uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_trim_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
		vd->vdev_trim_action_time = gethrestime_sec();
	}

	/*
	 * If we're activating, then preserve the requested rate and trim
	 * method. Setting the last offset and rate to UINT64_MAX is used
	 * as a sentinel to indicate they should be reset to default values.
	 */
	if (new_state == VDEV_TRIM_ACTIVE) {
		if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
		    vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
			vd->vdev_trim_last_offset = UINT64_MAX;
			vd->vdev_trim_rate = UINT64_MAX;
			vd->vdev_trim_partial = UINT64_MAX;
			vd->vdev_trim_secure = UINT64_MAX;
		}

		if (rate != 0)
			vd->vdev_trim_rate = rate;

		if (partial != 0)
			vd->vdev_trim_partial = partial;

		if (secure != 0)
			vd->vdev_trim_secure = secure;
	}

	boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
	vd->vdev_trim_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	switch (new_state) {
	case VDEV_TRIM_ACTIVE:
		spa_event_notify(spa, vd, NULL,
		    resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_TRIM_SUSPENDED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_TRIM_CANCELED:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_TRIM_COMPLETE:
		spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
		spa_history_log_internal(spa, "trim", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_TRIM_ACTIVE)
		spa_notify_waiters(spa);
}
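
/*
 * Concretely: restarting a TRIM which previously completed or was canceled
 * stores the UINT64_MAX sentinel in last_offset, rate, partial, and secure
 * above, and vdev_trim_zap_update_sync() then persists those fields as
 * their zeroed defaults, so the new TRIM begins at offset 0 with fresh
 * options.
 */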

/*
 * The zio_done_func_t done callback for each manual TRIM issued. It is
 * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
 * and limiting the number of in flight TRIM I/Os.
 */
static void
vdev_trim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *offset =
		    &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
		*offset = MIN(*offset, zio->io_offset);
	} else {
		if (zio->io_error != 0) {
			vd->vdev_stat.vs_trim_errors++;
			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			    0, 0, 0, 0, 1, zio->io_orig_size);
		} else {
			spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
			    1, zio->io_orig_size, 0, 0, 0, 0);
		}

		vd->vdev_trim_bytes_done += zio->io_orig_size;
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each automatic TRIM issued. It
 * is responsible for updating the TRIM stats and limiting the number of
 * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
 * never reissued on failure.
 */
static void
vdev_autotrim_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);

	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		    0, 0, 0, 0, 1, zio->io_orig_size);
	} else {
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
		    1, zio->io_orig_size, 0, 0, 0, 0);
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * The zio_done_func_t done callback for each TRIM issued via
 * vdev_trim_simple(). It is responsible for updating the TRIM stats and
 * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
 * effort and are never reissued on failure.
 */
static void
vdev_trim_simple_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	mutex_enter(&vd->vdev_trim_io_lock);

	if (zio->io_error != 0) {
		vd->vdev_stat.vs_trim_errors++;
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    0, 0, 0, 0, 1, zio->io_orig_size);
	} else {
		spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
		    1, zio->io_orig_size, 0, 0, 0, 0);
	}

	ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
	vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
	cv_broadcast(&vd->vdev_trim_io_cv);
	mutex_exit(&vd->vdev_trim_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
 */
static uint64_t
vdev_trim_calculate_rate(trim_args_t *ta)
{
	return (ta->trim_bytes_done * 1000 /
	    (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
}
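
/*
 * For example, 512 MiB trimmed over the previous 2000 ms works out to
 * 512 * 2^20 * 1000 / 2001 bytes/sec, roughly 256 MiB/s; the +1 simply
 * guards against division by zero immediately after trim_start_time is
 * recorded.
 */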

/*
 * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
 * and number of concurrent TRIM I/Os.
 */
static int
vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
{
	vdev_t *vd = ta->trim_vdev;
	spa_t *spa = vd->vdev_spa;
	void *cb;

	mutex_enter(&vd->vdev_trim_io_lock);

	/*
	 * Limit manual TRIM I/Os to the requested rate. This does not
	 * apply to automatic TRIM since no per-vdev rate can be specified.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
		    vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
			cv_timedwait_sig(&vd->vdev_trim_io_cv,
			    &vd->vdev_trim_io_lock, ddi_get_lbolt() +
			    MSEC_TO_TICK(10));
		}
	}
	ta->trim_bytes_done += size;

	/* Limit in flight trimming I/Os */
	while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
	    vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	vd->vdev_trim_inflight[ta->trim_type]++;
	mutex_exit(&vd->vdev_trim_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL &&
	    vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_trim_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev_t will still be around since all consumers of
	 * vdev_free must stop the trimming first.
	 */
	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
	    vdev_trim_should_stop(vd)) ||
	    (ta->trim_type == TRIM_TYPE_AUTO &&
	    vdev_autotrim_should_stop(vd->vdev_top))) {
		mutex_enter(&vd->vdev_trim_io_lock);
		vd->vdev_trim_inflight[ta->trim_type]--;
		mutex_exit(&vd->vdev_trim_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_trim_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_trim_lock);

	if (ta->trim_type == TRIM_TYPE_MANUAL)
		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;

	if (ta->trim_type == TRIM_TYPE_MANUAL) {
		cb = vdev_trim_cb;
	} else if (ta->trim_type == TRIM_TYPE_AUTO) {
		cb = vdev_autotrim_cb;
	} else {
		cb = vdev_trim_simple_cb;
	}

	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
	    start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
	    ta->trim_flags));
	/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
 * Additional parameters describing how the TRIM should be performed must
 * be set in the trim_args structure. See the trim_args definition for
 * additional information.
 */
static int
vdev_trim_ranges(trim_args_t *ta)
{
	vdev_t *vd = ta->trim_vdev;
	zfs_btree_t *t = &ta->trim_tree->rt_root;
	zfs_btree_index_t idx;
	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
	spa_t *spa = vd->vdev_spa;

	ta->trim_start_time = gethrtime();
	ta->trim_bytes_done = 0;

	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
		    ta->trim_tree);

		if (extent_bytes_min && size < extent_bytes_min) {
			spa_iostats_trim_add(spa, ta->trim_type,
			    0, 0, 1, size, 0, 0);
			continue;
		}

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
			    rs_get_start(rs, ta->trim_tree) +
			    (w * extent_bytes_max), MIN(size -
			    (w * extent_bytes_max), extent_bytes_max));
			if (error != 0) {
				return (error);
			}
		}
	}

	return (0);
}
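
/*
 * For example, with the default zfs_trim_extent_bytes_max of 128 MiB a
 * 300 MiB extent above yields writes_required = ((300 - 1) / 128) + 1 = 3,
 * producing TRIM I/Os of 128 MiB, 128 MiB, and 44 MiB.
 */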

/*
 * Calculates the completion percentage of a manual TRIM.
 */
static void
vdev_trim_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_trim_bytes_est = 0;
	vd->vdev_trim_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
			vd->vdev_trim_bytes_done += ms_free;
			vd->vdev_trim_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of trimming this
		 * metaslab. Load it and walk the free tree for more
		 * accurate progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		range_tree_t *rt = msp->ms_allocatable;
		zfs_btree_t *bt = &rt->rt_root;
		zfs_btree_index_t idx;
		for (range_seg_t *rs = zfs_btree_first(bt, &idx);
		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_trim_bytes_est += size;
			if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
				vd->vdev_trim_bytes_done += size;
			} else if (vd->vdev_trim_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_trim_last_offset <=
			    physical_rs.rs_end) {
				vd->vdev_trim_bytes_done +=
				    vd->vdev_trim_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}

/*
 * Load from disk the vdev's manual TRIM information. This includes the
 * state, progress, and options provided when initiating the manual TRIM.
 */
static int
vdev_trim_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
	    vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
		    sizeof (vd->vdev_trim_last_offset), 1,
		    &vd->vdev_trim_last_offset);
		if (err == ENOENT) {
			vd->vdev_trim_last_offset = 0;
			err = 0;
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
			    sizeof (vd->vdev_trim_rate), 1,
			    &vd->vdev_trim_rate);
			if (err == ENOENT) {
				vd->vdev_trim_rate = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
			    sizeof (vd->vdev_trim_partial), 1,
			    &vd->vdev_trim_partial);
			if (err == ENOENT) {
				vd->vdev_trim_partial = 0;
				err = 0;
			}
		}

		if (err == 0) {
			err = zap_lookup(vd->vdev_spa->spa_meta_objset,
			    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
			    sizeof (vd->vdev_trim_secure), 1,
			    &vd->vdev_trim_secure);
			if (err == ENOENT) {
				vd->vdev_trim_secure = 0;
				err = 0;
			}
		}
	}

	vdev_trim_calculate_progress(vd);

	return (err);
}

/*
 * Convert the logical range into a physical range and add it to the
 * range tree passed in the trim_args_t.
 */
static void
vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;
	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	/*
	 * Every range to be trimmed must be part of ms_allocatable.
	 * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
	 * is always the case.
	 */
	if (zfs_flags & ZFS_DEBUG_TRIM) {
		metaslab_t *msp = ta->trim_msp;
		VERIFY0(metaslab_load(msp));
		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
		VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
	}

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/*
	 * Only a manual trim will be traversing the vdev sequentially.
	 * For an auto trim all valid ranges should be added.
	 */
	if (ta->trim_type == TRIM_TYPE_MANUAL) {

		/* Only add segments that we have not visited yet */
		if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
			return;

		/* Pick up where we left off mid-range. */
		if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
			ASSERT3U(physical_rs.rs_end, >,
			    vd->vdev_trim_last_offset);
			physical_rs.rs_start = vd->vdev_trim_last_offset;
		}
	}

	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's tree
	 * if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(ta->trim_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}

/*
 * Each manual TRIM thread is responsible for trimming the unallocated
 * space for each leaf vdev. This is accomplished by sequentially iterating
 * over its top-level metaslabs and issuing TRIM I/O for the space described
 * by its ms_allocatable. While a metaslab is undergoing trimming it is
 * not eligible for new allocations.
 */
static void
vdev_trim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	trim_args_t ta;
	int error = 0;

	/*
	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
	 * vdev_trim(). Wait for the updated values to be reflected
	 * in the zap in order to start with the requested settings.
	 */
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	VERIFY0(vdev_trim_load(vd));

	ta.trim_vdev = vd;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_flags = 0;

	/*
	 * When a secure TRIM has been requested infer that the intent
	 * is that everything must be trimmed. Override the default
	 * minimum TRIM size to prevent ranges from being skipped.
	 */
	if (vd->vdev_trim_secure) {
		ta.trim_flags |= ZIO_TRIM_SECURE;
		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	}

	uint64_t ms_count = 0;
	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_trim_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		/*
		 * If a partial TRIM was requested skip metaslabs which have
		 * never been initialized and thus have never been written.
		 */
		if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vdev_trim_calculate_progress(vd);
			continue;
		}

		ta.trim_msp = msp;
		range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
		range_tree_vacate(msp->ms_trim, NULL, NULL);
		mutex_exit(&msp->ms_lock);

		error = vdev_trim_ranges(&ta);
		metaslab_enable(msp, B_TRUE, B_FALSE);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(ta.trim_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[0] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
		    vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and must
	 * check to see if it needs to restart a trim. That thread will be
	 * holding the spa_config_lock which would prevent the txg_wait_synced
	 * from completing.
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	vd->vdev_trim_thread = NULL;
	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock,
 * the vdev_t must be a leaf and cannot already be manually trimming.
 */
void
vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_trim_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
	vd->vdev_trim_thread = thread_create(NULL, 0,
	    vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
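
/*
 * Illustrative only: callers outside this file (e.g. the 'zpool trim'
 * ioctl path) are expected to take the lock themselves, roughly:
 *
 *	mutex_enter(&vd->vdev_trim_lock);
 *	vdev_trim(vd, rate, partial, secure);
 *	mutex_exit(&vd->vdev_trim_lock);
 */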

/*
 * Wait for the trimming thread to be terminated (canceled or stopped).
 */
static void
vdev_trim_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));

	while (vd->vdev_trim_thread != NULL)
		cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);

	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
	vd->vdev_trim_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev trim threads which were listed to cleanly exit.
 */
void
vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_trim_lock);
	}
}

/*
 * Stop trimming a device, with the resultant trimming state being tgt_state.
 * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
 * provided the stopping vdev is inserted into the list. Callers are then
 * required to call vdev_trim_stop_wait() to block for all the trim threads
 * to exit. The caller must hold vdev_trim_lock and must not be writing to
 * the spa config, as the trimming thread may try to enter the config as a
 * reader before exiting.
 */
void
vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the trim thread has
	 * stopped.
	 */
	if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
		return;

	vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
	vd->vdev_trim_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_trim_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

/*
 * Requests that all listed vdevs stop trimming.
 */
static void
vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_trim_lock);
		vdev_trim_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_trim_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop trimming of a vdev tree and set all trim
 * thread pointers to NULL.
 */
void
vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;
	vdev_t *vd_l2cache;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_trim_node));

	vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);

	/*
	 * Iterate over cache devices and request stop trimming the
	 * whole device in case we export the pool or remove the cache
	 * device prematurely.
	 */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
		vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
	}

	vdev_trim_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

/*
 * Conditionally restarts a manual TRIM given its on-disk state.
 */
void
vdev_trim_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_trim_lock);
		uint64_t trim_state = VDEV_TRIM_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
		    sizeof (trim_state), 1, &trim_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_state = trim_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_trim_action_time = timestamp;

		if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_trim_load(vd));
		} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
		    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
		    vd->vdev_trim_thread == NULL) {
			VERIFY0(vdev_trim_load(vd));
			vdev_trim(vd, vd->vdev_trim_rate,
			    vd->vdev_trim_partial, vd->vdev_trim_secure);
		}

		mutex_exit(&vd->vdev_trim_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_trim_restart(vd->vdev_child[i]);
	}
}

/*
 * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
 * every TRIM range is contained within ms_allocatable.
/*
 * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
 * every TRIM range is contained within ms_allocatable.
 */
static void
vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
{
	trim_args_t *ta = arg;
	metaslab_t *msp = ta->trim_msp;

	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
	VERIFY3U(msp->ms_disabled, >, 0);
	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
}

/*
 * Each automatic TRIM thread is responsible for managing the trimming of a
 * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
 *
 * N.B. This behavior is different from a manual TRIM where a thread
 * is created for each leaf vdev, instead of each top-level vdev.
 */
static void
vdev_autotrim_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int shift = 0;

	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT3P(vd->vdev_top, ==, vd);
	ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
	mutex_exit(&vd->vdev_autotrim_lock);
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
	uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;

	while (!vdev_autotrim_should_stop(vd)) {
		int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
		boolean_t issued_trim = B_FALSE;

		/*
		 * All of the metaslabs are divided into groups of size
		 * num_metaslabs / zfs_trim_txg_batch. Each of these groups
		 * is composed of metaslabs which are spread evenly over the
		 * device.
		 *
		 * For example, when zfs_trim_txg_batch = 32 (default) then
		 * group 0 will contain metaslabs 0, 32, 64, ...;
		 * group 1 will contain metaslabs 1, 33, 65, ...;
		 * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
		 *
		 * On each pass through the while() loop one of these groups
		 * is selected. This is accomplished by using a shift value
		 * to select the starting metaslab, then striding over the
		 * metaslabs using the zfs_trim_txg_batch size. This is
		 * done to accomplish two things.
		 *
		 * 1) By dividing the metaslabs into groups, and ensuring
		 *    that each group takes a minimum of one txg to process,
		 *    zfs_trim_txg_batch controls the minimum number of txgs
		 *    which must occur before a metaslab is revisited.
		 *
		 * 2) Selecting non-consecutive metaslabs distributes the
		 *    TRIM commands for a group evenly over the entire device.
		 *    This can be advantageous for certain types of devices.
		 */
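		/*
		 * A minimal sketch (not part of the original comment) of the
		 * group-selection arithmetic performed by the for() loop
		 * below; the helper name is hypothetical:
		 *
		 *	static boolean_t
		 *	ms_in_current_group(uint64_t i, int shift, int batch)
		 *	{
		 *		return ((i % batch) == (shift % batch));
		 *	}
		 *
		 * That is, pass N (shift == N) visits exactly those metaslabs
		 * whose index is congruent to N modulo txgs_per_trim, so each
		 * metaslab is revisited only once every txgs_per_trim passes.
		 */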
		for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
		    i += txgs_per_trim) {
			metaslab_t *msp = vd->vdev_ms[i];
			range_tree_t *trim_tree;

			spa_config_exit(spa, SCL_CONFIG, FTAG);
			metaslab_disable(msp);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

			mutex_enter(&msp->ms_lock);

			/*
			 * Skip the metaslab when it has never been allocated
			 * or when there are no recent frees to trim.
			 */
			if (msp->ms_sm == NULL ||
			    range_tree_is_empty(msp->ms_trim)) {
				mutex_exit(&msp->ms_lock);
				metaslab_enable(msp, B_FALSE, B_FALSE);
				continue;
			}

			/*
			 * Skip the metaslab when it has already been disabled.
			 * This may happen when a manual TRIM or initialize
			 * operation is running concurrently. In the case
			 * of a manual TRIM, the ms_trim tree will have been
			 * vacated. Only ranges added after the manual TRIM
			 * disabled the metaslab will be included in the tree.
			 * These will be processed when the automatic TRIM
			 * next revisits this metaslab.
			 */
			if (msp->ms_disabled > 1) {
				mutex_exit(&msp->ms_lock);
				metaslab_enable(msp, B_FALSE, B_FALSE);
				continue;
			}

			/*
			 * Allocate an empty range tree which is swapped in
			 * for the existing ms_trim tree while it is processed.
			 */
			trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
			    0, 0);
			range_tree_swap(&msp->ms_trim, &trim_tree);
			ASSERT(range_tree_is_empty(msp->ms_trim));

			/*
			 * There are two cases when constructing the per-vdev
			 * trim trees for a metaslab. If the top-level vdev
			 * has no children then it is also a leaf and should
			 * be trimmed. Otherwise our children are the leaves
			 * and a trim tree should be constructed for each.
			 */
			trim_args_t *tap;
			uint64_t children = vd->vdev_children;
			if (children == 0) {
				children = 1;
				tap = kmem_zalloc(sizeof (trim_args_t) *
				    children, KM_SLEEP);
				tap[0].trim_vdev = vd;
			} else {
				tap = kmem_zalloc(sizeof (trim_args_t) *
				    children, KM_SLEEP);

				for (uint64_t c = 0; c < children; c++) {
					tap[c].trim_vdev = vd->vdev_child[c];
				}
			}

			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];
				vdev_t *cvd = ta->trim_vdev;

				ta->trim_msp = msp;
				ta->trim_extent_bytes_max = extent_bytes_max;
				ta->trim_extent_bytes_min = extent_bytes_min;
				ta->trim_type = TRIM_TYPE_AUTO;
				ta->trim_flags = 0;

				if (cvd->vdev_detached ||
				    !vdev_writeable(cvd) ||
				    !cvd->vdev_has_trim ||
				    cvd->vdev_trim_thread != NULL) {
					continue;
				}

				/*
				 * When a device has an attached hot spare, or
				 * is being replaced it will not be trimmed.
				 * This is done to avoid adding additional
				 * stress to a potentially unhealthy device,
				 * and to minimize the required rebuild time.
				 */
				if (!cvd->vdev_ops->vdev_op_leaf)
					continue;

				ta->trim_tree = range_tree_create(NULL,
				    RANGE_SEG64, NULL, 0, 0);
				range_tree_walk(trim_tree,
				    vdev_trim_range_add, ta);
			}

			mutex_exit(&msp->ms_lock);
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * Issue the TRIM I/Os for all ranges covered by the
			 * TRIM trees. These ranges are safe to TRIM because
			 * no new allocations will be performed until the call
			 * to metaslab_enable() below.
			 */
			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];

				/*
				 * Always yield to a manual TRIM if one has
				 * been started for the child vdev.
				 */
				if (ta->trim_tree == NULL ||
				    ta->trim_vdev->vdev_trim_thread != NULL) {
					continue;
				}

				/*
				 * After this point metaslab_enable() must be
				 * called with the sync flag set. This is done
				 * here because vdev_trim_ranges() is allowed
				 * to be interrupted (EINTR) before issuing all
				 * of the required TRIM I/Os.
				 */
				issued_trim = B_TRUE;

				int error = vdev_trim_ranges(ta);
				if (error)
					break;
			}

			/*
			 * Verify every range which was trimmed is still
			 * contained within the ms_allocatable tree.
			 */
			if (zfs_flags & ZFS_DEBUG_TRIM) {
				mutex_enter(&msp->ms_lock);
				VERIFY0(metaslab_load(msp));
				VERIFY3P(tap[0].trim_msp, ==, msp);
				range_tree_walk(trim_tree,
				    vdev_trim_range_verify, &tap[0]);
				mutex_exit(&msp->ms_lock);
			}

			range_tree_vacate(trim_tree, NULL, NULL);
			range_tree_destroy(trim_tree);

			metaslab_enable(msp, issued_trim, B_FALSE);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

			for (uint64_t c = 0; c < children; c++) {
				trim_args_t *ta = &tap[c];

				if (ta->trim_tree == NULL)
					continue;

				range_tree_vacate(ta->trim_tree, NULL, NULL);
				range_tree_destroy(ta->trim_tree);
			}

			kmem_free(tap, sizeof (trim_args_t) * children);
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		/*
		 * After completing the group of metaslabs wait for the next
		 * open txg. This is done to make sure that a minimum of
		 * zfs_trim_txg_batch txgs will occur before these metaslabs
		 * are trimmed again.
		 */
		txg_wait_open(spa_get_dsl(spa), 0, issued_trim);

		shift++;
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	}

	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		mutex_enter(&cvd->vdev_trim_io_lock);

		while (cvd->vdev_trim_inflight[TRIM_TYPE_AUTO] > 0) {
			cv_wait(&cvd->vdev_trim_io_cv,
			    &cvd->vdev_trim_io_lock);
		}
		mutex_exit(&cvd->vdev_trim_io_lock);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * When exiting because the autotrim property was set to off, then
	 * abandon any unprocessed ms_trim ranges to reclaim the memory.
	 */
	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
		for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
			metaslab_t *msp = vd->vdev_ms[i];

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_trim, NULL, NULL);
			mutex_exit(&msp->ms_lock);
		}
	}

	mutex_enter(&vd->vdev_autotrim_lock);
	ASSERT(vd->vdev_autotrim_thread != NULL);
	vd->vdev_autotrim_thread = NULL;
	cv_broadcast(&vd->vdev_autotrim_cv);
	mutex_exit(&vd->vdev_autotrim_lock);

	thread_exit();
}
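/*
 * A condensed sketch of the per-metaslab protocol implemented by
 * vdev_autotrim_thread() above (illustration only; the per-child trees,
 * locking details, and error handling are simplified):
 *
 *	metaslab_disable(msp);				// block new allocations
 *	range_tree_swap(&msp->ms_trim, &trim_tree);	// claim pending frees
 *	issued_trim = B_TRUE;
 *	(void) vdev_trim_ranges(&ta);			// issue the TRIM I/Os
 *	metaslab_enable(msp, issued_trim, B_FALSE);	// re-enable, syncing
 *
 * The sync flag is passed to metaslab_enable() because vdev_trim_ranges()
 * may return early (EINTR) before all of the required TRIM I/Os have been
 * issued.
 */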
/*
 * Starts an autotrim thread, if needed, for each top-level vdev which can be
 * trimmed. A top-level vdev which has been evacuated will never be trimmed.
 */
void
vdev_autotrim(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
		vdev_t *tvd = root_vd->vdev_child[i];

		mutex_enter(&tvd->vdev_autotrim_lock);
		if (vdev_writeable(tvd) && !tvd->vdev_removing &&
		    tvd->vdev_autotrim_thread == NULL) {
			ASSERT3P(tvd->vdev_top, ==, tvd);

			tvd->vdev_autotrim_thread = thread_create(NULL, 0,
			    vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
			    maxclsyspri);
			ASSERT(tvd->vdev_autotrim_thread != NULL);
		}
		mutex_exit(&tvd->vdev_autotrim_lock);
	}
}

/*
 * Wait for the vdev_autotrim_thread associated with the passed top-level
 * vdev to be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_wait(vdev_t *tvd)
{
	mutex_enter(&tvd->vdev_autotrim_lock);
	if (tvd->vdev_autotrim_thread != NULL) {
		tvd->vdev_autotrim_exit_wanted = B_TRUE;

		while (tvd->vdev_autotrim_thread != NULL) {
			cv_wait(&tvd->vdev_autotrim_cv,
			    &tvd->vdev_autotrim_lock);
		}

		ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
		tvd->vdev_autotrim_exit_wanted = B_FALSE;
	}
	mutex_exit(&tvd->vdev_autotrim_lock);
}

/*
 * Wait for all of the vdev_autotrim threads associated with the pool to
 * be terminated (canceled or stopped).
 */
void
vdev_autotrim_stop_all(spa_t *spa)
{
	vdev_t *root_vd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < root_vd->vdev_children; i++)
		vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
}

/*
 * Conditionally restart all of the vdev_autotrim threads for the pool.
 */
void
vdev_autotrim_restart(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (spa->spa_autotrim)
		vdev_autotrim(spa);
}

/*
 * Thread body for the manual TRIM issued over the full capacity of an
 * L2ARC device (see vdev_trim_l2arc() below).
 */
static void
vdev_trim_l2arc_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	l2arc_dev_t *dev = l2arc_vdev_get(vd);
	trim_args_t ta;
	range_seg64_t physical_rs;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_trim_last_offset = 0;
	vd->vdev_trim_rate = 0;
	vd->vdev_trim_partial = 0;
	vd->vdev_trim_secure = 0;

	bzero(&ta, sizeof (ta));
	ta.trim_vdev = vd;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_MANUAL;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

	physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
	physical_rs.rs_end = vd->vdev_trim_bytes_est =
	    vdev_get_min_asize(vd);

	range_tree_add(ta.trim_tree, physical_rs.rs_start,
	    physical_rs.rs_end - physical_rs.rs_start);

	mutex_enter(&vd->vdev_trim_lock);
	vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
	mutex_exit(&vd->vdev_trim_lock);

	(void) vdev_trim_ranges(&ta);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	range_tree_vacate(ta.trim_tree, NULL, NULL);
	range_tree_destroy(ta.trim_tree);

	mutex_enter(&vd->vdev_trim_lock);
	if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
		vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
		    vd->vdev_trim_rate, vd->vdev_trim_partial,
		    vd->vdev_trim_secure);
	}
	ASSERT(vd->vdev_trim_thread != NULL ||
	    vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);

	/*
	 * Drop the vdev_trim_lock while we sync out the txg since it's
	 * possible that a device might be trying to come online and
	 * must check to see if it needs to restart a trim. That thread
	 * will be holding the spa_config_lock which would prevent the
	 * txg_wait_synced from completing. Same strategy as in
	 * vdev_trim_thread().
	 */
	mutex_exit(&vd->vdev_trim_lock);
	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	mutex_enter(&vd->vdev_trim_lock);

	/*
	 * Update the header of the cache device here, before
	 * broadcasting vdev_trim_cv which may lead to the removal
	 * of the device. The same applies for setting l2ad_trim_all to
	 * false.
	 */
	spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, RW_READER);
	bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
	l2arc_dev_hdr_update(dev);
	spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);

	vd->vdev_trim_thread = NULL;
	if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
		dev->l2ad_trim_all = B_FALSE;

	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);

	thread_exit();
}

/*
 * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
 * to the vd->vdev_trim_thread variable. This facilitates trimming the whole
 * cache device using TRIM_TYPE_MANUAL when the device is added to a pool, at
 * pool creation, or when the header of the device is invalid.
 */
void
vdev_trim_l2arc(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Locate the spa's l2arc devices and kick off TRIM threads.
	 */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
		l2arc_dev_t *dev = l2arc_vdev_get(vd);

		if (dev == NULL || !dev->l2ad_trim_all) {
			/*
			 * Don't attempt TRIM if the vdev is UNAVAIL or if the
			 * cache device was not marked for whole device TRIM
			 * (i.e. l2arc_trim_ahead = 0, or the L2ARC device
			 * header is valid with trim_state = VDEV_TRIM_COMPLETE
			 * and l2ad_log_entries > 0).
			 */
			continue;
		}

		mutex_enter(&vd->vdev_trim_lock);
		ASSERT(vd->vdev_ops->vdev_op_leaf);
		ASSERT(vdev_is_concrete(vd));
		ASSERT3P(vd->vdev_trim_thread, ==, NULL);
		ASSERT(!vd->vdev_detached);
		ASSERT(!vd->vdev_trim_exit_wanted);
		ASSERT(!vd->vdev_top->vdev_removing);
		vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
		vd->vdev_trim_thread = thread_create(NULL, 0,
		    vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
		mutex_exit(&vd->vdev_trim_lock);
	}
}
/*
 * A wrapper which calls vdev_trim_ranges(). It is intended to be called
 * on leaf vdevs.
 */
int
vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
{
	trim_args_t ta;
	range_seg64_t physical_rs;
	int error;

	physical_rs.rs_start = start;
	physical_rs.rs_end = start + size;

	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_top->vdev_removing);

	bzero(&ta, sizeof (ta));
	ta.trim_vdev = vd;
	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	ta.trim_type = TRIM_TYPE_SIMPLE;
	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
	ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
	ta.trim_flags = 0;

	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(ta.trim_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}

	error = vdev_trim_ranges(&ta);

	mutex_enter(&vd->vdev_trim_io_lock);
	while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
		cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
	}
	mutex_exit(&vd->vdev_trim_io_lock);

	range_tree_vacate(ta.trim_tree, NULL, NULL);
	range_tree_destroy(ta.trim_tree);

	return (error);
}
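/*
 * A hypothetical usage sketch for vdev_trim_simple(); the helper below is
 * illustrative only and not part of this file. Callers are expected to pass
 * a healthy, concrete leaf vdev and physical offsets within the device:
 *
 *	static int
 *	trim_one_mib(vdev_t *vd)
 *	{
 *		// Discard 1 MiB just past the front vdev labels.
 *		return (vdev_trim_simple(vd, VDEV_LABEL_START_SIZE,
 *		    1ULL << 20));
 *	}
 *
 * Unlike a manual TRIM, the call blocks until every TRIM_TYPE_SIMPLE I/O
 * has completed, and it records no progress or state on disk.
 */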
EXPORT_SYMBOL(vdev_trim);
EXPORT_SYMBOL(vdev_trim_stop);
EXPORT_SYMBOL(vdev_trim_stop_all);
EXPORT_SYMBOL(vdev_trim_stop_wait);
EXPORT_SYMBOL(vdev_trim_restart);
EXPORT_SYMBOL(vdev_autotrim);
EXPORT_SYMBOL(vdev_autotrim_stop_all);
EXPORT_SYMBOL(vdev_autotrim_stop_wait);
EXPORT_SYMBOL(vdev_autotrim_restart);
EXPORT_SYMBOL(vdev_trim_l2arc);
EXPORT_SYMBOL(vdev_trim_simple);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
	"Max size of TRIM commands, larger will be split");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
	"Min size of TRIM commands, smaller will be skipped");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
	"Skip metaslabs which have never been initialized");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
	"Min number of txgs to aggregate frees before issuing TRIM");

ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
	"Max queued TRIMs outstanding per leaf vdev");
/* END CSTYLED */
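/*
 * Note: the tunables above are exposed through each platform's module
 * interface; for example, they typically appear under
 * /sys/module/zfs/parameters/zfs_trim_* on Linux and as vfs.zfs.trim.*
 * sysctls on FreeBSD. The exact paths are platform conventions and are
 * not defined by this file.
 */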