/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
 */

#include <sys/abd.h>
#include <sys/mmp.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/time.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_context.h>
#include <sys/callb.h>

/*
 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
 * or opening a pool on more than one host at a time.  In particular, it
 * prevents "zpool import -f" on a host from succeeding while the pool is
 * already imported on another host.  There are many other ways in which a
 * device could be used by two hosts for different purposes at the same time
 * resulting in pool damage.  This implementation does not attempt to detect
 * those cases.
 *
 * MMP operates by ensuring there are frequent visible changes on disk (a
 * "heartbeat") at all times, and by altering the import process to check
 * for these changes and fail the import when they are detected.  This
 * functionality is enabled by setting the 'multihost' pool property to on.
 *
 * Uberblocks written by the txg_sync thread always go into the first
 * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
 * They are used to hold uberblocks which are exactly the same as the last
 * synced uberblock except that the ub_timestamp and mmp_config are frequently
 * updated.  Like all other uberblocks, the slot is written with an embedded
 * checksum, and slots with invalid checksums are ignored.
 * This provides the "heartbeat", with no risk of overwriting good uberblocks
 * that must be preserved, e.g. previous txgs and associated block pointers.
 *
 * Three optional fields are added to the uberblock structure: ub_mmp_magic,
 * ub_mmp_config, and ub_mmp_delay.  The ub_mmp_magic value allows zfs to tell
 * whether the other ub_mmp_* fields are valid.  The ub_mmp_config field tells
 * the importing host the settings of zfs_multihost_interval and
 * zfs_multihost_fail_intervals on the host which last had (or currently has)
 * the pool imported.  These determine how long a host must wait to detect
 * activity in the pool before concluding the pool is not in use.  The
 * mmp_delay field is a decaying average of the amount of time between
 * completion of successive MMP writes, in nanoseconds.  It indicates whether
 * MMP is enabled.
 *
 * During import an activity test may now be performed to determine if
 * the pool is in use.  The activity test is typically required if the
 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
 * POOL_STATE_ACTIVE, and the pool is not a root pool.
 *
 * The activity test finds the "best" uberblock (highest txg, timestamp, and,
 * if ub_mmp_magic is valid, sequence number from ub_mmp_config).  It then
 * waits some time, and finds the "best" uberblock again.  If any of the
 * mentioned fields have different values in the newly read uberblock, the
 * pool is in use by another host and the import fails.  In order to assure
 * the accuracy of the activity test, the default values result in an
 * activity test duration of 20x the mmp write interval.
 *
 * The duration of the "zpool import" activity test depends on the information
 * available in the "best" uberblock:
 *
 * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
 *    ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
 *
 *    In this case, a weak guarantee is provided.  Since the host which last
 *    had the pool imported will suspend the pool if no mmp writes land within
 *    fail_intervals * multihost_interval ms, the absence of writes during
 *    that time means either the pool is not imported, or it is imported but
 *    the pool is suspended and no further writes will occur.
 *
 *    Note that resuming the suspended pool on the remote host would
 *    invalidate this guarantee, and so it is not allowed.
 *
 *    The factor of 2 provides a conservative safety factor and derives from
 *    MMP_IMPORT_SAFETY_FACTOR.
 *
 * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
 *    (ub_mmp_config.multihost_interval + ub_mmp_delay) *
 *        zfs_multihost_import_intervals
 *
 *    In this case no guarantee can be provided.  However, as long as some
 *    devices are healthy and connected, it is likely that at least one write
 *    will land within (multihost_interval + mmp_delay) because
 *    multihost_interval is enough time for a write to be attempted to each
 *    leaf vdev, and mmp_delay is enough for one to land, based on past
 *    delays.  Multiplying by zfs_multihost_import_intervals provides a
 *    conservative safety factor.
 *
 * 3) If uberblock was written by zfs-0.7:
 *    (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
 *
 *    The same logic as case #2 applies, but we do not know remote tunables.
 *
 *    We use the local value for zfs_multihost_interval because the original
 *    MMP did not record this value in the uberblock.
 *
 *    ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
 *    has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
 *    that.  We will have waited enough time for
 *    zfs_multihost_import_intervals writes to be issued and all but one to
 *    land.
 *
 *    single device pool example delays
 *
 *    import_delay = (1 + 1) * 20   =  40s  #defaults, no I/O delay
 *    import_delay = (1 + 10) * 20  = 220s  #defaults, 10s I/O delay
 *    import_delay = (10 + 10) * 20 = 400s  #10s multihost_interval,
 *                                           no I/O delay
 *
 *    100 device pool example delays
 *
 *    import_delay = (1 + .01) * 20 =  20s  #defaults, no I/O delay
 *    import_delay = (1 + 10) * 20  = 220s  #defaults, 10s I/O delay
 *    import_delay = (10 + .1) * 20 = 202s  #10s multihost_interval,
 *                                           no I/O delay
 *
 * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
 *    zfs_multihost_import_intervals * zfs_multihost_interval
 *
 *    In this case local tunables are used.  By default this product = 20s,
 *    long enough for a pool with any activity at all to write at least one
 *    uberblock.  No guarantee can be provided.
 *
 * Additionally, the duration is then extended by a random 25% to attempt to
 * detect simultaneous imports, for example if both partner hosts are rebooted
 * at the same time and automatically attempt to import the pool.
 */
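
/*
 * Worked example (illustrative only, assuming the default tunables described
 * above: multihost_interval = 1000 ms, fail_intervals = 10,
 * import_intervals = 20, single leaf vdev, no I/O delay):
 *
 *	case 1: fail_intervals * multihost_interval * 2
 *	        = 10 * 1000 ms * 2                         =  20 s
 *	case 2: (multihost_interval + mmp_delay) * import_intervals
 *	        = (1000 ms + ~1000 ms) * 20                = ~40 s
 *
 * Both are then extended by the random 25% noted above, so a default-tuned
 * import waits on the order of 20-50 seconds before declaring the pool
 * unused.
 */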

/*
 * Used to control the frequency of mmp writes which are performed when the
 * 'multihost' pool property is on.  This is one factor used to determine the
 * length of the activity check during import.
 *
 * On average an mmp write will be issued for each leaf vdev every
 * zfs_multihost_interval milliseconds.  In practice, the observed period can
 * vary with the I/O load and this observed value is the ub_mmp_delay which is
 * stored in the uberblock.  The minimum allowed value is 100 ms.
 */
ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;

/*
 * Used to control the duration of the activity test on import.  Smaller
 * values of zfs_multihost_import_intervals will reduce the import time but
 * increase the risk of failing to detect an active pool.  The total activity
 * check time is never allowed to drop below one second.  A value of 0 is
 * ignored and treated as if it was set to 1.
 */
uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;

/*
 * Controls the behavior of the pool when mmp write failures or delays are
 * detected.
 *
 * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
 * ignored.  The failures will still be reported to the ZED, which, depending
 * on its configuration, may take action such as suspending the pool or
 * taking a device offline.
 *
 * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
 * without a successful mmp write.  This guarantees the activity test will see
 * mmp writes if the pool is imported.  A value of 1 is ignored and treated as
 * if it was set to 2, because a single leaf vdev pool will issue a write once
 * per multihost_interval and thus any variation in latency would cause the
 * pool to be suspended.
 */
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
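
/*
 * Example (illustrative only, default tunables as described above): with
 * zfs_multihost_interval = 1000 ms and zfs_multihost_fail_intervals = 10,
 * the pool is suspended if no mmp write succeeds for 10 * 1000 ms = 10 s,
 * while an importing host honoring case 1 above waits at least twice that
 * long before concluding the pool is unused.
 */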

char *mmp_tag = "mmp_write_uberblock";
static void mmp_thread(void *arg);

void
mmp_init(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
	mmp->mmp_kstat_id = 1;

	/*
	 * mmp_write_done() calculates mmp_delay based on prior mmp_delay and
	 * the elapsed time since the last write.  For the first mmp write,
	 * there is no "last write", so we start with fake non-zero values.
	 */
	mmp->mmp_last_write = gethrtime();
	mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
}

void
mmp_fini(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_destroy(&mmp->mmp_thread_lock);
	cv_destroy(&mmp->mmp_thread_cv);
	mutex_destroy(&mmp->mmp_io_lock);
}

static void
mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
	mutex_enter(&mmp->mmp_thread_lock);
}

static void
mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
{
	ASSERT(*mpp != NULL);
	*mpp = NULL;
	cv_broadcast(&mmp->mmp_thread_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &mmp->mmp_thread_lock */
	thread_exit();
}

void
mmp_thread_start(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	if (spa_writeable(spa)) {
		mutex_enter(&mmp->mmp_thread_lock);
		if (!mmp->mmp_thread) {
			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
			    spa, 0, &p0, TS_RUN, defclsyspri);
			zfs_dbgmsg("MMP thread started pool '%s' "
			    "gethrtime %llu", spa_name(spa), gethrtime());
		}
		mutex_exit(&mmp->mmp_thread_lock);
	}
}

void
mmp_thread_stop(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	mmp->mmp_thread_exiting = 1;
	cv_broadcast(&mmp->mmp_thread_cv);

	while (mmp->mmp_thread) {
		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
	}
	mutex_exit(&mmp->mmp_thread_lock);
	zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
	    spa_name(spa), gethrtime());

	ASSERT(mmp->mmp_thread == NULL);
	mmp->mmp_thread_exiting = 0;
}

typedef enum mmp_vdev_state_flag {
	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
	MMP_FAIL_WRITE_PENDING	= (1 << 1),
} mmp_vdev_state_flag_t;

/*
 * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
 * mmp write (if so a new write will also likely block).  If there is no
 * usable leaf, a nonzero error value is returned.  The error value returned
 * is a bit field.
 *
 * MMP_FAIL_WRITE_PENDING	One or more leaf vdevs are writeable, but have
 *				an outstanding MMP write.
 * MMP_FAIL_NOT_WRITABLE	One or more leaf vdevs are not writeable.
 */
static int
mmp_next_leaf(spa_t *spa)
{
	vdev_t *leaf;
	vdev_t *starting_leaf;
	int fail_mask = 0;

	ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
	ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
	ASSERT(!list_is_empty(&spa->spa_leaf_list));

	if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
		spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
		spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
	}

	leaf = spa->spa_mmp.mmp_last_leaf;
	if (leaf == NULL)
		leaf = list_head(&spa->spa_leaf_list);
	starting_leaf = leaf;
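
	/*
	 * Walk the leaf list round-robin, starting with the leaf after the
	 * one used for the previous MMP write, so that successive writes
	 * rotate across all usable leaves.
	 */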
	do {
		leaf = list_next(&spa->spa_leaf_list, leaf);
		if (leaf == NULL)
			leaf = list_head(&spa->spa_leaf_list);

		if (!vdev_writeable(leaf)) {
			fail_mask |= MMP_FAIL_NOT_WRITABLE;
		} else if (leaf->vdev_mmp_pending != 0) {
			fail_mask |= MMP_FAIL_WRITE_PENDING;
		} else {
			spa->spa_mmp.mmp_last_leaf = leaf;
			return (0);
		}
	} while (leaf != starting_leaf);

	ASSERT(fail_mask);

	return (fail_mask);
}

/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals.  The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling
 * delays, etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the
 * latest delay is higher we do not average it, so that we do not hide sudden
 * spikes which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg
 * syncs, the mmp_delay could become very small.  Since those short delays
 * depend on activity we cannot count on, we never allow mmp_delay to get
 * lower than the rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer
 * than mmp_delay, we need to update it so the next write reflects the longer
 * delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * Strictly less than, in case delay was changed above.
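	 * Fold the new sample into the decaying average with a 1/128 weight,
	 * but never let mmp_delay fall below the expected interval between
	 * mmp_thread writes, zfs_multihost_interval divided by the number of
	 * leaf vdevs.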
	 */
	if (delay < mts->mmp_delay) {
		hrtime_t min_delay =
		    MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
		    MAX(1, vdev_count_leaves(spa));
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}

static void
mmp_write_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	mmp_thread_t *mts = zio->io_private;

	mutex_enter(&mts->mmp_io_lock);
	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;

	mmp_delay_update(spa, (zio->io_error == 0));

	vd->vdev_mmp_pending = 0;
	vd->vdev_mmp_kstat_id = 0;

	mutex_exit(&mts->mmp_io_lock);
	spa_config_exit(spa, SCL_STATE, mmp_tag);

	spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
	    mmp_write_duration);

	abd_free(zio->io_abd);
}

/*
 * When the uberblock on-disk is updated by a spa_sync,
 * creating a new "best" uberblock, update the one stored
 * in the mmp thread state, used for mmp writes.
 */
void
mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_io_lock);
	mmp->mmp_ub = *ub;
	mmp->mmp_seq = 1;
	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
	mmp_delay_update(spa, B_TRUE);
	mutex_exit(&mmp->mmp_io_lock);
}

/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
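 *
 * The SCL_STATE config lock is taken here as reader and is not released
 * until the write completes in mmp_write_done(), or immediately if no
 * usable leaf vdev could be selected.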
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd = NULL;
	int label, error;
	uint64_t offset;

	hrtime_t lock_acquire_time = gethrtime();
	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
	lock_acquire_time = gethrtime() - lock_acquire_time;
	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
		    "gethrtime %llu", spa_name(spa), lock_acquire_time,
		    gethrtime());

	mutex_enter(&mmp->mmp_io_lock);

	error = mmp_next_leaf(spa);

	/*
	 * spa_mmp_history has two types of entries:
	 * Issued MMP write: records time issued, error status, etc.
	 * Skipped MMP write: an MMP write could not be issued because no
	 * suitable leaf vdev was available.  See comment above struct
	 * spa_mmp_history for details.
	 */
	if (error) {
		mmp_delay_update(spa, B_FALSE);
		if (mmp->mmp_skip_error == error) {
			spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
		} else {
			mmp->mmp_skip_error = error;
			spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
			    gethrestime_sec(), mmp->mmp_delay, NULL, 0,
			    mmp->mmp_kstat_id++, error);
			zfs_dbgmsg("MMP error choosing leaf pool '%s' "
			    "gethrtime %llu fail_mask %#x", spa_name(spa),
			    gethrtime(), error);
		}
		mutex_exit(&mmp->mmp_io_lock);
		spa_config_exit(spa, SCL_STATE, mmp_tag);
		return;
	}

	vd = spa->spa_mmp.mmp_last_leaf;
	if (mmp->mmp_skip_error != 0) {
		mmp->mmp_skip_error = 0;
		zfs_dbgmsg("MMP write after skipping due to unavailable "
		    "leaves, pool '%s' gethrtime %llu leaf %#llu",
		    spa_name(spa), gethrtime(), vd->vdev_guid);
	}

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
		/*
		 * Want to reset mmp_seq when timestamp advances because after
		 * an mmp_seq wrap new values will not be chosen by
		 * uberblock_compare() as the "best".
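		 * The sequence number only needs to order the MMP writes that
		 * share the same one-second ub_timestamp, so it can safely
		 * restart at 1 whenever the timestamp advances.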
		 */
		mmp->mmp_ub.ub_timestamp = gethrestime_sec();
		mmp->mmp_seq = 1;
	}

	ub = &mmp->mmp_ub;
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
	    MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
	    MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
	    zfs_multihost_fail_intervals));
	vd->vdev_mmp_pending = gethrtime();
	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;

	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mmp->mmp_seq++;
	mmp->mmp_kstat_id++;
	mutex_exit(&mmp->mmp_io_lock);

	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	(void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
	    ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);

	zio_nowait(zio);
}

static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t suspended = spa_suspended(spa);
	boolean_t multihost = spa_multihost(spa);
	uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
	    zfs_multihost_interval));
	uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
	    zfs_multihost_fail_intervals);
	hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
	boolean_t last_spa_suspended = suspended;
	boolean_t last_spa_multihost = multihost;
	uint64_t last_mmp_interval = mmp_interval;
	uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
	hrtime_t last_mmp_fail_ns = mmp_fail_ns;
	callb_cpr_t cpr;
	int skip_wait = 0;

	mmp_thread_enter(mmp, &cpr);

	while (!mmp->mmp_thread_exiting) {
		hrtime_t next_time = gethrtime() +
		    MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		int leaves = MAX(vdev_count_leaves(spa), 1);

		/* Detect changes in tunables or state */

		last_spa_suspended = suspended;
		last_spa_multihost = multihost;
		suspended = spa_suspended(spa);
		multihost = spa_multihost(spa);

		last_mmp_interval = mmp_interval;
		last_mmp_fail_intervals = mmp_fail_intervals;
		last_mmp_fail_ns = mmp_fail_ns;
		mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
		    zfs_multihost_interval));
		mmp_fail_intervals = MMP_FAIL_INTVS_OK(
		    zfs_multihost_fail_intervals);

		/* Smooth so pool is not suspended when reducing tunables */
		if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
			mmp_fail_ns = (mmp_fail_ns * 31 +
			    mmp_fail_intervals * mmp_interval) / 32;
		} else {
			mmp_fail_ns = mmp_fail_intervals *
			    mmp_interval;
		}

		if (mmp_interval != last_mmp_interval ||
		    mmp_fail_intervals != last_mmp_fail_intervals) {
			/*
			 * We want other hosts to see new tunables as quickly
			 * as possible.  Write out at higher frequency than
			 * usual.
			 */
			skip_wait += leaves;
		}

		if (multihost)
			next_time = gethrtime() + mmp_interval / leaves;

		if (mmp_fail_ns != last_mmp_fail_ns) {
			zfs_dbgmsg("MMP interval change pool '%s' "
			    "gethrtime %llu last_mmp_interval %llu "
			    "mmp_interval %llu last_mmp_fail_intervals %u "
			    "mmp_fail_intervals %u mmp_fail_ns %llu "
			    "skip_wait %d leaves %d next_time %llu",
			    spa_name(spa), gethrtime(), last_mmp_interval,
			    mmp_interval, last_mmp_fail_intervals,
			    mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves,
			    next_time);
		}

		/*
		 * MMP off => on, or suspended => !suspended:
		 * No writes occurred recently.  Update mmp_last_write to give
		 * us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			zfs_dbgmsg("MMP state change pool '%s': gethrtime "
			    "%llu last_spa_multihost %u multihost %u "
			    "last_spa_suspended %u suspended %u",
			    spa_name(spa), gethrtime(), last_spa_multihost,
			    multihost, last_spa_suspended, suspended);
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mmp->mmp_delay = mmp_interval;
			mutex_exit(&mmp->mmp_io_lock);
		}

		/*
		 * MMP on => off:
		 * mmp_delay == 0 tells importing node to skip activity check.
		 */
		if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (multihost && !suspended && mmp_fail_intervals &&
		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
			    "mmp_last_write %llu mmp_interval %llu "
			    "mmp_fail_intervals %llu mmp_fail_ns %llu",
			    spa_name(spa), (u_longlong_t)gethrtime(),
			    (u_longlong_t)mmp->mmp_last_write,
			    (u_longlong_t)mmp_interval,
			    (u_longlong_t)mmp_fail_intervals,
			    (u_longlong_t)mmp_fail_ns);
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llu ms; suspending pool. "
			    "Hrtime %llu",
			    spa_name(spa),
			    NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
			    gethrtime());
			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
		}

		if (multihost && !suspended)
			mmp_write_uberblock(spa);

		if (skip_wait > 0) {
			next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
			    leaves;
			skip_wait--;
		}

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
		    CALLOUT_FLAG_ABSOLUTE);
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}

/*
 * Signal the MMP thread to wake it, when it is sleeping on
 * its cv.  Used when some module parameter has changed and
 * we want the thread to know about it.
 * Only signal if the pool is active and mmp thread is
 * running, otherwise there is no thread to wake.
 */
static void
mmp_signal_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	if (mmp->mmp_thread)
		cv_broadcast(&mmp->mmp_thread_cv);
	mutex_exit(&mmp->mmp_thread_lock);
}

void
mmp_signal_all_threads(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa))) {
		if (spa->spa_state == POOL_STATE_ACTIVE)
			mmp_signal_thread(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
	param_set_multihost_interval, param_get_ulong, ZMOD_RW,
	"Milliseconds between mmp writes to each leaf");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW,
	"Max allowed period without a successful mmp write");

ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW,
	"Number of zfs_multihost_interval periods to wait for activity");
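
/*
 * Usage note (informational): these are ordinary zfs module parameters, so on
 * Linux they are expected to appear under /sys/module/zfs/parameters/ and may
 * be changed at runtime.  The interval parameter is registered with
 * param_set_multihost_interval as its set callback, presumably so that a
 * running mmp thread can be signaled (via mmp_signal_all_threads()) and pick
 * up the new value without waiting out its current sleep.
 */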