/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault.  These are kept in a global list.  Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device-level injection is done using the 'zi_guid' field.  If this is set,
 * it means that the error is destined for a particular device, not a piece
 * of data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled;

/*
 * Data describing each zinject handler registered on the system; this
 * includes the list node linking the handler into the global zinject
 * handler list.
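 *
 * For ZINJECT_DELAY_IO handlers, zi_lanes points to an array of
 * zi_nlanes completion timestamps (one per simulated "lane") and
 * zi_next_lane indexes the lane that will become idle the soonest;
 * see zio_handle_io_delay() below.  For all other handler types,
 * zi_lanes is NULL and zi_next_lane is unused.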
 */
typedef struct inject_handler {
	int			zi_id;
	spa_t			*zi_spa;
	zinject_record_t	zi_record;
	uint64_t		*zi_lanes;
	int			zi_next_lane;
	list_node_t		zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as the inject_delay_count.  Any time a
 * handler is inserted into or removed from the list, this lock should
 * be taken as RW_WRITER; any time the list is traversed (without
 * modification) it should be taken as RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system.  It is protected by the inject_lock defined
 * above.  Thus modifications to this count must be made while holding
 * the inject_lock as RW_WRITER, and reads of this count must hold it
 * as (at least) RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(); refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type.
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (record->zi_freq == 0 ||
			    spa_get_random(100) < record->zi_freq);
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
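	 * "Exact" here means the bookmark's objset, object, and level
	 * equal the record's, the block ID falls within the record's
	 * [zi_start, zi_end] range, and the error matches the one the
	 * record was registered to inject.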
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    error == record->zi_error)
		return (record->zi_freq == 0 ||
		    spa_get_random(100) < record->zi_freq);

	return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0)
			panic("Panic requested in function %s\n", tag);
	}

	rw_exit(&inject_lock);
}

/*
 * Determine if the I/O in question should return failure.  Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label.  Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label.  We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during
	 * device open (i.e. zio == NULL).
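	 * Label-region faults for in-flight I/O are instead matched by
	 * zio_handle_label_injection() above.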
	 */
	if (zio != NULL) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == error) {
				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (error == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				ret = error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Simulate hardware that ignores cache flushes.  For the requested number
 * of seconds, nix the actual writing to disk.
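 *
 * Writes still appear to succeed to the caller: for most of them we
 * simply strip the vdev I/O stages from the pipeline, so the data is
 * never written out, much like a device that acknowledges a cache
 * flush it never performed.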
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz >
			    ddi_get_lbolt64());
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers.  These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here.  A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed.  This short circuit keeps us from acquiring the
	 * inject_delay_mtx unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it.  Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field.  Thus, if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms.  So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler.  The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2).  This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
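	 *
	 * As a worked illustration (numbers invented for this example):
	 * with a single lane and a 10ms zi_timer, three requests that
	 * arrive together at time T are assigned completion targets of
	 * T+10ms, T+20ms, and T+30ms.  With two lanes, the same three
	 * requests are assigned T+10ms, T+10ms, and T+20ms: each
	 * request still takes at least 10ms, but twice as many complete
	 * per interval.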
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/*
		 * Defensive; should never happen, as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen; the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes.  We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler, as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest).  This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete.  If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer nanoseconds.  Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer nanoseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
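		 *
		 * For example (hypothetical values): with a zi_timer of
		 * 10ms and gethrtime() returning 100ms, an idle lane
		 * yields a target of 110ms; if the lane is instead busy
		 * until 115ms, the target becomes 125ms.  In both cases
		 * the target is zi_timer past the moment the lane frees
		 * up.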
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations.  Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev; otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

/*
 * Create a new handler for the given record.  We add it to the list, adding
 * a reference to the spa_t in the process.  We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
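 *
 * This is normally reached from userland: the zinject(1M) command
 * builds a zinject_record_t and passes it down via the
 * ZFS_IOC_INJECT_FAULT ioctl.  (As a hypothetical example of the
 * delay-handler path, an invocation along the lines of
 * "zinject -d <vdev> -D 25:2 <pool>" would ask for a 25ms delay
 * spread over two lanes, assuming the delay:lanes form of the -D
 * option.)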
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler.  Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large",
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * spa_inject_addref() will add an injection reference,
		 * which will prevent the pool from being removed from the
		 * namespace while still allowing it to be unloaded.
		 */
		if ((spa = spa_inject_addref(name)) == NULL)
			return (SET_ERROR(ENOENT));

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

		handler->zi_spa = spa;
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
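		 * (The KM_SLEEP allocation may block, and sleeping
		 * while write-holding inject_lock would stall every
		 * reader traversing the handler list.)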
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer.  Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function.  Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
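 *
 * The handler is unlinked while write-holding inject_lock, but its
 * zi_lanes array and the handler itself are freed only after the lock
 * is dropped; once unlinked, no other thread can reach the handler.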
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}