1ea8dc4b6Seschrock /*
2ea8dc4b6Seschrock * CDDL HEADER START
3ea8dc4b6Seschrock *
4ea8dc4b6Seschrock * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock * You may not use this file except in compliance with the License.
7ea8dc4b6Seschrock *
8ea8dc4b6Seschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9ea8dc4b6Seschrock * or http://www.opensolaris.org/os/licensing.
10ea8dc4b6Seschrock * See the License for the specific language governing permissions
11ea8dc4b6Seschrock * and limitations under the License.
12ea8dc4b6Seschrock *
13ea8dc4b6Seschrock * When distributing Covered Code, include this CDDL HEADER in each
14ea8dc4b6Seschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15ea8dc4b6Seschrock * If applicable, add the following below this CDDL HEADER, with the
16ea8dc4b6Seschrock * fields enclosed by brackets "[]" replaced with your own identifying
17ea8dc4b6Seschrock * information: Portions Copyright [yyyy] [name of copyright owner]
18ea8dc4b6Seschrock *
19ea8dc4b6Seschrock * CDDL HEADER END
20ea8dc4b6Seschrock */
21ea8dc4b6Seschrock /*
2298d1cbfeSGeorge Wilson * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23*97e81309SPrakash Surya * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24ea8dc4b6Seschrock */
25ea8dc4b6Seschrock
26ea8dc4b6Seschrock /*
27ea8dc4b6Seschrock * ZFS fault injection
28ea8dc4b6Seschrock *
29ea8dc4b6Seschrock * To handle fault injection, we keep track of a series of zinject_record_t
30ea8dc4b6Seschrock * structures which describe which logical block(s) should be injected with a
31ea8dc4b6Seschrock * fault. These are kept in a global list. Each record corresponds to a given
32ea8dc4b6Seschrock * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
33ea8dc4b6Seschrock * or exported while the injection record exists.
34ea8dc4b6Seschrock *
35ea8dc4b6Seschrock * Device level injection is done using the 'zi_guid' field. If this is set, it
36ea8dc4b6Seschrock * means that the error is destined for a particular device, not a piece of
37ea8dc4b6Seschrock * data.
38ea8dc4b6Seschrock *
39ea8dc4b6Seschrock * This is a rather poor data structure and algorithm, but we don't expect more
40ea8dc4b6Seschrock * than a few faults at any one time, so it should be sufficient for our needs.
41ea8dc4b6Seschrock */
42ea8dc4b6Seschrock
43ea8dc4b6Seschrock #include <sys/arc.h>
44ea8dc4b6Seschrock #include <sys/zio_impl.h>
45ea8dc4b6Seschrock #include <sys/zfs_ioctl.h>
46ea8dc4b6Seschrock #include <sys/vdev_impl.h>
47b24ab676SJeff Bonwick #include <sys/dmu_objset.h>
4821bf64a7Sgw25295 #include <sys/fs/zfs.h>
49ea8dc4b6Seschrock
50ea8dc4b6Seschrock uint32_t zio_injection_enabled;
51ea8dc4b6Seschrock
52*97e81309SPrakash Surya /*
53*97e81309SPrakash Surya * Data describing each zinject handler registered on the system, and
54*97e81309SPrakash Surya * contains the list node linking the handler in the global zinject
55*97e81309SPrakash Surya * handler list.
56*97e81309SPrakash Surya */
typedef struct inject_handler {
	int zi_id;			/* unique id, assigned from inject_next_id */
	spa_t *zi_spa;			/* pool this handler is bound to (injection ref held) */
	zinject_record_t zi_record;	/* injection criteria supplied by userland */
	uint64_t *zi_lanes;		/* per-lane "becomes idle" times; ZINJECT_DELAY_IO only */
	int zi_next_lane;		/* index of the next lane to assign a delayed I/O */
	list_node_t zi_link;		/* linkage on the global inject_handlers list */
} inject_handler_t;
65ea8dc4b6Seschrock
66*97e81309SPrakash Surya /*
67*97e81309SPrakash Surya * List of all zinject handlers registered on the system, protected by
68*97e81309SPrakash Surya * the inject_lock defined below.
69*97e81309SPrakash Surya */
70ea8dc4b6Seschrock static list_t inject_handlers;
71*97e81309SPrakash Surya
72*97e81309SPrakash Surya /*
73*97e81309SPrakash Surya * This protects insertion into, and traversal of, the inject handler
74*97e81309SPrakash Surya * list defined above; as well as the inject_delay_count. Any time a
75*97e81309SPrakash Surya * handler is inserted or removed from the list, this lock should be
76*97e81309SPrakash Surya * taken as a RW_WRITER; and any time traversal is done over the list
77*97e81309SPrakash Surya * (without modification to it) this lock should be taken as a RW_READER.
78*97e81309SPrakash Surya */
79ea8dc4b6Seschrock static krwlock_t inject_lock;
80*97e81309SPrakash Surya
81*97e81309SPrakash Surya /*
82*97e81309SPrakash Surya * This holds the number of zinject delay handlers that have been
83*97e81309SPrakash Surya * registered on the system. It is protected by the inject_lock defined
84*97e81309SPrakash Surya * above. Thus modifications to this count must be a RW_WRITER of the
85*97e81309SPrakash Surya * inject_lock, and reads of this count must be (at least) a RW_READER
86*97e81309SPrakash Surya * of the lock.
87*97e81309SPrakash Surya */
88*97e81309SPrakash Surya static int inject_delay_count = 0;
89*97e81309SPrakash Surya
90*97e81309SPrakash Surya /*
91*97e81309SPrakash Surya * This lock is used only in zio_handle_io_delay(), refer to the comment
92*97e81309SPrakash Surya * in that function for more details.
93*97e81309SPrakash Surya */
94*97e81309SPrakash Surya static kmutex_t inject_delay_mtx;
95*97e81309SPrakash Surya
96*97e81309SPrakash Surya /*
97*97e81309SPrakash Surya * Used to assign unique identifying numbers to each new zinject handler.
98*97e81309SPrakash Surya */
99ea8dc4b6Seschrock static int inject_next_id = 1;
100ea8dc4b6Seschrock
101ea8dc4b6Seschrock /*
102ea8dc4b6Seschrock * Returns true if the given record matches the I/O in progress.
103ea8dc4b6Seschrock */
104ea8dc4b6Seschrock static boolean_t
zio_match_handler(zbookmark_phys_t * zb,uint64_t type,zinject_record_t * record,int error)1057802d7bfSMatthew Ahrens zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
106ea8dc4b6Seschrock zinject_record_t *record, int error)
107ea8dc4b6Seschrock {
108ea8dc4b6Seschrock /*
109ea8dc4b6Seschrock * Check for a match against the MOS, which is based on type
110ea8dc4b6Seschrock */
111b24ab676SJeff Bonwick if (zb->zb_objset == DMU_META_OBJSET &&
112b24ab676SJeff Bonwick record->zi_objset == DMU_META_OBJSET &&
113b24ab676SJeff Bonwick record->zi_object == DMU_META_DNODE_OBJECT) {
114ea8dc4b6Seschrock if (record->zi_type == DMU_OT_NONE ||
115ea8dc4b6Seschrock type == record->zi_type)
116ea8dc4b6Seschrock return (record->zi_freq == 0 ||
117ea8dc4b6Seschrock spa_get_random(100) < record->zi_freq);
118ea8dc4b6Seschrock else
119ea8dc4b6Seschrock return (B_FALSE);
120ea8dc4b6Seschrock }
121ea8dc4b6Seschrock
122ea8dc4b6Seschrock /*
123ea8dc4b6Seschrock * Check for an exact match.
124ea8dc4b6Seschrock */
125ea8dc4b6Seschrock if (zb->zb_objset == record->zi_objset &&
126ea8dc4b6Seschrock zb->zb_object == record->zi_object &&
127ea8dc4b6Seschrock zb->zb_level == record->zi_level &&
128ea8dc4b6Seschrock zb->zb_blkid >= record->zi_start &&
129ea8dc4b6Seschrock zb->zb_blkid <= record->zi_end &&
130ea8dc4b6Seschrock error == record->zi_error)
131ea8dc4b6Seschrock return (record->zi_freq == 0 ||
132ea8dc4b6Seschrock spa_get_random(100) < record->zi_freq);
133ea8dc4b6Seschrock
134ea8dc4b6Seschrock return (B_FALSE);
135ea8dc4b6Seschrock }
136ea8dc4b6Seschrock
137ea8dc4b6Seschrock /*
13888ecc943SGeorge Wilson * Panic the system when a config change happens in the function
13988ecc943SGeorge Wilson * specified by tag.
14088ecc943SGeorge Wilson */
14188ecc943SGeorge Wilson void
zio_handle_panic_injection(spa_t * spa,char * tag,uint64_t type)1421195e687SMark J Musante zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
14388ecc943SGeorge Wilson {
14488ecc943SGeorge Wilson inject_handler_t *handler;
14588ecc943SGeorge Wilson
14688ecc943SGeorge Wilson rw_enter(&inject_lock, RW_READER);
14788ecc943SGeorge Wilson
14888ecc943SGeorge Wilson for (handler = list_head(&inject_handlers); handler != NULL;
14988ecc943SGeorge Wilson handler = list_next(&inject_handlers, handler)) {
15088ecc943SGeorge Wilson
15188ecc943SGeorge Wilson if (spa != handler->zi_spa)
15288ecc943SGeorge Wilson continue;
15388ecc943SGeorge Wilson
1541195e687SMark J Musante if (handler->zi_record.zi_type == type &&
1551195e687SMark J Musante strcmp(tag, handler->zi_record.zi_func) == 0)
15688ecc943SGeorge Wilson panic("Panic requested in function %s\n", tag);
15788ecc943SGeorge Wilson }
15888ecc943SGeorge Wilson
15988ecc943SGeorge Wilson rw_exit(&inject_lock);
16088ecc943SGeorge Wilson }
16188ecc943SGeorge Wilson
16288ecc943SGeorge Wilson /*
163ea8dc4b6Seschrock * Determine if the I/O in question should return failure. Returns the errno
164ea8dc4b6Seschrock * to be returned to the caller.
165ea8dc4b6Seschrock */
166ea8dc4b6Seschrock int
zio_handle_fault_injection(zio_t * zio,int error)167ea8dc4b6Seschrock zio_handle_fault_injection(zio_t *zio, int error)
168ea8dc4b6Seschrock {
169ea8dc4b6Seschrock int ret = 0;
170ea8dc4b6Seschrock inject_handler_t *handler;
171ea8dc4b6Seschrock
172ea8dc4b6Seschrock /*
173ea8dc4b6Seschrock * Ignore I/O not associated with any logical data.
174ea8dc4b6Seschrock */
175ea8dc4b6Seschrock if (zio->io_logical == NULL)
176ea8dc4b6Seschrock return (0);
177ea8dc4b6Seschrock
178ea8dc4b6Seschrock /*
179ea8dc4b6Seschrock * Currently, we only support fault injection on reads.
180ea8dc4b6Seschrock */
181ea8dc4b6Seschrock if (zio->io_type != ZIO_TYPE_READ)
182ea8dc4b6Seschrock return (0);
183ea8dc4b6Seschrock
184ea8dc4b6Seschrock rw_enter(&inject_lock, RW_READER);
185ea8dc4b6Seschrock
186ea8dc4b6Seschrock for (handler = list_head(&inject_handlers); handler != NULL;
187ea8dc4b6Seschrock handler = list_next(&inject_handlers, handler)) {
188ea8dc4b6Seschrock
189283b8460SGeorge.Wilson if (zio->io_spa != handler->zi_spa ||
190283b8460SGeorge.Wilson handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
191ea8dc4b6Seschrock continue;
192ea8dc4b6Seschrock
193ea8dc4b6Seschrock /* If this handler matches, return EIO */
194ea8dc4b6Seschrock if (zio_match_handler(&zio->io_logical->io_bookmark,
195ea8dc4b6Seschrock zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
196ea8dc4b6Seschrock &handler->zi_record, error)) {
197ea8dc4b6Seschrock ret = error;
198ea8dc4b6Seschrock break;
199ea8dc4b6Seschrock }
200ea8dc4b6Seschrock }
201ea8dc4b6Seschrock
202ea8dc4b6Seschrock rw_exit(&inject_lock);
203ea8dc4b6Seschrock
204ea8dc4b6Seschrock return (ret);
205ea8dc4b6Seschrock }
206ea8dc4b6Seschrock
20721bf64a7Sgw25295 /*
20821bf64a7Sgw25295 * Determine if the zio is part of a label update and has an injection
20921bf64a7Sgw25295 * handler associated with that portion of the label. Currently, we
21021bf64a7Sgw25295  * allow error injection in either the nvlist or the uberblock region
21121bf64a7Sgw25295  * of the vdev label.
21221bf64a7Sgw25295 */
/*
 * Determine if this label I/O should have an error injected.  Returns the
 * errno to inject, or 0 for no injection.  The handler's zi_start/zi_end
 * are offsets relative to a single vdev label; they are remapped onto the
 * specific label (of the four) this I/O targets before comparing.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	/*
	 * Ignore I/O that falls entirely within the data region of the
	 * device, i.e. outside both the front and back label areas.
	 */
	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		/* Match on the target vdev and the remapped label region */
		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}
25421bf64a7Sgw25295
25521bf64a7Sgw25295
/*
 * Determine if a device-level fault should be injected for the given vdev.
 * 'zio' is NULL when called from the device-open path.  Returns the errno
 * to inject, or 0 for no injection.  An injected ENXIO on open also marks
 * the vdev as VDEV_AUX_OPEN_FAILED; a handler registered for ENXIO maps any
 * other error to EIO (see the final check below).
 */
int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during
	 * device open (i.e. zio == NULL).
	 */
	if (zio != NULL) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			/*
			 * A failfast handler does not fire on retried or
			 * "try hard" I/O, nor during device open.
			 */
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == error) {
				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (error == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				ret = error;
				break;
			}
			/*
			 * An ENXIO handler injects EIO for any other error
			 * passed in for this device.
			 */
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}
327ea8dc4b6Seschrock
328ea8dc4b6Seschrock /*
329468c413aSTim Haley * Simulate hardware that ignores cache flushes. For requested number
330468c413aSTim Haley * of seconds nix the actual writing to disk.
331468c413aSTim Haley */
/*
 * Simulate hardware that ignores cache flushes.  For requested number
 * of seconds nix the actual writing to disk.
 *
 * On first match the handler's zi_timer is armed (lbolt for a duration in
 * seconds, or the I/O's txg for a duration in txgs); subsequently, writes
 * are "ignored" by stripping the vdev I/O stages from the pipeline.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}
366468c413aSTim Haley
/*
 * Verify that any armed ignored-writes handler for this pool has not
 * exceeded its configured window: a positive zi_duration is a deadline in
 * seconds (checked against lbolt), a negative one is a txg count (checked
 * against the syncing txg).  VERIFY panics if the window has been blown.
 */
void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	/* Fast path: no injection handlers of any kind are registered */
	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			/* zi_timer is a lbolt value; zi_duration is seconds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz >
			    ddi_get_lbolt64());
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}
400468c413aSTim Haley
/*
 * Compute the absolute hrtime at which this I/O should complete according
 * to any registered ZINJECT_DELAY_IO handlers for the I/O's vdev.  Returns
 * 0 when no delay applies (the common case is short-circuited without
 * taking inject_delay_mtx).  See the lane discussion below for how the
 * target time is chosen.
 */
hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mutex unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler in configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		/* Only delay handlers registered against this vdev apply */
		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There's two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			/* First matching handler; adopt it as the minimum */
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		/* Claim the lane: record when it will next become idle */
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}
560283b8460SGeorge.Wilson
561468c413aSTim Haley /*
562ea8dc4b6Seschrock * Create a new handler for the given record. We add it to the list, adding
563ea8dc4b6Seschrock * a reference to the spa_t in the process. We increment zio_injection_enabled,
564ea8dc4b6Seschrock * which is the switch to trigger all fault injection.
565ea8dc4b6Seschrock */
/*
 * Create and register a new injection handler for the given record,
 * validating delay parameters, taking an injection reference on the pool,
 * and returning the new handler's id through *id.  Returns 0 on success
 * or an errno (EINVAL for bad delay parameters, ENOENT for an unknown
 * pool, or the error from spa_reset()).
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * spa_inject_ref() will add an injection reference, which will
		 * prevent the pool from being removed from the namespace while
		 * still allowing it to be unloaded.
		 */
		if ((spa = spa_inject_addref(name)) == NULL)
			return (SET_ERROR(ENOENT));

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

		handler->zi_spa = spa;
		handler->zi_record = *record;

		/* Delay handlers need a per-lane idle-time array; others don't */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer.  Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}
660ea8dc4b6Seschrock
661ea8dc4b6Seschrock /*
662ea8dc4b6Seschrock * Returns the next record with an ID greater than that supplied to the
663ea8dc4b6Seschrock * function. Used to iterate over all handlers in the system.
664ea8dc4b6Seschrock */
665ea8dc4b6Seschrock int
zio_inject_list_next(int * id,char * name,size_t buflen,zinject_record_t * record)666ea8dc4b6Seschrock zio_inject_list_next(int *id, char *name, size_t buflen,
667ea8dc4b6Seschrock zinject_record_t *record)
668ea8dc4b6Seschrock {
669ea8dc4b6Seschrock inject_handler_t *handler;
670ea8dc4b6Seschrock int ret;
671ea8dc4b6Seschrock
672ea8dc4b6Seschrock mutex_enter(&spa_namespace_lock);
673ea8dc4b6Seschrock rw_enter(&inject_lock, RW_READER);
674ea8dc4b6Seschrock
675ea8dc4b6Seschrock for (handler = list_head(&inject_handlers); handler != NULL;
676ea8dc4b6Seschrock handler = list_next(&inject_handlers, handler))
677ea8dc4b6Seschrock if (handler->zi_id > *id)
678ea8dc4b6Seschrock break;
679ea8dc4b6Seschrock
680ea8dc4b6Seschrock if (handler) {
681ea8dc4b6Seschrock *record = handler->zi_record;
682ea8dc4b6Seschrock *id = handler->zi_id;
683ea8dc4b6Seschrock (void) strncpy(name, spa_name(handler->zi_spa), buflen);
684ea8dc4b6Seschrock ret = 0;
685ea8dc4b6Seschrock } else {
686be6fd75aSMatthew Ahrens ret = SET_ERROR(ENOENT);
687ea8dc4b6Seschrock }
688ea8dc4b6Seschrock
689ea8dc4b6Seschrock rw_exit(&inject_lock);
690ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock);
691ea8dc4b6Seschrock
692ea8dc4b6Seschrock return (ret);
693ea8dc4b6Seschrock }
694ea8dc4b6Seschrock
695ea8dc4b6Seschrock /*
696ea8dc4b6Seschrock * Clear the fault handler with the given identifier, or return ENOENT if none
697ea8dc4b6Seschrock * exists.
698ea8dc4b6Seschrock */
699ea8dc4b6Seschrock int
zio_clear_fault(int id)700ea8dc4b6Seschrock zio_clear_fault(int id)
701ea8dc4b6Seschrock {
702ea8dc4b6Seschrock inject_handler_t *handler;
703ea8dc4b6Seschrock
704ea8dc4b6Seschrock rw_enter(&inject_lock, RW_WRITER);
705ea8dc4b6Seschrock
706ea8dc4b6Seschrock for (handler = list_head(&inject_handlers); handler != NULL;
707ea8dc4b6Seschrock handler = list_next(&inject_handlers, handler))
708ea8dc4b6Seschrock if (handler->zi_id == id)
709ea8dc4b6Seschrock break;
710ea8dc4b6Seschrock
711ea8dc4b6Seschrock if (handler == NULL) {
712679b018dSMark J Musante rw_exit(&inject_lock);
713be6fd75aSMatthew Ahrens return (SET_ERROR(ENOENT));
714679b018dSMark J Musante }
715679b018dSMark J Musante
716*97e81309SPrakash Surya if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
717*97e81309SPrakash Surya ASSERT3S(inject_delay_count, >, 0);
718*97e81309SPrakash Surya inject_delay_count--;
719*97e81309SPrakash Surya ASSERT3S(inject_delay_count, >=, 0);
720*97e81309SPrakash Surya }
721*97e81309SPrakash Surya
722ea8dc4b6Seschrock list_remove(&inject_handlers, handler);
723679b018dSMark J Musante rw_exit(&inject_lock);
724679b018dSMark J Musante
725*97e81309SPrakash Surya if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
726*97e81309SPrakash Surya ASSERT3P(handler->zi_lanes, !=, NULL);
727*97e81309SPrakash Surya kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
728*97e81309SPrakash Surya handler->zi_record.zi_nlanes);
729*97e81309SPrakash Surya } else {
730*97e81309SPrakash Surya ASSERT3P(handler->zi_lanes, ==, NULL);
731*97e81309SPrakash Surya }
732*97e81309SPrakash Surya
733ea8dc4b6Seschrock spa_inject_delref(handler->zi_spa);
734ea8dc4b6Seschrock kmem_free(handler, sizeof (inject_handler_t));
7351a5e258fSJosef 'Jeff' Sipek atomic_dec_32(&zio_injection_enabled);
736ea8dc4b6Seschrock
737679b018dSMark J Musante return (0);
738ea8dc4b6Seschrock }
739ea8dc4b6Seschrock
/*
 * One-time initialization of the fault-injection subsystem: create the
 * rwlock protecting the handler list, the mutex used for delay-injection
 * bookkeeping, and the (initially empty) list of injection handlers.
 */
void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}
748ea8dc4b6Seschrock
/*
 * Tear down the fault-injection subsystem, releasing the structures
 * created by zio_inject_init() in reverse order of construction.
 * Assumes all handlers have already been cleared.
 */
void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}
756