xref: /titanic_50/usr/src/uts/common/fs/zfs/zio.c (revision 7ddc9b1afd18f260b9fb78ec7732facd91769131)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/fm/fs/zfs.h>
30 #include <sys/spa.h>
31 #include <sys/txg.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zio_impl.h>
35 #include <sys/zio_compress.h>
36 #include <sys/zio_checksum.h>
37 
38 /*
39  * ==========================================================================
40  * I/O priority table
41  * ==========================================================================
42  */
43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
44 	0,	/* ZIO_PRIORITY_NOW		*/
45 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
46 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
47 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
48 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
49 	4,	/* ZIO_PRIORITY_FREE		*/
50 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
51 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
52 	10,	/* ZIO_PRIORITY_RESILVER	*/
53 	20,	/* ZIO_PRIORITY_SCRUB		*/
54 };
55 
56 /*
57  * ==========================================================================
58  * I/O type descriptions
59  * ==========================================================================
60  */
61 char *zio_type_name[ZIO_TYPES] = {
62 	"null", "read", "write", "free", "claim", "ioctl" };
63 
64 /* Force an allocation failure when non-zero */
65 uint16_t zio_zil_fail_shift = 0;
66 uint16_t zio_io_fail_shift = 0;
67 
68 /* Enable/disable the write-retry logic */
69 int zio_write_retry = 1;
70 
71 /* Taskq to handle reissuing of I/Os */
72 taskq_t *zio_taskq;
73 int zio_resume_threads = 4;
74 
75 typedef struct zio_sync_pass {
76 	int	zp_defer_free;		/* defer frees after this pass */
77 	int	zp_dontcompress;	/* don't compress after this pass */
78 	int	zp_rewrite;		/* rewrite new bps after this pass */
79 } zio_sync_pass_t;
80 
81 zio_sync_pass_t zio_sync_pass = {
82 	1,	/* zp_defer_free */
83 	4,	/* zp_dontcompress */
84 	1,	/* zp_rewrite */
85 };
86 
87 static boolean_t zio_io_should_fail(uint16_t);
88 
89 /*
90  * ==========================================================================
91  * I/O kmem caches
92  * ==========================================================================
93  */
94 kmem_cache_t *zio_cache;
95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
97 
98 #ifdef _KERNEL
99 extern vmem_t *zio_alloc_arena;
100 #endif
101 
/*
 * Determine if we are allowed to issue the IO based on the
 * pool state. If we must wait then block until we are told
 * that we may continue.
 *
 * The body is wrapped in do { } while (0) so that "ZIO_ENTER(spa);"
 * is a single statement and can be used safely in an unbraced
 * if/else.  The argument is parenthesized so any pointer expression
 * may be passed; note that it is evaluated more than once.
 */
#define	ZIO_ENTER(spa) do {						\
	if ((spa)->spa_state == POOL_STATE_IO_FAILURE) {		\
		mutex_enter(&(spa)->spa_zio_lock);			\
		while ((spa)->spa_state == POOL_STATE_IO_FAILURE)	\
			cv_wait(&(spa)->spa_zio_cv,			\
			    &(spa)->spa_zio_lock);			\
		mutex_exit(&(spa)->spa_zio_lock);			\
	}								\
} while (0)
115 
/*
 * An allocating zio is one whose original pipeline includes the DVA
 * allocate stage, i.e. it either has that stage set now or will reach
 * it later in its lifetime.  Evaluates to non-zero for such a zio.
 */
#define	IO_IS_ALLOCATING(zio)	\
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
122 
123 void
124 zio_init(void)
125 {
126 	size_t c;
127 	vmem_t *data_alloc_arena = NULL;
128 
129 #ifdef _KERNEL
130 	data_alloc_arena = zio_alloc_arena;
131 #endif
132 
133 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
134 	    NULL, NULL, NULL, NULL, NULL, 0);
135 
136 	/*
137 	 * For small buffers, we want a cache for each multiple of
138 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
139 	 * for each quarter-power of 2.  For large buffers, we want
140 	 * a cache for each multiple of PAGESIZE.
141 	 */
142 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
143 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
144 		size_t p2 = size;
145 		size_t align = 0;
146 
147 		while (p2 & (p2 - 1))
148 			p2 &= p2 - 1;
149 
150 		if (size <= 4 * SPA_MINBLOCKSIZE) {
151 			align = SPA_MINBLOCKSIZE;
152 		} else if (P2PHASE(size, PAGESIZE) == 0) {
153 			align = PAGESIZE;
154 		} else if (P2PHASE(size, p2 >> 2) == 0) {
155 			align = p2 >> 2;
156 		}
157 
158 		if (align != 0) {
159 			char name[36];
160 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
161 			zio_buf_cache[c] = kmem_cache_create(name, size,
162 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
163 
164 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
165 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
166 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
167 			    KMC_NODEBUG);
168 
169 		}
170 	}
171 
172 	while (--c != 0) {
173 		ASSERT(zio_buf_cache[c] != NULL);
174 		if (zio_buf_cache[c - 1] == NULL)
175 			zio_buf_cache[c - 1] = zio_buf_cache[c];
176 
177 		ASSERT(zio_data_buf_cache[c] != NULL);
178 		if (zio_data_buf_cache[c - 1] == NULL)
179 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
180 	}
181 
182 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
183 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
184 
185 	zio_inject_init();
186 }
187 
188 void
189 zio_fini(void)
190 {
191 	size_t c;
192 	kmem_cache_t *last_cache = NULL;
193 	kmem_cache_t *last_data_cache = NULL;
194 
195 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
196 		if (zio_buf_cache[c] != last_cache) {
197 			last_cache = zio_buf_cache[c];
198 			kmem_cache_destroy(zio_buf_cache[c]);
199 		}
200 		zio_buf_cache[c] = NULL;
201 
202 		if (zio_data_buf_cache[c] != last_data_cache) {
203 			last_data_cache = zio_data_buf_cache[c];
204 			kmem_cache_destroy(zio_data_buf_cache[c]);
205 		}
206 		zio_data_buf_cache[c] = NULL;
207 	}
208 
209 	taskq_destroy(zio_taskq);
210 
211 	kmem_cache_destroy(zio_cache);
212 
213 	zio_inject_fini();
214 }
215 
216 /*
217  * ==========================================================================
218  * Allocate and free I/O buffers
219  * ==========================================================================
220  */
221 
222 /*
223  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
224  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
225  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
226  * excess / transient data in-core during a crashdump.
227  */
228 void *
229 zio_buf_alloc(size_t size)
230 {
231 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
232 
233 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
234 
235 	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
236 }
237 
238 /*
239  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
240  * crashdump if the kernel panics.  This exists so that we will limit the amount
241  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
242  * of kernel heap dumped to disk when the kernel panics)
243  */
244 void *
245 zio_data_buf_alloc(size_t size)
246 {
247 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
248 
249 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
250 
251 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
252 }
253 
254 void
255 zio_buf_free(void *buf, size_t size)
256 {
257 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
258 
259 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
260 
261 	kmem_cache_free(zio_buf_cache[c], buf);
262 }
263 
264 void
265 zio_data_buf_free(void *buf, size_t size)
266 {
267 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
268 
269 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
270 
271 	kmem_cache_free(zio_data_buf_cache[c], buf);
272 }
273 
274 /*
275  * ==========================================================================
276  * Push and pop I/O transform buffers
277  * ==========================================================================
278  */
279 static void
280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
281 {
282 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
283 
284 	zt->zt_data = data;
285 	zt->zt_size = size;
286 	zt->zt_bufsize = bufsize;
287 
288 	zt->zt_next = zio->io_transform_stack;
289 	zio->io_transform_stack = zt;
290 
291 	zio->io_data = data;
292 	zio->io_size = size;
293 }
294 
295 static void
296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
297 {
298 	zio_transform_t *zt = zio->io_transform_stack;
299 
300 	*data = zt->zt_data;
301 	*size = zt->zt_size;
302 	*bufsize = zt->zt_bufsize;
303 
304 	zio->io_transform_stack = zt->zt_next;
305 	kmem_free(zt, sizeof (zio_transform_t));
306 
307 	if ((zt = zio->io_transform_stack) != NULL) {
308 		zio->io_data = zt->zt_data;
309 		zio->io_size = zt->zt_size;
310 	}
311 }
312 
313 static void
314 zio_clear_transform_stack(zio_t *zio)
315 {
316 	void *data;
317 	uint64_t size, bufsize;
318 
319 	ASSERT(zio->io_transform_stack != NULL);
320 
321 	zio_pop_transform(zio, &data, &size, &bufsize);
322 	while (zio->io_transform_stack != NULL) {
323 		zio_buf_free(data, bufsize);
324 		zio_pop_transform(zio, &data, &size, &bufsize);
325 	}
326 }
327 
328 /*
329  * ==========================================================================
330  * Create the various types of I/O (read, write, free)
331  * ==========================================================================
332  */
333 static zio_t *
334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
335     void *data, uint64_t size, zio_done_func_t *done, void *private,
336     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
337 {
338 	zio_t *zio;
339 
340 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
341 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
342 
343 	/* Only we should set CONFIG_GRABBED */
344 	ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED));
345 
346 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
347 	bzero(zio, sizeof (zio_t));
348 	zio->io_parent = pio;
349 	zio->io_spa = spa;
350 	zio->io_txg = txg;
351 	zio->io_flags = flags;
352 	if (bp != NULL) {
353 		zio->io_bp = bp;
354 		zio->io_bp_copy = *bp;
355 		zio->io_bp_orig = *bp;
356 	}
357 	zio->io_done = done;
358 	zio->io_private = private;
359 	zio->io_type = type;
360 	zio->io_priority = priority;
361 	zio->io_stage = stage;
362 	zio->io_pipeline = pipeline;
363 	zio->io_timestamp = lbolt64;
364 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
365 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
366 	zio_push_transform(zio, data, size, size);
367 
368 	/*
369 	 * Note on config lock:
370 	 *
371 	 * If CONFIG_HELD is set, then the caller already has the config
372 	 * lock, so we don't need it for this io.
373 	 *
374 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
375 	 * config lock on behalf of this io, so it should be released
376 	 * in zio_done.
377 	 *
378 	 * Unless CONFIG_HELD is set, we will grab the config lock for
379 	 * any top-level (parent-less) io, *except* NULL top-level ios.
380 	 * The NULL top-level ios rarely have any children, so we delay
381 	 * grabbing the lock until the first child is added (but it is
382 	 * still grabbed on behalf of the top-level i/o, so additional
383 	 * children don't need to also grab it).  This greatly reduces
384 	 * contention on the config lock.
385 	 */
386 	if (pio == NULL) {
387 		if (type != ZIO_TYPE_NULL &&
388 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
389 			spa_config_enter(spa, RW_READER, zio);
390 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
391 		}
392 		zio->io_root = zio;
393 	} else {
394 		zio->io_root = pio->io_root;
395 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
396 			zio->io_logical = pio->io_logical;
397 		mutex_enter(&pio->io_lock);
398 		if (pio->io_parent == NULL &&
399 		    pio->io_type == ZIO_TYPE_NULL &&
400 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
401 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
402 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
403 			spa_config_enter(spa, RW_READER, pio);
404 		}
405 		if (stage < ZIO_STAGE_READY)
406 			pio->io_children_notready++;
407 		pio->io_children_notdone++;
408 		zio->io_sibling_next = pio->io_child;
409 		zio->io_sibling_prev = NULL;
410 		if (pio->io_child != NULL)
411 			pio->io_child->io_sibling_prev = zio;
412 		pio->io_child = zio;
413 		zio->io_ndvas = pio->io_ndvas;
414 		mutex_exit(&pio->io_lock);
415 	}
416 
417 	/*
418 	 * Save off the original state incase we need to retry later.
419 	 */
420 	zio->io_orig_stage = zio->io_stage;
421 	zio->io_orig_pipeline = zio->io_pipeline;
422 	zio->io_orig_flags = zio->io_flags;
423 
424 	/*
425 	 * If this is not a null zio, and config is not already held,
426 	 * then the root zio should have grabbed the config lock.
427 	 * If this is not a root zio, it should not have grabbed the
428 	 * config lock.
429 	 */
430 	ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) ||
431 	    zio->io_type == ZIO_TYPE_NULL ||
432 	    (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED));
433 	ASSERT(zio->io_root == zio ||
434 	    !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED));
435 
436 	return (zio);
437 }
438 
439 static void
440 zio_reset(zio_t *zio)
441 {
442 	zio_clear_transform_stack(zio);
443 
444 	zio->io_flags = zio->io_orig_flags;
445 	zio->io_stage = zio->io_orig_stage;
446 	zio->io_pipeline = zio->io_orig_pipeline;
447 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
448 }
449 
450 zio_t *
451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
452 	int flags)
453 {
454 	zio_t *zio;
455 
456 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
457 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
458 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
459 
460 	return (zio);
461 }
462 
463 zio_t *
464 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
465 {
466 	return (zio_null(NULL, spa, done, private, flags));
467 }
468 
469 zio_t *
470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
471     uint64_t size, zio_done_func_t *done, void *private,
472     int priority, int flags, const zbookmark_t *zb)
473 {
474 	zio_t *zio;
475 
476 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
477 
478 	/*
479 	 * If the user has specified that we allow I/Os to continue
480 	 * then attempt to satisfy the read.
481 	 */
482 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
483 		ZIO_ENTER(spa);
484 
485 	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
486 	    data, size, done, private,
487 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
488 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
489 	zio->io_bookmark = *zb;
490 
491 	zio->io_logical = zio;
492 
493 	/*
494 	 * Work off our copy of the bp so the caller can free it.
495 	 */
496 	zio->io_bp = &zio->io_bp_copy;
497 
498 	return (zio);
499 }
500 
501 zio_t *
502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
503     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
504     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
505     int flags, const zbookmark_t *zb)
506 {
507 	zio_t *zio;
508 
509 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
510 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
511 
512 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
513 	    compress < ZIO_COMPRESS_FUNCTIONS);
514 
515 	ZIO_ENTER(spa);
516 
517 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
518 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
519 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
520 
521 	zio->io_ready = ready;
522 
523 	zio->io_bookmark = *zb;
524 
525 	zio->io_logical = zio;
526 
527 	zio->io_checksum = checksum;
528 	zio->io_compress = compress;
529 	zio->io_ndvas = ncopies;
530 
531 	if (bp->blk_birth != txg) {
532 		/* XXX the bp usually (always?) gets re-zeroed later */
533 		BP_ZERO(bp);
534 		BP_SET_LSIZE(bp, size);
535 		BP_SET_PSIZE(bp, size);
536 	} else {
537 		/* Make sure someone doesn't change their mind on overwrites */
538 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
539 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
540 	}
541 
542 	return (zio);
543 }
544 
545 zio_t *
546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg,
547     blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done,
548     void *private, int priority, int flags, zbookmark_t *zb)
549 {
550 	zio_t *zio;
551 
552 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
553 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
554 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
555 
556 	zio->io_bookmark = *zb;
557 	zio->io_checksum = checksum;
558 	zio->io_compress = ZIO_COMPRESS_OFF;
559 
560 	if (pio != NULL)
561 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
562 
563 	return (zio);
564 }
565 
566 static void
567 zio_write_allocate_ready(zio_t *zio)
568 {
569 	/* Free up the previous block */
570 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
571 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
572 		    &zio->io_bp_orig, NULL, NULL));
573 	}
574 }
575 
576 static zio_t *
577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
578     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
579     zio_done_func_t *done, void *private, int priority, int flags)
580 {
581 	zio_t *zio;
582 
583 	BP_ZERO(bp);
584 	BP_SET_LSIZE(bp, size);
585 	BP_SET_PSIZE(bp, size);
586 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
587 
588 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
589 	    ZIO_TYPE_WRITE, priority, flags,
590 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
591 
592 	zio->io_checksum = checksum;
593 	zio->io_compress = ZIO_COMPRESS_OFF;
594 	zio->io_ready = zio_write_allocate_ready;
595 
596 	return (zio);
597 }
598 
599 zio_t *
600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
601     zio_done_func_t *done, void *private)
602 {
603 	zio_t *zio;
604 
605 	ASSERT(!BP_IS_HOLE(bp));
606 
607 	if (txg == spa->spa_syncing_txg &&
608 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
609 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
610 		return (zio_null(pio, spa, NULL, NULL, 0));
611 	}
612 
613 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
614 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
615 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
616 
617 	zio->io_bp = &zio->io_bp_copy;
618 
619 	return (zio);
620 }
621 
622 zio_t *
623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
624     zio_done_func_t *done, void *private)
625 {
626 	zio_t *zio;
627 
628 	/*
629 	 * A claim is an allocation of a specific block.  Claims are needed
630 	 * to support immediate writes in the intent log.  The issue is that
631 	 * immediate writes contain committed data, but in a txg that was
632 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
633 	 * the intent log claims all blocks that contain immediate write data
634 	 * so that the SPA knows they're in use.
635 	 *
636 	 * All claims *must* be resolved in the first txg -- before the SPA
637 	 * starts allocating blocks -- so that nothing is allocated twice.
638 	 */
639 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
640 	ASSERT3U(spa_first_txg(spa), <=, txg);
641 
642 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
643 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
644 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
645 
646 	zio->io_bp = &zio->io_bp_copy;
647 
648 	return (zio);
649 }
650 
651 zio_t *
652 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
653     zio_done_func_t *done, void *private, int priority, int flags)
654 {
655 	zio_t *zio;
656 	int c;
657 
658 	if (vd->vdev_children == 0) {
659 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
660 		    ZIO_TYPE_IOCTL, priority, flags,
661 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
662 
663 		zio->io_vd = vd;
664 		zio->io_cmd = cmd;
665 	} else {
666 		zio = zio_null(pio, spa, NULL, NULL, flags);
667 
668 		for (c = 0; c < vd->vdev_children; c++)
669 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
670 			    done, private, priority, flags));
671 	}
672 
673 	return (zio);
674 }
675 
676 static void
677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
678     int checksum, boolean_t labels)
679 {
680 	ASSERT(vd->vdev_children == 0);
681 
682 	ASSERT(size <= SPA_MAXBLOCKSIZE);
683 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
684 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
685 
686 #ifdef ZFS_DEBUG
687 	if (labels) {
688 		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
689 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
690 	}
691 #endif
692 	ASSERT3U(offset + size, <=, vd->vdev_psize);
693 
694 	BP_ZERO(bp);
695 
696 	BP_SET_LSIZE(bp, size);
697 	BP_SET_PSIZE(bp, size);
698 
699 	BP_SET_CHECKSUM(bp, checksum);
700 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
701 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
702 
703 	if (checksum != ZIO_CHECKSUM_OFF)
704 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
705 }
706 
707 zio_t *
708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
709     void *data, int checksum, zio_done_func_t *done, void *private,
710     int priority, int flags, boolean_t labels)
711 {
712 	zio_t *zio;
713 	blkptr_t blk;
714 
715 	ZIO_ENTER(vd->vdev_spa);
716 
717 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
718 
719 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
720 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
721 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
722 
723 	zio->io_vd = vd;
724 	zio->io_offset = offset;
725 
726 	/*
727 	 * Work off our copy of the bp so the caller can free it.
728 	 */
729 	zio->io_bp = &zio->io_bp_copy;
730 
731 	return (zio);
732 }
733 
734 zio_t *
735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
736     void *data, int checksum, zio_done_func_t *done, void *private,
737     int priority, int flags, boolean_t labels)
738 {
739 	zio_block_tail_t *zbt;
740 	void *wbuf;
741 	zio_t *zio;
742 	blkptr_t blk;
743 
744 	ZIO_ENTER(vd->vdev_spa);
745 
746 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
747 
748 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
749 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
750 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
751 
752 	zio->io_vd = vd;
753 	zio->io_offset = offset;
754 
755 	zio->io_bp = &zio->io_bp_copy;
756 	zio->io_checksum = checksum;
757 
758 	if (zio_checksum_table[checksum].ci_zbt) {
759 		/*
760 		 * zbt checksums are necessarily destructive -- they modify
761 		 * one word of the write buffer to hold the verifier/checksum.
762 		 * Therefore, we must make a local copy in case the data is
763 		 * being written to multiple places.
764 		 */
765 		wbuf = zio_buf_alloc(size);
766 		bcopy(data, wbuf, size);
767 		zio_push_transform(zio, wbuf, size, size);
768 
769 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
770 		zbt->zbt_cksum = blk.blk_cksum;
771 	}
772 
773 	return (zio);
774 }
775 
776 /*
777  * Create a child I/O to do some work for us.  It has no associated bp.
778  */
779 zio_t *
780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
781 	void *data, uint64_t size, int type, int priority, int flags,
782 	zio_done_func_t *done, void *private)
783 {
784 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
785 	zio_t *cio;
786 
787 	if (type == ZIO_TYPE_READ && bp != NULL) {
788 		/*
789 		 * If we have the bp, then the child should perform the
790 		 * checksum and the parent need not.  This pushes error
791 		 * detection as close to the leaves as possible and
792 		 * eliminates redundant checksums in the interior nodes.
793 		 */
794 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
795 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
796 	}
797 
798 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
799 	    done, private, type, priority,
800 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
801 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
802 
803 	cio->io_vd = vd;
804 	cio->io_offset = offset;
805 
806 	return (cio);
807 }
808 
809 /*
810  * ==========================================================================
811  * Initiate I/O, either sync or async
812  * ==========================================================================
813  */
814 static void
815 zio_destroy(zio_t *zio)
816 {
817 	mutex_destroy(&zio->io_lock);
818 	cv_destroy(&zio->io_cv);
819 	if (zio->io_failed_vds != NULL) {
820 		kmem_free(zio->io_failed_vds,
821 		    zio->io_failed_vds_count * sizeof (vdev_t *));
822 		zio->io_failed_vds = NULL;
823 		zio->io_failed_vds_count = 0;
824 	}
825 	kmem_cache_free(zio_cache, zio);
826 }
827 
828 int
829 zio_wait(zio_t *zio)
830 {
831 	int error;
832 
833 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
834 
835 	zio->io_waiter = curthread;
836 
837 	zio_execute(zio);
838 
839 	mutex_enter(&zio->io_lock);
840 	while (zio->io_stalled != ZIO_STAGE_DONE)
841 		cv_wait(&zio->io_cv, &zio->io_lock);
842 	mutex_exit(&zio->io_lock);
843 
844 	error = zio->io_error;
845 	zio_destroy(zio);
846 
847 	return (error);
848 }
849 
850 void
851 zio_nowait(zio_t *zio)
852 {
853 	zio_execute(zio);
854 }
855 
856 void
857 zio_interrupt(zio_t *zio)
858 {
859 	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
860 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
861 }
862 
863 static int
864 zio_issue_async(zio_t *zio)
865 {
866 	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
867 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
868 
869 	return (ZIO_PIPELINE_STOP);
870 }
871 
872 /*
873  * ==========================================================================
874  * I/O pipeline interlocks: parent/child dependency scoreboarding
875  * ==========================================================================
876  */
877 static int
878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
879 {
880 	int rv = ZIO_PIPELINE_CONTINUE;
881 
882 	mutex_enter(&zio->io_lock);
883 	ASSERT(zio->io_stalled == 0);
884 	if (*countp != 0) {
885 		zio->io_stalled = stage;
886 		rv = ZIO_PIPELINE_STOP;
887 	}
888 	mutex_exit(&zio->io_lock);
889 
890 	return (rv);
891 }
892 
893 static void
894 zio_add_failed_vdev(zio_t *pio, zio_t *zio)
895 {
896 	uint64_t oldcount = pio->io_failed_vds_count;
897 	vdev_t **new_vds;
898 	int i;
899 
900 	ASSERT(MUTEX_HELD(&pio->io_lock));
901 
902 	if (zio->io_vd == NULL)
903 		return;
904 
905 	for (i = 0; i < oldcount; i++) {
906 		if (pio->io_failed_vds[i] == zio->io_vd)
907 			return;
908 	}
909 
910 	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
911 	if (pio->io_failed_vds != NULL) {
912 		bcopy(pio->io_failed_vds, new_vds,
913 		    oldcount * sizeof (vdev_t *));
914 		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
915 	}
916 	pio->io_failed_vds = new_vds;
917 	pio->io_failed_vds[oldcount] = zio->io_vd;
918 	pio->io_failed_vds_count++;
919 }
920 
921 static void
922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
923 {
924 	zio_t *pio = zio->io_parent;
925 
926 	mutex_enter(&pio->io_lock);
927 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
928 		pio->io_error = zio->io_error;
929 		if (zio->io_error && zio->io_error != ENOTSUP)
930 			zio_add_failed_vdev(pio, zio);
931 	}
932 	ASSERT3U(*countp, >, 0);
933 	if (--*countp == 0 && pio->io_stalled == stage) {
934 		pio->io_stalled = 0;
935 		mutex_exit(&pio->io_lock);
936 		zio_execute(pio);
937 	} else {
938 		mutex_exit(&pio->io_lock);
939 	}
940 }
941 
942 int
943 zio_wait_for_children_ready(zio_t *zio)
944 {
945 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
946 	    &zio->io_children_notready));
947 }
948 
949 int
950 zio_wait_for_children_done(zio_t *zio)
951 {
952 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
953 	    &zio->io_children_notdone));
954 }
955 
956 static int
957 zio_read_init(zio_t *zio)
958 {
959 	blkptr_t *bp = zio->io_bp;
960 
961 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
962 		uint64_t csize = BP_GET_PSIZE(bp);
963 		void *cbuf = zio_buf_alloc(csize);
964 
965 		zio_push_transform(zio, cbuf, csize, csize);
966 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
967 	}
968 
969 	if (BP_IS_GANG(bp)) {
970 		uint64_t gsize = SPA_GANGBLOCKSIZE;
971 		void *gbuf = zio_buf_alloc(gsize);
972 
973 		zio_push_transform(zio, gbuf, gsize, gsize);
974 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
975 	}
976 
977 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
978 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
979 
980 	return (ZIO_PIPELINE_CONTINUE);
981 }
982 
983 static int
984 zio_ready(zio_t *zio)
985 {
986 	zio_t *pio = zio->io_parent;
987 
988 	if (zio->io_ready)
989 		zio->io_ready(zio);
990 
991 	if (pio != NULL)
992 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
993 		    &pio->io_children_notready);
994 
995 	if (zio->io_bp)
996 		zio->io_bp_copy = *zio->io_bp;
997 
998 	return (ZIO_PIPELINE_CONTINUE);
999 }
1000 
1001 static int
1002 zio_vdev_retry_io(zio_t *zio)
1003 {
1004 	zio_t *pio = zio->io_parent;
1005 
1006 	/*
1007 	 * Preserve the failed bp so that the io_ready() callback can
1008 	 * update the accounting accordingly. The callback will also be
1009 	 * responsible for freeing the previously allocated block, if one
1010 	 * exists.
1011 	 */
1012 	zio->io_bp_orig = *zio->io_bp;
1013 
1014 	/*
1015 	 * We must zero out the old DVA and blk_birth before reallocating
1016 	 * the bp.
1017 	 */
1018 	BP_ZERO_DVAS(zio->io_bp);
1019 	zio_reset(zio);
1020 
1021 	if (pio) {
1022 		/*
1023 		 * Let the parent know that we will
1024 		 * re-alloc the write (=> new bp info).
1025 		 */
1026 		mutex_enter(&pio->io_lock);
1027 		pio->io_children_notready++;
1028 
1029 		/*
1030 		 * If the parent I/O is still in the open stage, then
1031 		 * don't bother telling it to retry since it hasn't
1032 		 * progressed far enough for it to care.
1033 		 */
1034 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
1035 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1036 
1037 		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
1038 		mutex_exit(&pio->io_lock);
1039 	}
1040 
1041 	/*
1042 	 * We are getting ready to process the retry request so clear
1043 	 * the flag and the zio's current error status.
1044 	 */
1045 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
1046 	zio->io_error = 0;
1047 
1048 	return (ZIO_PIPELINE_CONTINUE);
1049 }
1050 
1051 int
1052 zio_vdev_resume_io(spa_t *spa)
1053 {
1054 	zio_t *zio;
1055 
1056 	mutex_enter(&spa->spa_zio_lock);
1057 
1058 	/*
1059 	 * Probe all of vdevs that have experienced an I/O error.
1060 	 * If we are still unable to verify the integrity of the vdev
1061 	 * then we prevent the resume from proceeeding.
1062 	 */
1063 	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
1064 	    zio = list_next(&spa->spa_zio_list, zio)) {
1065 		int error = 0;
1066 
1067 		/* We only care about I/Os that must succeed */
1068 		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
1069 			continue;
1070 		error = vdev_probe(zio->io_vd);
1071 		if (error) {
1072 			mutex_exit(&spa->spa_zio_lock);
1073 			return (error);
1074 		}
1075 	}
1076 
1077 	/*
1078 	 * Clear the vdev stats so that I/O can flow.
1079 	 */
1080 	vdev_clear(spa, NULL, B_FALSE);
1081 
1082 	spa->spa_state = POOL_STATE_ACTIVE;
1083 	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
1084 		list_remove(&spa->spa_zio_list, zio);
1085 		zio->io_error = 0;
1086 
1087 		/*
1088 		 * If we are resuming an allocating I/O then we force it
1089 		 * to retry and let it resume operation where it left off.
1090 		 * Otherwise, go back to the ready stage and pick up from
1091 		 * there.
1092 		 */
1093 		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
1094 			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1095 			zio->io_stage--;
1096 		} else {
1097 			zio->io_stage = ZIO_STAGE_READY;
1098 		}
1099 
1100 		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
1101 		    zio, TQ_SLEEP);
1102 	}
1103 	mutex_exit(&spa->spa_zio_lock);
1104 
1105 	/*
1106 	 * Wait for the taskqs to finish and recheck the pool state since
1107 	 * it's possible that a resumed I/O has failed again.
1108 	 */
1109 	taskq_wait(zio_taskq);
1110 	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
1111 		return (EIO);
1112 
1113 	mutex_enter(&spa->spa_zio_lock);
1114 	cv_broadcast(&spa->spa_zio_cv);
1115 	mutex_exit(&spa->spa_zio_lock);
1116 
1117 	return (0);
1118 }
1119 
1120 static int
1121 zio_vdev_suspend_io(zio_t *zio)
1122 {
1123 	spa_t *spa = zio->io_spa;
1124 
1125 	/*
1126 	 * We've experienced an unrecoverable failure so
1127 	 * set the pool state accordingly and queue all
1128 	 * failed IOs.
1129 	 */
1130 	spa->spa_state = POOL_STATE_IO_FAILURE;
1131 
1132 	mutex_enter(&spa->spa_zio_lock);
1133 	list_insert_tail(&spa->spa_zio_list, zio);
1134 
1135 #ifndef _KERNEL
1136 	/* Used to notify ztest that the pool has suspended */
1137 	cv_broadcast(&spa->spa_zio_cv);
1138 #endif
1139 	mutex_exit(&spa->spa_zio_lock);
1140 
1141 	return (ZIO_PIPELINE_STOP);
1142 }
1143 
1144 static void
1145 zio_handle_io_failure(zio_t *zio, vdev_t *vd)
1146 {
1147 	spa_t *spa = zio->io_spa;
1148 	blkptr_t *bp = zio->io_bp;
1149 	char *blkbuf;
1150 
1151 #ifdef ZFS_DEBUG
1152 	blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
1153 	if (blkbuf) {
1154 		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
1155 		    bp ? bp : &zio->io_bp_copy);
1156 	}
1157 	cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
1158 	    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
1159 	    zio_type_name[zio->io_type], vdev_description(vd),
1160 	    (u_longlong_t)zio->io_offset, (void *)zio,
1161 	    blkbuf ? blkbuf : "", zio->io_error);
1162 	if (blkbuf)
1163 		kmem_free(blkbuf, BP_SPRINTF_LEN);
1164 #endif
1165 
1166 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
1167 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1168 		    "failure and the failure mode property for this pool "
1169 		    "is set to panic.", spa_name(spa));
1170 	}
1171 	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1172 	vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE,
1173 	    VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE);
1174 }
1175 
1176 static int
1177 zio_assess(zio_t *zio)
1178 {
1179 	spa_t *spa = zio->io_spa;
1180 	blkptr_t *bp = zio->io_bp;
1181 	vdev_t *vd = zio->io_vd;
1182 
1183 	ASSERT(zio->io_children_notready == 0);
1184 	ASSERT(zio->io_children_notdone == 0);
1185 
1186 	if (bp != NULL) {
1187 		ASSERT(bp->blk_pad[0] == 0);
1188 		ASSERT(bp->blk_pad[1] == 0);
1189 		ASSERT(bp->blk_pad[2] == 0);
1190 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
1191 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
1192 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
1193 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
1194 			if (zio->io_ndvas != 0)
1195 				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
1196 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
1197 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
1198 		}
1199 	}
1200 
1201 	/*
1202 	 * Some child I/O has indicated that a retry is necessary, so
1203 	 * we set an error on the I/O and let the logic below do the
1204 	 * rest.
1205 	 */
1206 	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
1207 		zio->io_error = ERESTART;
1208 
1209 	if (vd != NULL)
1210 		vdev_stat_update(zio);
1211 
1212 	if (zio->io_error) {
1213 		/*
1214 		 * If this I/O is attached to a particular vdev,
1215 		 * generate an error message describing the I/O failure
1216 		 * at the block level.  We ignore these errors if the
1217 		 * device is currently unavailable.
1218 		 */
1219 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
1220 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
1221 
1222 		if ((zio->io_error == EIO ||
1223 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
1224 		    zio->io_logical == zio) {
1225 			/*
1226 			 * For root I/O requests, tell the SPA to log the error
1227 			 * appropriately.  Also, generate a logical data
1228 			 * ereport.
1229 			 */
1230 			spa_log_error(spa, zio);
1231 
1232 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
1233 			    0, 0);
1234 		}
1235 
1236 		/*
1237 		 * If we are an allocating I/O then we attempt to reissue
1238 		 * the I/O on another vdev unless the pool is out of space.
1239 		 * We handle this condition based on the spa's failmode
1240 		 * property.
1241 		 */
1242 		if (zio_write_retry && zio->io_error != ENOSPC &&
1243 		    IO_IS_ALLOCATING(zio))
1244 			return (zio_vdev_retry_io(zio));
1245 
1246 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1247 
1248 		/*
1249 		 * For I/O requests that cannot fail, we carry out
1250 		 * the requested behavior based on the failmode pool
1251 		 * property.
1252 		 *
1253 		 * XXX - Need to differentiate between an ENOSPC as
1254 		 * a result of vdev failures vs. a full pool.
1255 		 */
1256 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
1257 			int i;
1258 
1259 			for (i = 0; i < zio->io_failed_vds_count; i++) {
1260 				zio_handle_io_failure(zio,
1261 				    zio->io_failed_vds[i]);
1262 			}
1263 			if (zio->io_failed_vds_count == 0) {
1264 				zio_handle_io_failure(zio,
1265 				    vd ? vd : spa->spa_root_vdev);
1266 			}
1267 			if (zio->io_failed_vds != NULL) {
1268 				kmem_free(zio->io_failed_vds,
1269 				    zio->io_failed_vds_count *
1270 				    sizeof (vdev_t *));
1271 				zio->io_failed_vds = NULL;
1272 				zio->io_failed_vds_count = 0;
1273 			}
1274 			return (zio_vdev_suspend_io(zio));
1275 		}
1276 	}
1277 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1278 	ASSERT(zio->io_children_notready == 0);
1279 
1280 	return (ZIO_PIPELINE_CONTINUE);
1281 }
1282 
1283 static int
1284 zio_done(zio_t *zio)
1285 {
1286 	zio_t *pio = zio->io_parent;
1287 	spa_t *spa = zio->io_spa;
1288 
1289 	ASSERT(zio->io_children_notready == 0);
1290 	ASSERT(zio->io_children_notdone == 0);
1291 
1292 	zio_clear_transform_stack(zio);
1293 
1294 	if (zio->io_done)
1295 		zio->io_done(zio);
1296 
1297 	ASSERT(zio->io_delegate_list == NULL);
1298 	ASSERT(zio->io_delegate_next == NULL);
1299 
1300 	if (pio != NULL) {
1301 		zio_t *next, *prev;
1302 
1303 		mutex_enter(&pio->io_lock);
1304 		next = zio->io_sibling_next;
1305 		prev = zio->io_sibling_prev;
1306 		if (next != NULL)
1307 			next->io_sibling_prev = prev;
1308 		if (prev != NULL)
1309 			prev->io_sibling_next = next;
1310 		if (pio->io_child == zio)
1311 			pio->io_child = next;
1312 		mutex_exit(&pio->io_lock);
1313 
1314 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
1315 		    &pio->io_children_notdone);
1316 	}
1317 
1318 	/*
1319 	 * Note: this I/O is now done, and will shortly be freed, so there is no
1320 	 * need to clear this (or any other) flag.
1321 	 */
1322 	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
1323 		spa_config_exit(spa, zio);
1324 
1325 	if (zio->io_waiter != NULL) {
1326 		mutex_enter(&zio->io_lock);
1327 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1328 		zio->io_stalled = zio->io_stage;
1329 		cv_broadcast(&zio->io_cv);
1330 		mutex_exit(&zio->io_lock);
1331 	} else {
1332 		zio_destroy(zio);
1333 	}
1334 
1335 	return (ZIO_PIPELINE_STOP);
1336 }
1337 
1338 /*
1339  * ==========================================================================
1340  * Compression support
1341  * ==========================================================================
1342  */
1343 static int
1344 zio_write_compress(zio_t *zio)
1345 {
1346 	int compress = zio->io_compress;
1347 	blkptr_t *bp = zio->io_bp;
1348 	void *cbuf;
1349 	uint64_t lsize = zio->io_size;
1350 	uint64_t csize = lsize;
1351 	uint64_t cbufsize = 0;
1352 	int pass;
1353 
1354 	if (bp->blk_birth == zio->io_txg) {
1355 		/*
1356 		 * We're rewriting an existing block, which means we're
1357 		 * working on behalf of spa_sync().  For spa_sync() to
1358 		 * converge, it must eventually be the case that we don't
1359 		 * have to allocate new blocks.  But compression changes
1360 		 * the blocksize, which forces a reallocate, and makes
1361 		 * convergence take longer.  Therefore, after the first
1362 		 * few passes, stop compressing to ensure convergence.
1363 		 */
1364 		pass = spa_sync_pass(zio->io_spa);
1365 		if (pass > zio_sync_pass.zp_dontcompress)
1366 			compress = ZIO_COMPRESS_OFF;
1367 	} else {
1368 		ASSERT(BP_IS_HOLE(bp));
1369 		pass = 1;
1370 	}
1371 
1372 	if (compress != ZIO_COMPRESS_OFF)
1373 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
1374 		    &cbuf, &csize, &cbufsize))
1375 			compress = ZIO_COMPRESS_OFF;
1376 
1377 	if (compress != ZIO_COMPRESS_OFF && csize != 0)
1378 		zio_push_transform(zio, cbuf, csize, cbufsize);
1379 
1380 	/*
1381 	 * The final pass of spa_sync() must be all rewrites, but the first
1382 	 * few passes offer a trade-off: allocating blocks defers convergence,
1383 	 * but newly allocated blocks are sequential, so they can be written
1384 	 * to disk faster.  Therefore, we allow the first few passes of
1385 	 * spa_sync() to reallocate new blocks, but force rewrites after that.
1386 	 * There should only be a handful of blocks after pass 1 in any case.
1387 	 */
1388 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
1389 	    pass > zio_sync_pass.zp_rewrite) {
1390 		ASSERT(csize != 0);
1391 		BP_SET_LSIZE(bp, lsize);
1392 		BP_SET_COMPRESS(bp, compress);
1393 		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
1394 	} else {
1395 		if (bp->blk_birth == zio->io_txg)
1396 			BP_ZERO(bp);
1397 		if (csize == 0) {
1398 			BP_ZERO(bp);
1399 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
1400 		} else {
1401 			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1402 			BP_SET_LSIZE(bp, lsize);
1403 			BP_SET_PSIZE(bp, csize);
1404 			BP_SET_COMPRESS(bp, compress);
1405 		}
1406 	}
1407 
1408 	return (ZIO_PIPELINE_CONTINUE);
1409 }
1410 
1411 static int
1412 zio_read_decompress(zio_t *zio)
1413 {
1414 	blkptr_t *bp = zio->io_bp;
1415 	void *data;
1416 	uint64_t size;
1417 	uint64_t bufsize;
1418 	int compress = BP_GET_COMPRESS(bp);
1419 
1420 	ASSERT(compress != ZIO_COMPRESS_OFF);
1421 
1422 	zio_pop_transform(zio, &data, &size, &bufsize);
1423 
1424 	if (zio_decompress_data(compress, data, size,
1425 	    zio->io_data, zio->io_size))
1426 		zio->io_error = EIO;
1427 
1428 	zio_buf_free(data, bufsize);
1429 
1430 	return (ZIO_PIPELINE_CONTINUE);
1431 }
1432 
1433 /*
1434  * ==========================================================================
1435  * Gang block support
1436  * ==========================================================================
1437  */
1438 static void
1439 zio_gang_byteswap(zio_t *zio)
1440 {
1441 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1442 
1443 	if (BP_SHOULD_BYTESWAP(zio->io_bp))
1444 		byteswap_uint64_array(zio->io_data, zio->io_size);
1445 }
1446 
1447 static int
1448 zio_get_gang_header(zio_t *zio)
1449 {
1450 	blkptr_t *bp = zio->io_bp;
1451 	uint64_t gsize = SPA_GANGBLOCKSIZE;
1452 	void *gbuf = zio_buf_alloc(gsize);
1453 
1454 	ASSERT(BP_IS_GANG(bp));
1455 
1456 	zio_push_transform(zio, gbuf, gsize, gsize);
1457 
1458 	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
1459 	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
1460 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1461 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
1462 
1463 	return (zio_wait_for_children_done(zio));
1464 }
1465 
1466 static int
1467 zio_read_gang_members(zio_t *zio)
1468 {
1469 	zio_gbh_phys_t *gbh;
1470 	uint64_t gsize, gbufsize, loff, lsize;
1471 	int i;
1472 
1473 	ASSERT(BP_IS_GANG(zio->io_bp));
1474 
1475 	zio_gang_byteswap(zio);
1476 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1477 
1478 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1479 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1480 		lsize = BP_GET_PSIZE(gbp);
1481 
1482 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1483 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1484 		ASSERT3U(loff + lsize, <=, zio->io_size);
1485 		ASSERT(i < SPA_GBH_NBLKPTRS);
1486 		ASSERT(!BP_IS_HOLE(gbp));
1487 
1488 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
1489 		    (char *)zio->io_data + loff, lsize,
1490 		    NULL, NULL, zio->io_priority,
1491 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
1492 	}
1493 
1494 	zio_buf_free(gbh, gbufsize);
1495 
1496 	return (zio_wait_for_children_done(zio));
1497 }
1498 
1499 static int
1500 zio_rewrite_gang_members(zio_t *zio)
1501 {
1502 	zio_gbh_phys_t *gbh;
1503 	uint64_t gsize, gbufsize, loff, lsize;
1504 	int i;
1505 
1506 	ASSERT(BP_IS_GANG(zio->io_bp));
1507 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1508 
1509 	zio_gang_byteswap(zio);
1510 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1511 
1512 	ASSERT(gsize == gbufsize);
1513 
1514 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1515 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1516 		lsize = BP_GET_PSIZE(gbp);
1517 
1518 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1519 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1520 		ASSERT3U(loff + lsize, <=, zio->io_size);
1521 		ASSERT(i < SPA_GBH_NBLKPTRS);
1522 		ASSERT(!BP_IS_HOLE(gbp));
1523 
1524 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
1525 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
1526 		    NULL, NULL, zio->io_priority,
1527 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
1528 	}
1529 
1530 	zio_push_transform(zio, gbh, gsize, gbufsize);
1531 
1532 	return (zio_wait_for_children_ready(zio));
1533 }
1534 
1535 static int
1536 zio_free_gang_members(zio_t *zio)
1537 {
1538 	zio_gbh_phys_t *gbh;
1539 	uint64_t gsize, gbufsize;
1540 	int i;
1541 
1542 	ASSERT(BP_IS_GANG(zio->io_bp));
1543 
1544 	zio_gang_byteswap(zio);
1545 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1546 
1547 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1548 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1549 
1550 		if (BP_IS_HOLE(gbp))
1551 			continue;
1552 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
1553 		    gbp, NULL, NULL));
1554 	}
1555 
1556 	zio_buf_free(gbh, gbufsize);
1557 
1558 	return (ZIO_PIPELINE_CONTINUE);
1559 }
1560 
1561 static int
1562 zio_claim_gang_members(zio_t *zio)
1563 {
1564 	zio_gbh_phys_t *gbh;
1565 	uint64_t gsize, gbufsize;
1566 	int i;
1567 
1568 	ASSERT(BP_IS_GANG(zio->io_bp));
1569 
1570 	zio_gang_byteswap(zio);
1571 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1572 
1573 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1574 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1575 		if (BP_IS_HOLE(gbp))
1576 			continue;
1577 		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
1578 		    gbp, NULL, NULL));
1579 	}
1580 
1581 	zio_buf_free(gbh, gbufsize);
1582 
1583 	return (ZIO_PIPELINE_CONTINUE);
1584 }
1585 
1586 static void
1587 zio_write_allocate_gang_member_done(zio_t *zio)
1588 {
1589 	zio_t *pio = zio->io_parent;
1590 	dva_t *cdva = zio->io_bp->blk_dva;
1591 	dva_t *pdva = pio->io_bp->blk_dva;
1592 	uint64_t asize;
1593 	int d;
1594 
1595 	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
1596 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1597 	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1598 	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1599 
1600 	mutex_enter(&pio->io_lock);
1601 	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
1602 		ASSERT(DVA_GET_GANG(&pdva[d]));
1603 		asize = DVA_GET_ASIZE(&pdva[d]);
1604 		asize += DVA_GET_ASIZE(&cdva[d]);
1605 		DVA_SET_ASIZE(&pdva[d], asize);
1606 	}
1607 	mutex_exit(&pio->io_lock);
1608 }
1609 
/*
 * Turn a write into a gang block: allocate a gang header, then cover the
 * zio's data with up to SPA_GBH_NBLKPTRS child writes whose block pointers
 * live in that header.
 *
 *	zio	the write being ganged
 *	mc	metaslab class to allocate from
 *
 * On allocation failure io_error is set and ZIO_PIPELINE_CONTINUE is
 * returned.  On success the gang header is pushed onto the transform
 * stack, ZIO_STAGE_GANG_CHECKSUM_GENERATE is added to the pipeline and
 * the result of zio_wait_for_children_done() is returned.
 */
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	/* Initial per-member allocation size: half the I/O, rounded up. */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	/* The header gets one more copy than the data, capped by the pool. */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	/* Allocate space for the gang header itself. */
	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* Mark every header DVA as pointing at a gang block. */
	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/*
	 * Each iteration fills in one header slot and issues one child
	 * covering lsize bytes of the data starting at offset loff.
	 */
	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * Try to allocate maxalloc bytes, halving the request on
		 * ENOSPC, for as long as the remaining slots at that size
		 * can still cover what is left.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			/*
			 * NOTE(review): this return neither frees gbh nor
			 * pushes it onto the transform stack.  Children
			 * issued in earlier iterations were handed pointers
			 * into gbh, so who owns the buffer on this path
			 * needs confirming before any cleanup is added.
			 */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/*
			 * The loop above left via a successful allocation:
			 * write this member into the space just allocated.
			 */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, txg,
			    gbp, (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			/*
			 * Slots of maxalloc bytes cannot cover the rest, so
			 * hand an even share of it to an allocating child
			 * write instead.
			 */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}
1705 
1706 /*
1707  * ==========================================================================
1708  * Allocate and free blocks
1709  * ==========================================================================
1710  */
1711 static int
1712 zio_dva_allocate(zio_t *zio)
1713 {
1714 	spa_t *spa = zio->io_spa;
1715 	metaslab_class_t *mc = spa->spa_normal_class;
1716 	blkptr_t *bp = zio->io_bp;
1717 	int error;
1718 
1719 	ASSERT(BP_IS_HOLE(bp));
1720 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1721 	ASSERT3U(zio->io_ndvas, >, 0);
1722 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
1723 
1724 	/*
1725 	 * For testing purposes, we force I/Os to retry. We don't allow
1726 	 * retries beyond the first pass since those I/Os are non-allocating
1727 	 * writes.
1728 	 */
1729 	if (zio_io_fail_shift &&
1730 	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
1731 	    zio_io_should_fail(zio_io_fail_shift))
1732 		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1733 
1734 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1735 
1736 	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
1737 	    zio->io_txg, NULL, B_FALSE);
1738 
1739 	if (error == 0) {
1740 		bp->blk_birth = zio->io_txg;
1741 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
1742 		return (zio_write_allocate_gang_members(zio, mc));
1743 	} else {
1744 		zio->io_error = error;
1745 	}
1746 
1747 	return (ZIO_PIPELINE_CONTINUE);
1748 }
1749 
1750 static int
1751 zio_dva_free(zio_t *zio)
1752 {
1753 	blkptr_t *bp = zio->io_bp;
1754 
1755 	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
1756 
1757 	BP_ZERO(bp);
1758 
1759 	return (ZIO_PIPELINE_CONTINUE);
1760 }
1761 
1762 static int
1763 zio_dva_claim(zio_t *zio)
1764 {
1765 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1766 
1767 	return (ZIO_PIPELINE_CONTINUE);
1768 }
1769 
1770 /*
1771  * ==========================================================================
1772  * Read and write to physical devices
1773  * ==========================================================================
1774  */
1775 
1776 static int
1777 zio_vdev_io_start(zio_t *zio)
1778 {
1779 	vdev_t *vd = zio->io_vd;
1780 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1781 	blkptr_t *bp = zio->io_bp;
1782 	uint64_t align;
1783 	spa_t *spa = zio->io_spa;
1784 
1785 	/*
1786 	 * If the pool is already in a failure state then just suspend
1787 	 * this IO until the problem is resolved. We will reissue them
1788 	 * at that time.
1789 	 */
1790 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
1791 	    zio->io_type == ZIO_TYPE_WRITE)
1792 		return (zio_vdev_suspend_io(zio));
1793 
1794 	/*
1795 	 * The mirror_ops handle multiple DVAs in a single BP
1796 	 */
1797 	if (vd == NULL)
1798 		return (vdev_mirror_ops.vdev_op_io_start(zio));
1799 
1800 	align = 1ULL << tvd->vdev_ashift;
1801 
1802 	if (zio->io_retries == 0 && vd == tvd)
1803 		zio->io_flags |= ZIO_FLAG_FAILFAST;
1804 
1805 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
1806 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
1807 		zio->io_offset += VDEV_LABEL_START_SIZE;
1808 	}
1809 
1810 	if (P2PHASE(zio->io_size, align) != 0) {
1811 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
1812 		char *abuf = zio_buf_alloc(asize);
1813 		ASSERT(vd == tvd);
1814 		if (zio->io_type == ZIO_TYPE_WRITE) {
1815 			bcopy(zio->io_data, abuf, zio->io_size);
1816 			bzero(abuf + zio->io_size, asize - zio->io_size);
1817 		}
1818 		zio_push_transform(zio, abuf, asize, asize);
1819 		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
1820 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
1821 	}
1822 
1823 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
1824 	ASSERT(P2PHASE(zio->io_size, align) == 0);
1825 	ASSERT(bp == NULL ||
1826 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
1827 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1828 
1829 	return (vd->vdev_ops->vdev_op_io_start(zio));
1830 }
1831 
1832 static int
1833 zio_vdev_io_done(zio_t *zio)
1834 {
1835 	if (zio->io_vd == NULL)
1836 		return (vdev_mirror_ops.vdev_op_io_done(zio));
1837 
1838 	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
1839 }
1840 
1841 /* XXPOLICY */
1842 boolean_t
1843 zio_should_retry(zio_t *zio)
1844 {
1845 	vdev_t *vd = zio->io_vd;
1846 
1847 	if (zio->io_error == 0)
1848 		return (B_FALSE);
1849 	if (zio->io_delegate_list != NULL)
1850 		return (B_FALSE);
1851 	if (vd != NULL) {
1852 		if (vd != vd->vdev_top)
1853 			return (B_FALSE);
1854 		if (vd->vdev_is_failing)
1855 			return (B_FALSE);
1856 	}
1857 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
1858 		return (B_FALSE);
1859 	if (zio->io_retries > 0)
1860 		return (B_FALSE);
1861 
1862 	return (B_TRUE);
1863 }
1864 
1865 static int
1866 zio_vdev_io_assess(zio_t *zio)
1867 {
1868 	vdev_t *vd = zio->io_vd;
1869 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1870 
1871 	ASSERT(zio->io_vsd == NULL);
1872 
1873 	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
1874 		void *abuf;
1875 		uint64_t asize;
1876 		ASSERT(vd == tvd);
1877 		zio_pop_transform(zio, &abuf, &asize, &asize);
1878 		if (zio->io_type == ZIO_TYPE_READ)
1879 			bcopy(abuf, zio->io_data, zio->io_size);
1880 		zio_buf_free(abuf, asize);
1881 		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
1882 	}
1883 
1884 	if (zio_injection_enabled && !zio->io_error)
1885 		zio->io_error = zio_handle_fault_injection(zio, EIO);
1886 
1887 	/*
1888 	 * If the I/O failed, determine whether we should attempt to retry it.
1889 	 */
1890 	/* XXPOLICY */
1891 	if (zio_should_retry(zio)) {
1892 		ASSERT(tvd == vd);
1893 
1894 		zio->io_retries++;
1895 		zio->io_error = 0;
1896 		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
1897 		/* XXPOLICY */
1898 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
1899 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1900 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
1901 
1902 		return (ZIO_PIPELINE_CONTINUE);
1903 	}
1904 
1905 	return (ZIO_PIPELINE_CONTINUE);
1906 }
1907 
1908 void
1909 zio_vdev_io_reissue(zio_t *zio)
1910 {
1911 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1912 	ASSERT(zio->io_error == 0);
1913 
1914 	zio->io_stage--;
1915 }
1916 
1917 void
1918 zio_vdev_io_redone(zio_t *zio)
1919 {
1920 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1921 
1922 	zio->io_stage--;
1923 }
1924 
1925 void
1926 zio_vdev_io_bypass(zio_t *zio)
1927 {
1928 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1929 	ASSERT(zio->io_error == 0);
1930 
1931 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1932 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1933 }
1934 
1935 /*
1936  * ==========================================================================
1937  * Generate and verify checksums
1938  * ==========================================================================
1939  */
1940 static int
1941 zio_checksum_generate(zio_t *zio)
1942 {
1943 	int checksum = zio->io_checksum;
1944 	blkptr_t *bp = zio->io_bp;
1945 
1946 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1947 
1948 	BP_SET_CHECKSUM(bp, checksum);
1949 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1950 
1951 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
1952 
1953 	return (ZIO_PIPELINE_CONTINUE);
1954 }
1955 
1956 static int
1957 zio_gang_checksum_generate(zio_t *zio)
1958 {
1959 	zio_cksum_t zc;
1960 	zio_gbh_phys_t *gbh = zio->io_data;
1961 
1962 	ASSERT(BP_IS_GANG(zio->io_bp));
1963 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1964 
1965 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
1966 
1967 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
1968 
1969 	return (ZIO_PIPELINE_CONTINUE);
1970 }
1971 
1972 static int
1973 zio_checksum_verify(zio_t *zio)
1974 {
1975 	if (zio->io_bp != NULL) {
1976 		zio->io_error = zio_checksum_error(zio);
1977 		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1978 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1979 			    zio->io_spa, zio->io_vd, zio, 0, 0);
1980 	}
1981 
1982 	return (ZIO_PIPELINE_CONTINUE);
1983 }
1984 
1985 /*
1986  * Called by RAID-Z to ensure we don't compute the checksum twice.
1987  */
1988 void
1989 zio_checksum_verified(zio_t *zio)
1990 {
1991 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
1992 }
1993 
1994 /*
1995  * Set the external verifier for a gang block based on stuff in the bp
1996  */
1997 void
1998 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
1999 {
2000 	blkptr_t *bp = zio->io_bp;
2001 
2002 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
2003 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
2004 	zcp->zc_word[2] = bp->blk_birth;
2005 	zcp->zc_word[3] = 0;
2006 }
2007 
2008 /*
2009  * ==========================================================================
2010  * Define the pipeline
2011  * ==========================================================================
2012  */
/*
 * A pipeline stage takes the zio being processed and returns
 * ZIO_PIPELINE_CONTINUE or ZIO_PIPELINE_STOP (see zio_execute()).
 */
typedef int zio_pipe_stage_t(zio_t *zio);

/*
 * Stage dispatch table.  zio_execute() calls zio_pipeline[zio->io_stage],
 * so the position of each entry has to agree with the numeric value of
 * the corresponding ZIO_STAGE_* constant (presumably declared in
 * zio_impl.h -- confirm there before reordering).  The first and last
 * slots are NULL; zio_execute() pre-increments io_stage before each
 * dispatch.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL
};
2042 
2043 /*
2044  * Execute the I/O pipeline until one of the following occurs:
2045  * (1) the I/O completes; (2) the pipeline stalls waiting for
2046  * dependent child I/Os; (3) the I/O issues, so we're waiting
2047  * for an I/O completion interrupt; (4) the I/O is delegated by
2048  * vdev-level caching or aggregation; (5) the I/O is deferred
2049  * due to vdev-level queueing; (6) the I/O is handed off to
2050  * another thread.  In all cases, the pipeline stops whenever
2051  * there's no CPU work; it never burns a thread in cv_wait().
2052  *
2053  * There's no locking on io_stage because there's no legitimate way
2054  * for multiple threads to be attempting to process the same I/O.
2055  */
2056 void
2057 zio_execute(zio_t *zio)
2058 {
2059 	while (zio->io_stage < ZIO_STAGE_DONE) {
2060 		uint32_t pipeline = zio->io_pipeline;
2061 		int rv;
2062 
2063 		ASSERT(!MUTEX_HELD(&zio->io_lock));
2064 
2065 		/*
2066 		 * If an error occurred outside the vdev stack,
2067 		 * just execute the interlock stages to clean up.
2068 		 */
2069 		if (zio->io_error &&
2070 		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
2071 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
2072 
2073 		while (((1U << ++zio->io_stage) & pipeline) == 0)
2074 			continue;
2075 
2076 		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
2077 		ASSERT(zio->io_stalled == 0);
2078 
2079 		rv = zio_pipeline[zio->io_stage](zio);
2080 
2081 		if (rv == ZIO_PIPELINE_STOP)
2082 			return;
2083 
2084 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
2085 	}
2086 }
2087 
2088 static boolean_t
2089 zio_io_should_fail(uint16_t range)
2090 {
2091 	static uint16_t	allocs = 0;
2092 
2093 	return (P2PHASE(allocs++, 1U<<range) == 0);
2094 }
2095 
2096 /*
2097  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2098  */
2099 int
2100 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
2101     uint64_t txg)
2102 {
2103 	int error;
2104 
2105 	spa_config_enter(spa, RW_READER, FTAG);
2106 
2107 	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
2108 		spa_config_exit(spa, FTAG);
2109 		return (ENOSPC);
2110 	}
2111 
2112 	/*
2113 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
2114 	 * We use that as a hint for which vdev to allocate from next.
2115 	 */
2116 	error = metaslab_alloc(spa, spa->spa_log_class, size,
2117 	    new_bp, 1, txg, old_bp, B_TRUE);
2118 
2119 	if (error)
2120 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
2121 		    new_bp, 1, txg, old_bp, B_TRUE);
2122 
2123 	if (error == 0) {
2124 		BP_SET_LSIZE(new_bp, size);
2125 		BP_SET_PSIZE(new_bp, size);
2126 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2127 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
2128 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2129 		BP_SET_LEVEL(new_bp, 0);
2130 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2131 		new_bp->blk_birth = txg;
2132 	}
2133 
2134 	spa_config_exit(spa, FTAG);
2135 
2136 	return (error);
2137 }
2138 
2139 /*
2140  * Free an intent log block.  We know it can't be a gang block, so there's
2141  * nothing to do except metaslab_free() it.
2142  */
2143 void
2144 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
2145 {
2146 	ASSERT(!BP_IS_GANG(bp));
2147 
2148 	spa_config_enter(spa, RW_READER, FTAG);
2149 
2150 	metaslab_free(spa, bp, txg, B_FALSE);
2151 
2152 	spa_config_exit(spa, FTAG);
2153 }
2154 
2155 /*
2156  * start an async flush of the write cache for this vdev
2157  */
2158 void
2159 zio_flush(zio_t *zio, vdev_t *vd)
2160 {
2161 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
2162 	    NULL, NULL, ZIO_PRIORITY_NOW,
2163 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
2164 }
2165