xref: /titanic_44/usr/src/uts/common/fs/zfs/zio.c (revision fe598cdcd847f8359013532d5c691bb6190378c0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/fm/fs/zfs.h>
30 #include <sys/spa.h>
31 #include <sys/txg.h>
32 #include <sys/spa_impl.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zio_impl.h>
35 #include <sys/zio_compress.h>
36 #include <sys/zio_checksum.h>
37 
38 /*
39  * ==========================================================================
40  * I/O priority table
41  * ==========================================================================
42  */
43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
44 	0,	/* ZIO_PRIORITY_NOW		*/
45 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
46 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
47 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
48 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
49 	4,	/* ZIO_PRIORITY_FREE		*/
50 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
51 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
52 	10,	/* ZIO_PRIORITY_RESILVER	*/
53 	20,	/* ZIO_PRIORITY_SCRUB		*/
54 };
55 
56 /*
57  * ==========================================================================
58  * I/O type descriptions
59  * ==========================================================================
60  */
61 char *zio_type_name[ZIO_TYPES] = {
62 	"null", "read", "write", "free", "claim", "ioctl" };
63 
64 /* At or above this size, force gang blocking - for testing */
65 uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
66 
67 /* When non-zero, fail 1 in 2^shift ZIL block allocations - for testing */
68 uint16_t zio_zil_fail_shift = 0;
69 
70 typedef struct zio_sync_pass {
71 	int	zp_defer_free;		/* defer frees after this pass */
72 	int	zp_dontcompress;	/* don't compress after this pass */
73 	int	zp_rewrite;		/* rewrite new bps after this pass */
74 } zio_sync_pass_t;
75 
76 zio_sync_pass_t zio_sync_pass = {
77 	1,	/* zp_defer_free */
78 	4,	/* zp_dontcompress */
79 	1,	/* zp_rewrite */
80 };
81 
82 /*
83  * ==========================================================================
84  * I/O kmem caches
85  * ==========================================================================
86  */
87 kmem_cache_t *zio_cache;
88 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
89 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
90 
91 #ifdef _KERNEL
92 extern vmem_t *zio_alloc_arena;
93 #endif
94 
95 void
96 zio_init(void)
97 {
98 	size_t c;
99 	vmem_t *data_alloc_arena = NULL;
100 
101 #ifdef _KERNEL
102 	data_alloc_arena = zio_alloc_arena;
103 #endif
104 
105 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
106 	    NULL, NULL, NULL, NULL, NULL, 0);
107 
108 	/*
109 	 * For small buffers, we want a cache for each multiple of
110 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
111 	 * for each quarter-power of 2.  For large buffers, we want
112 	 * a cache for each multiple of PAGESIZE.
113 	 */
114 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
115 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
116 		size_t p2 = size;
117 		size_t align = 0;
118 
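		/* Round p2 down to the largest power of two <= size. */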
119 		while (p2 & (p2 - 1))
120 			p2 &= p2 - 1;
121 
122 		if (size <= 4 * SPA_MINBLOCKSIZE) {
123 			align = SPA_MINBLOCKSIZE;
124 		} else if (P2PHASE(size, PAGESIZE) == 0) {
125 			align = PAGESIZE;
126 		} else if (P2PHASE(size, p2 >> 2) == 0) {
127 			align = p2 >> 2;
128 		}
129 
130 		if (align != 0) {
131 			char name[36];
132 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
133 			zio_buf_cache[c] = kmem_cache_create(name, size,
134 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
135 
136 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
137 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
138 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
139 			    KMC_NODEBUG);
140 
141 			dprintf("creating cache for size %5lx align %5lx\n",
142 			    size, align);
143 		}
144 	}
145 
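	/*
	 * Fill in any gaps in the cache array: sizes without a dedicated
	 * cache share the next-larger cache, so every allocation size maps
	 * to some cache.
	 */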
146 	while (--c != 0) {
147 		ASSERT(zio_buf_cache[c] != NULL);
148 		if (zio_buf_cache[c - 1] == NULL)
149 			zio_buf_cache[c - 1] = zio_buf_cache[c];
150 
151 		ASSERT(zio_data_buf_cache[c] != NULL);
152 		if (zio_data_buf_cache[c - 1] == NULL)
153 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
154 	}
155 
156 	zio_inject_init();
157 }
158 
159 void
160 zio_fini(void)
161 {
162 	size_t c;
163 	kmem_cache_t *last_cache = NULL;
164 	kmem_cache_t *last_data_cache = NULL;
165 
166 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
167 		if (zio_buf_cache[c] != last_cache) {
168 			last_cache = zio_buf_cache[c];
169 			kmem_cache_destroy(zio_buf_cache[c]);
170 		}
171 		zio_buf_cache[c] = NULL;
172 
173 		if (zio_data_buf_cache[c] != last_data_cache) {
174 			last_data_cache = zio_data_buf_cache[c];
175 			kmem_cache_destroy(zio_data_buf_cache[c]);
176 		}
177 		zio_data_buf_cache[c] = NULL;
178 	}
179 
180 	kmem_cache_destroy(zio_cache);
181 
182 	zio_inject_fini();
183 }
184 
185 /*
186  * ==========================================================================
187  * Allocate and free I/O buffers
188  * ==========================================================================
189  */
190 
191 /*
192  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
193  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
194  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
195  * excess / transient data in-core during a crashdump.
196  */
197 void *
198 zio_buf_alloc(size_t size)
199 {
200 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
201 
202 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
203 
204 	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
205 }
206 
207 /*
208  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
209  * crashdump if the kernel panics.  This limits the amount of ZFS data that
210  * shows up in a kernel crashdump, thus reducing the amount of kernel heap
211  * dumped to disk when the kernel panics.
212  */
213 void *
214 zio_data_buf_alloc(size_t size)
215 {
216 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
217 
218 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
219 
220 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
221 }
222 
223 void
224 zio_buf_free(void *buf, size_t size)
225 {
226 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
227 
228 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
229 
230 	kmem_cache_free(zio_buf_cache[c], buf);
231 }
232 
233 void
234 zio_data_buf_free(void *buf, size_t size)
235 {
236 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
237 
238 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
239 
240 	kmem_cache_free(zio_data_buf_cache[c], buf);
241 }
242 
243 /*
244  * ==========================================================================
245  * Push and pop I/O transform buffers
246  * ==========================================================================
247  */
248 static void
249 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
250 {
251 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
252 
253 	zt->zt_data = data;
254 	zt->zt_size = size;
255 	zt->zt_bufsize = bufsize;
256 
257 	zt->zt_next = zio->io_transform_stack;
258 	zio->io_transform_stack = zt;
259 
260 	zio->io_data = data;
261 	zio->io_size = size;
262 }
263 
264 static void
265 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
266 {
267 	zio_transform_t *zt = zio->io_transform_stack;
268 
269 	*data = zt->zt_data;
270 	*size = zt->zt_size;
271 	*bufsize = zt->zt_bufsize;
272 
273 	zio->io_transform_stack = zt->zt_next;
274 	kmem_free(zt, sizeof (zio_transform_t));
275 
276 	if ((zt = zio->io_transform_stack) != NULL) {
277 		zio->io_data = zt->zt_data;
278 		zio->io_size = zt->zt_size;
279 	}
280 }
281 
282 static void
283 zio_clear_transform_stack(zio_t *zio)
284 {
285 	void *data;
286 	uint64_t size, bufsize;
287 
288 	ASSERT(zio->io_transform_stack != NULL);
289 
290 	zio_pop_transform(zio, &data, &size, &bufsize);
291 	while (zio->io_transform_stack != NULL) {
292 		zio_buf_free(data, bufsize);
293 		zio_pop_transform(zio, &data, &size, &bufsize);
294 	}
295 }
296 
297 /*
298  * ==========================================================================
299  * Create the various types of I/O (read, write, free)
300  * ==========================================================================
301  */
302 static zio_t *
303 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
304     void *data, uint64_t size, zio_done_func_t *done, void *private,
305     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
306 {
307 	zio_t *zio;
308 
309 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
310 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
311 
312 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
313 	bzero(zio, sizeof (zio_t));
314 	zio->io_parent = pio;
315 	zio->io_spa = spa;
316 	zio->io_txg = txg;
317 	zio->io_flags = flags;
318 	if (bp != NULL) {
319 		zio->io_bp = bp;
320 		zio->io_bp_copy = *bp;
321 		zio->io_bp_orig = *bp;
322 		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
323 		    BP_GET_LEVEL(bp) != 0)
324 			zio->io_flags |= ZIO_FLAG_METADATA;
325 	}
326 	zio->io_done = done;
327 	zio->io_private = private;
328 	zio->io_type = type;
329 	zio->io_priority = priority;
330 	zio->io_stage = stage;
331 	zio->io_pipeline = pipeline;
332 	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
333 	zio->io_timestamp = lbolt64;
334 	if (pio != NULL)
335 		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
336 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
337 	zio_push_transform(zio, data, size, size);
338 
339 	/*
340 	 * Note on config lock:
341 	 *
342 	 * If CONFIG_HELD is set, then the caller already has the config
343 	 * lock, so we don't need it for this io.
344 	 *
345 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
346 	 * config lock on behalf of this io, so it should be released
347 	 * in zio_done.
348 	 *
349 	 * Unless CONFIG_HELD is set, we will grab the config lock for
350 	 * any top-level (parent-less) io, *except* NULL top-level ios.
351 	 * The NULL top-level ios rarely have any children, so we delay
352 	 * grabbing the lock until the first child is added (but it is
353 	 * still grabbed on behalf of the top-level i/o, so additional
354 	 * children don't need to also grab it).  This greatly reduces
355 	 * contention on the config lock.
356 	 */
357 	if (pio == NULL) {
358 		if (type != ZIO_TYPE_NULL &&
359 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
360 			spa_config_enter(zio->io_spa, RW_READER, zio);
361 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
362 		}
363 		zio->io_root = zio;
364 	} else {
365 		zio->io_root = pio->io_root;
366 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
367 			zio->io_logical = pio->io_logical;
368 		mutex_enter(&pio->io_lock);
369 		if (pio->io_parent == NULL &&
370 		    pio->io_type == ZIO_TYPE_NULL &&
371 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
372 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
373 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
374 			spa_config_enter(zio->io_spa, RW_READER, pio);
375 		}
376 		if (stage < ZIO_STAGE_READY)
377 			pio->io_children_notready++;
378 		pio->io_children_notdone++;
379 		zio->io_sibling_next = pio->io_child;
380 		zio->io_sibling_prev = NULL;
381 		if (pio->io_child != NULL)
382 			pio->io_child->io_sibling_prev = zio;
383 		pio->io_child = zio;
384 		zio->io_ndvas = pio->io_ndvas;
385 		mutex_exit(&pio->io_lock);
386 	}
387 
388 	return (zio);
389 }
390 
391 zio_t *
392 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
393 	int flags)
394 {
395 	zio_t *zio;
396 
397 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
398 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
399 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
400 
401 	return (zio);
402 }
403 
404 zio_t *
405 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
406 {
407 	return (zio_null(NULL, spa, done, private, flags));
408 }
409 
410 zio_t *
411 zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
412     uint64_t size, zio_done_func_t *done, void *private,
413     int priority, int flags, zbookmark_t *zb)
414 {
415 	zio_t *zio;
416 
417 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
418 
419 	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
420 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
421 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
422 	zio->io_bookmark = *zb;
423 
424 	zio->io_logical = zio;
425 
426 	/*
427 	 * Work off our copy of the bp so the caller can free it.
428 	 */
429 	zio->io_bp = &zio->io_bp_copy;
430 
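	/*
	 * If the block is compressed on disk, read into a scratch buffer of
	 * the physical size and add a decompress stage to the pipeline; gang
	 * blocks are likewise read via their gang header first.
	 */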
431 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
432 		uint64_t csize = BP_GET_PSIZE(bp);
433 		void *cbuf = zio_buf_alloc(csize);
434 
435 		zio_push_transform(zio, cbuf, csize, csize);
436 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
437 	}
438 
439 	if (BP_IS_GANG(bp)) {
440 		uint64_t gsize = SPA_GANGBLOCKSIZE;
441 		void *gbuf = zio_buf_alloc(gsize);
442 
443 		zio_push_transform(zio, gbuf, gsize, gsize);
444 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
445 	}
446 
447 	return (zio);
448 }
449 
450 zio_t *
451 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
452     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
453     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
454     int flags, zbookmark_t *zb)
455 {
456 	zio_t *zio;
457 
458 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
459 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
460 
461 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
462 	    compress < ZIO_COMPRESS_FUNCTIONS);
463 
464 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
465 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
466 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
467 
468 	zio->io_ready = ready;
469 
470 	zio->io_bookmark = *zb;
471 
472 	zio->io_logical = zio;
473 
474 	zio->io_checksum = checksum;
475 	zio->io_compress = compress;
476 	zio->io_ndvas = ncopies;
477 
478 	if (compress != ZIO_COMPRESS_OFF)
479 		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
480 
481 	if (bp->blk_birth != txg) {
482 		/* XXX the bp usually (always?) gets re-zeroed later */
483 		BP_ZERO(bp);
484 		BP_SET_LSIZE(bp, size);
485 		BP_SET_PSIZE(bp, size);
486 	} else {
487 		/* Make sure someone doesn't change their mind on overwrites */
488 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
489 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
490 	}
491 
492 	return (zio);
493 }
494 
495 zio_t *
496 zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
497     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
498     zio_done_func_t *done, void *private, int priority, int flags,
499     zbookmark_t *zb)
500 {
501 	zio_t *zio;
502 
503 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
504 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
505 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
506 
507 	zio->io_bookmark = *zb;
508 	zio->io_checksum = checksum;
509 	zio->io_compress = ZIO_COMPRESS_OFF;
510 
511 	if (pio != NULL)
512 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
513 
514 	return (zio);
515 }
516 
517 static zio_t *
518 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
519     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
520     zio_done_func_t *done, void *private, int priority, int flags)
521 {
522 	zio_t *zio;
523 
524 	BP_ZERO(bp);
525 	BP_SET_LSIZE(bp, size);
526 	BP_SET_PSIZE(bp, size);
527 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
528 
529 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
530 	    ZIO_TYPE_WRITE, priority, flags,
531 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
532 
533 	zio->io_checksum = checksum;
534 	zio->io_compress = ZIO_COMPRESS_OFF;
535 
536 	return (zio);
537 }
538 
539 zio_t *
540 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
541     zio_done_func_t *done, void *private)
542 {
543 	zio_t *zio;
544 
545 	ASSERT(!BP_IS_HOLE(bp));
546 
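	/*
	 * Past the defer-free sync pass, queue this free on the sync bplist
	 * to be processed later rather than issuing a free zio now.
	 */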
547 	if (txg == spa->spa_syncing_txg &&
548 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
549 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
550 		return (zio_null(pio, spa, NULL, NULL, 0));
551 	}
552 
553 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
554 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
555 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
556 
557 	zio->io_bp = &zio->io_bp_copy;
558 
559 	return (zio);
560 }
561 
562 zio_t *
563 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
564     zio_done_func_t *done, void *private)
565 {
566 	zio_t *zio;
567 
568 	/*
569 	 * A claim is an allocation of a specific block.  Claims are needed
570 	 * to support immediate writes in the intent log.  The issue is that
571 	 * immediate writes contain committed data, but in a txg that was
572 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
573 	 * the intent log claims all blocks that contain immediate write data
574 	 * so that the SPA knows they're in use.
575 	 *
576 	 * All claims *must* be resolved in the first txg -- before the SPA
577 	 * starts allocating blocks -- so that nothing is allocated twice.
578 	 */
579 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
580 	ASSERT3U(spa_first_txg(spa), <=, txg);
581 
582 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
583 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
584 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
585 
586 	zio->io_bp = &zio->io_bp_copy;
587 
588 	return (zio);
589 }
590 
591 zio_t *
592 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
593     zio_done_func_t *done, void *private, int priority, int flags)
594 {
595 	zio_t *zio;
596 	int c;
597 
598 	if (vd->vdev_children == 0) {
599 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
600 		    ZIO_TYPE_IOCTL, priority, flags,
601 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
602 
603 		zio->io_vd = vd;
604 		zio->io_cmd = cmd;
605 	} else {
606 		zio = zio_null(pio, spa, NULL, NULL, flags);
607 
608 		for (c = 0; c < vd->vdev_children; c++)
609 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
610 			    done, private, priority, flags));
611 	}
612 
613 	return (zio);
614 }
615 
616 static void
617 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
618     int checksum)
619 {
620 	ASSERT(vd->vdev_children == 0);
621 
622 	ASSERT(size <= SPA_MAXBLOCKSIZE);
623 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
624 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
625 
626 	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
627 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
628 	ASSERT3U(offset + size, <=, vd->vdev_psize);
629 
630 	BP_ZERO(bp);
631 
632 	BP_SET_LSIZE(bp, size);
633 	BP_SET_PSIZE(bp, size);
634 
635 	BP_SET_CHECKSUM(bp, checksum);
636 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
637 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
638 
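	/* The device offset serves as the checksum verifier for physical I/O. */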
639 	if (checksum != ZIO_CHECKSUM_OFF)
640 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
641 }
642 
643 zio_t *
644 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
645     void *data, int checksum, zio_done_func_t *done, void *private,
646     int priority, int flags)
647 {
648 	zio_t *zio;
649 	blkptr_t blk;
650 
651 	zio_phys_bp_init(vd, &blk, offset, size, checksum);
652 
653 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
654 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
655 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
656 
657 	zio->io_vd = vd;
658 	zio->io_offset = offset;
659 
660 	/*
661 	 * Work off our copy of the bp so the caller can free it.
662 	 */
663 	zio->io_bp = &zio->io_bp_copy;
664 
665 	return (zio);
666 }
667 
668 zio_t *
669 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
670     void *data, int checksum, zio_done_func_t *done, void *private,
671     int priority, int flags)
672 {
673 	zio_block_tail_t *zbt;
674 	void *wbuf;
675 	zio_t *zio;
676 	blkptr_t blk;
677 
678 	zio_phys_bp_init(vd, &blk, offset, size, checksum);
679 
680 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
681 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
682 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
683 
684 	zio->io_vd = vd;
685 	zio->io_offset = offset;
686 
687 	zio->io_bp = &zio->io_bp_copy;
688 	zio->io_checksum = checksum;
689 
690 	if (zio_checksum_table[checksum].ci_zbt) {
691 		/*
692 		 * zbt checksums are necessarily destructive -- they modify
693 		 * one word of the write buffer to hold the verifier/checksum.
694 		 * Therefore, we must make a local copy in case the data is
695 		 * being written to multiple places.
696 		 */
697 		wbuf = zio_buf_alloc(size);
698 		bcopy(data, wbuf, size);
699 		zio_push_transform(zio, wbuf, size, size);
700 
701 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
702 		zbt->zbt_cksum = blk.blk_cksum;
703 	}
704 
705 	return (zio);
706 }
707 
708 /*
709  * Create a child I/O to do some work for us.  It has no associated bp.
710  */
711 zio_t *
712 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
713 	void *data, uint64_t size, int type, int priority, int flags,
714 	zio_done_func_t *done, void *private)
715 {
716 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
717 	zio_t *cio;
718 
719 	if (type == ZIO_TYPE_READ && bp != NULL) {
720 		/*
721 		 * If we have the bp, then the child should perform the
722 		 * checksum and the parent need not.  This pushes error
723 		 * detection as close to the leaves as possible and
724 		 * eliminates redundant checksums in the interior nodes.
725 		 */
726 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
727 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
728 	}
729 
730 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
731 	    done, private, type, priority,
732 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
733 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
734 
735 	cio->io_vd = vd;
736 	cio->io_offset = offset;
737 
738 	return (cio);
739 }
740 
741 /*
742  * ==========================================================================
743  * Initiate I/O, either sync or async
744  * ==========================================================================
745  */
746 int
747 zio_wait(zio_t *zio)
748 {
749 	int error;
750 
751 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
752 
753 	zio->io_waiter = curthread;
754 
755 	zio_next_stage_async(zio);
756 
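	/*
	 * zio_done() sets io_stalled to ZIO_STAGE_DONE and broadcasts io_cv
	 * once the I/O (and all of its children) has completed.
	 */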
757 	mutex_enter(&zio->io_lock);
758 	while (zio->io_stalled != ZIO_STAGE_DONE)
759 		cv_wait(&zio->io_cv, &zio->io_lock);
760 	mutex_exit(&zio->io_lock);
761 
762 	error = zio->io_error;
763 	mutex_destroy(&zio->io_lock);
764 	kmem_cache_free(zio_cache, zio);
765 
766 	return (error);
767 }
768 
769 void
770 zio_nowait(zio_t *zio)
771 {
772 	zio_next_stage_async(zio);
773 }
774 
775 /*
776  * ==========================================================================
777  * I/O pipeline interlocks: parent/child dependency scoreboarding
778  * ==========================================================================
779  */
780 static void
781 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
782 {
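	/*
	 * If there are no outstanding children, advance to the next stage
	 * now; otherwise record the stage we're stalled at so the last
	 * completing child (via zio_notify_parent()) can resume us.
	 */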
783 	mutex_enter(&zio->io_lock);
784 	if (*countp == 0) {
785 		ASSERT(zio->io_stalled == 0);
786 		mutex_exit(&zio->io_lock);
787 		zio_next_stage(zio);
788 	} else {
789 		zio->io_stalled = stage;
790 		mutex_exit(&zio->io_lock);
791 	}
792 }
793 
794 static void
795 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
796 {
797 	zio_t *pio = zio->io_parent;
798 
799 	mutex_enter(&pio->io_lock);
800 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
801 		pio->io_error = zio->io_error;
802 	if (--*countp == 0 && pio->io_stalled == stage) {
803 		pio->io_stalled = 0;
804 		mutex_exit(&pio->io_lock);
805 		zio_next_stage_async(pio);
806 	} else {
807 		mutex_exit(&pio->io_lock);
808 	}
809 }
810 
811 static void
812 zio_wait_children_ready(zio_t *zio)
813 {
814 	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
815 	    &zio->io_children_notready);
816 }
817 
818 void
819 zio_wait_children_done(zio_t *zio)
820 {
821 	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
822 	    &zio->io_children_notdone);
823 }
824 
825 static void
826 zio_ready(zio_t *zio)
827 {
828 	zio_t *pio = zio->io_parent;
829 
830 	if (zio->io_ready)
831 		zio->io_ready(zio);
832 
833 	if (pio != NULL)
834 		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
835 		    &pio->io_children_notready);
836 
837 	if (zio->io_bp)
838 		zio->io_bp_copy = *zio->io_bp;
839 
840 	zio_next_stage(zio);
841 }
842 
843 static void
844 zio_done(zio_t *zio)
845 {
846 	zio_t *pio = zio->io_parent;
847 	spa_t *spa = zio->io_spa;
848 	blkptr_t *bp = zio->io_bp;
849 	vdev_t *vd = zio->io_vd;
850 
851 	ASSERT(zio->io_children_notready == 0);
852 	ASSERT(zio->io_children_notdone == 0);
853 
854 	if (bp != NULL) {
855 		ASSERT(bp->blk_pad[0] == 0);
856 		ASSERT(bp->blk_pad[1] == 0);
857 		ASSERT(bp->blk_pad[2] == 0);
858 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
859 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
860 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
861 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
862 			if (zio->io_ndvas != 0)
863 				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
864 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
865 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
866 		}
867 	}
868 
869 	if (vd != NULL)
870 		vdev_stat_update(zio);
871 
872 	if (zio->io_error) {
873 		/*
874 		 * If this I/O is attached to a particular vdev,
875 		 * generate an error message describing the I/O failure
876 		 * at the block level.  We ignore these errors if the
877 		 * device is currently unavailable.
878 		 */
879 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
880 			zfs_ereport_post(FM_EREPORT_ZFS_IO,
881 			    zio->io_spa, vd, zio, 0, 0);
882 
883 		if ((zio->io_error == EIO ||
884 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
885 		    zio->io_logical == zio) {
886 			/*
887 			 * For root I/O requests, tell the SPA to log the error
888 			 * appropriately.  Also, generate a logical data
889 			 * ereport.
890 			 */
891 			spa_log_error(zio->io_spa, zio);
892 
893 			zfs_ereport_post(FM_EREPORT_ZFS_DATA,
894 			    zio->io_spa, NULL, zio, 0, 0);
895 		}
896 
897 		/*
898 		 * For I/O requests that cannot fail, panic appropriately.
899 		 */
900 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
901 			char *blkbuf;
902 
903 			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
904 			if (blkbuf) {
905 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
906 				    bp ? bp : &zio->io_bp_copy);
907 			}
908 			panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
909 			    "%d", zio->io_error == ECKSUM ?
910 			    "bad checksum" : "I/O failure",
911 			    zio_type_name[zio->io_type],
912 			    vdev_description(vd),
913 			    (u_longlong_t)zio->io_offset,
914 			    zio, blkbuf ? blkbuf : "", zio->io_error);
915 		}
916 	}
917 	zio_clear_transform_stack(zio);
918 
919 	if (zio->io_done)
920 		zio->io_done(zio);
921 
922 	ASSERT(zio->io_delegate_list == NULL);
923 	ASSERT(zio->io_delegate_next == NULL);
924 
925 	if (pio != NULL) {
926 		zio_t *next, *prev;
927 
928 		mutex_enter(&pio->io_lock);
929 		next = zio->io_sibling_next;
930 		prev = zio->io_sibling_prev;
931 		if (next != NULL)
932 			next->io_sibling_prev = prev;
933 		if (prev != NULL)
934 			prev->io_sibling_next = next;
935 		if (pio->io_child == zio)
936 			pio->io_child = next;
937 		mutex_exit(&pio->io_lock);
938 
939 		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
940 		    &pio->io_children_notdone);
941 	}
942 
943 	/*
944 	 * Note: this I/O is now done, and will shortly be freed, so there is no
945 	 * need to clear this (or any other) flag.
946 	 */
947 	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
948 		spa_config_exit(spa, zio);
949 
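	/*
	 * If a thread is blocked in zio_wait(), wake it and let it free
	 * the zio; otherwise nobody else will look at it, so free it here.
	 */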
950 	if (zio->io_waiter != NULL) {
951 		mutex_enter(&zio->io_lock);
952 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
953 		zio->io_stalled = zio->io_stage;
954 		cv_broadcast(&zio->io_cv);
955 		mutex_exit(&zio->io_lock);
956 	} else {
957 		kmem_cache_free(zio_cache, zio);
958 	}
959 }
960 
961 /*
962  * ==========================================================================
963  * Compression support
964  * ==========================================================================
965  */
966 static void
967 zio_write_compress(zio_t *zio)
968 {
969 	int compress = zio->io_compress;
970 	blkptr_t *bp = zio->io_bp;
971 	void *cbuf;
972 	uint64_t lsize = zio->io_size;
973 	uint64_t csize = lsize;
974 	uint64_t cbufsize = 0;
975 	int pass;
976 
977 	if (bp->blk_birth == zio->io_txg) {
978 		/*
979 		 * We're rewriting an existing block, which means we're
980 		 * working on behalf of spa_sync().  For spa_sync() to
981 		 * converge, it must eventually be the case that we don't
982 		 * have to allocate new blocks.  But compression changes
983 		 * the blocksize, which forces a reallocate, and makes
984 		 * convergence take longer.  Therefore, after the first
985 		 * few passes, stop compressing to ensure convergence.
986 		 */
987 		pass = spa_sync_pass(zio->io_spa);
988 		if (pass > zio_sync_pass.zp_dontcompress)
989 			compress = ZIO_COMPRESS_OFF;
990 	} else {
991 		ASSERT(BP_IS_HOLE(bp));
992 		pass = 1;
993 	}
994 
995 	if (compress != ZIO_COMPRESS_OFF)
996 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
997 		    &cbuf, &csize, &cbufsize))
998 			compress = ZIO_COMPRESS_OFF;
999 
1000 	if (compress != ZIO_COMPRESS_OFF && csize != 0)
1001 		zio_push_transform(zio, cbuf, csize, cbufsize);
1002 
1003 	/*
1004 	 * The final pass of spa_sync() must be all rewrites, but the first
1005 	 * few passes offer a trade-off: allocating blocks defers convergence,
1006 	 * but newly allocated blocks are sequential, so they can be written
1007 	 * to disk faster.  Therefore, we allow the first few passes of
1008 	 * spa_sync() to reallocate new blocks, but force rewrites after that.
1009 	 * There should only be a handful of blocks after pass 1 in any case.
1010 	 */
1011 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
1012 	    pass > zio_sync_pass.zp_rewrite) {
1013 		ASSERT(csize != 0);
1014 		BP_SET_LSIZE(bp, lsize);
1015 		BP_SET_COMPRESS(bp, compress);
1016 		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
1017 	} else {
1018 		if (bp->blk_birth == zio->io_txg)
1019 			BP_ZERO(bp);
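		/*
		 * A compressed size of zero means nothing needs to be
		 * written at all, so leave the bp zeroed (a hole) and skip
		 * allocation and vdev I/O.
		 */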
1020 		if (csize == 0) {
1021 			BP_ZERO(bp);
1022 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
1023 		} else {
1024 			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1025 			BP_SET_LSIZE(bp, lsize);
1026 			BP_SET_PSIZE(bp, csize);
1027 			BP_SET_COMPRESS(bp, compress);
1028 			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
1029 		}
1030 	}
1031 
1032 	zio_next_stage(zio);
1033 }
1034 
1035 static void
1036 zio_read_decompress(zio_t *zio)
1037 {
1038 	blkptr_t *bp = zio->io_bp;
1039 	void *data;
1040 	uint64_t size;
1041 	uint64_t bufsize;
1042 	int compress = BP_GET_COMPRESS(bp);
1043 
1044 	ASSERT(compress != ZIO_COMPRESS_OFF);
1045 
1046 	zio_pop_transform(zio, &data, &size, &bufsize);
1047 
1048 	if (zio_decompress_data(compress, data, size,
1049 	    zio->io_data, zio->io_size))
1050 		zio->io_error = EIO;
1051 
1052 	zio_buf_free(data, bufsize);
1053 
1054 	zio_next_stage(zio);
1055 }
1056 
1057 /*
1058  * ==========================================================================
1059  * Gang block support
1060  * ==========================================================================
1061  */
1062 static void
1063 zio_gang_pipeline(zio_t *zio)
1064 {
1065 	/*
1066 	 * By default, the pipeline assumes that we're dealing with a gang
1067 	 * block.  If we're not, strip out any gang-specific stages.
1068 	 */
1069 	if (!BP_IS_GANG(zio->io_bp))
1070 		zio->io_pipeline &= ~ZIO_GANG_STAGES;
1071 
1072 	zio_next_stage(zio);
1073 }
1074 
1075 static void
1076 zio_gang_byteswap(zio_t *zio)
1077 {
1078 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1079 
1080 	if (BP_SHOULD_BYTESWAP(zio->io_bp))
1081 		byteswap_uint64_array(zio->io_data, zio->io_size);
1082 }
1083 
1084 static void
1085 zio_get_gang_header(zio_t *zio)
1086 {
1087 	blkptr_t *bp = zio->io_bp;
1088 	uint64_t gsize = SPA_GANGBLOCKSIZE;
1089 	void *gbuf = zio_buf_alloc(gsize);
1090 
1091 	ASSERT(BP_IS_GANG(bp));
1092 
1093 	zio_push_transform(zio, gbuf, gsize, gsize);
1094 
1095 	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
1096 	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
1097 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1098 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
1099 
1100 	zio_wait_children_done(zio);
1101 }
1102 
1103 static void
1104 zio_read_gang_members(zio_t *zio)
1105 {
1106 	zio_gbh_phys_t *gbh;
1107 	uint64_t gsize, gbufsize, loff, lsize;
1108 	int i;
1109 
1110 	ASSERT(BP_IS_GANG(zio->io_bp));
1111 
1112 	zio_gang_byteswap(zio);
1113 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1114 
1115 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1116 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1117 		lsize = BP_GET_PSIZE(gbp);
1118 
1119 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1120 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1121 		ASSERT3U(loff + lsize, <=, zio->io_size);
1122 		ASSERT(i < SPA_GBH_NBLKPTRS);
1123 		ASSERT(!BP_IS_HOLE(gbp));
1124 
1125 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
1126 		    (char *)zio->io_data + loff, lsize, NULL, NULL,
1127 		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1128 		    &zio->io_bookmark));
1129 	}
1130 
1131 	zio_buf_free(gbh, gbufsize);
1132 	zio_wait_children_done(zio);
1133 }
1134 
1135 static void
1136 zio_rewrite_gang_members(zio_t *zio)
1137 {
1138 	zio_gbh_phys_t *gbh;
1139 	uint64_t gsize, gbufsize, loff, lsize;
1140 	int i;
1141 
1142 	ASSERT(BP_IS_GANG(zio->io_bp));
1143 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1144 
1145 	zio_gang_byteswap(zio);
1146 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1147 
1148 	ASSERT(gsize == gbufsize);
1149 
1150 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1151 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1152 		lsize = BP_GET_PSIZE(gbp);
1153 
1154 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1155 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1156 		ASSERT3U(loff + lsize, <=, zio->io_size);
1157 		ASSERT(i < SPA_GBH_NBLKPTRS);
1158 		ASSERT(!BP_IS_HOLE(gbp));
1159 
1160 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
1161 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
1162 		    NULL, NULL, zio->io_priority, zio->io_flags,
1163 		    &zio->io_bookmark));
1164 	}
1165 
1166 	zio_push_transform(zio, gbh, gsize, gbufsize);
1167 	zio_wait_children_ready(zio);
1168 }
1169 
1170 static void
1171 zio_free_gang_members(zio_t *zio)
1172 {
1173 	zio_gbh_phys_t *gbh;
1174 	uint64_t gsize, gbufsize;
1175 	int i;
1176 
1177 	ASSERT(BP_IS_GANG(zio->io_bp));
1178 
1179 	zio_gang_byteswap(zio);
1180 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1181 
1182 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1183 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1184 
1185 		if (BP_IS_HOLE(gbp))
1186 			continue;
1187 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
1188 		    gbp, NULL, NULL));
1189 	}
1190 
1191 	zio_buf_free(gbh, gbufsize);
1192 	zio_next_stage(zio);
1193 }
1194 
1195 static void
1196 zio_claim_gang_members(zio_t *zio)
1197 {
1198 	zio_gbh_phys_t *gbh;
1199 	uint64_t gsize, gbufsize;
1200 	int i;
1201 
1202 	ASSERT(BP_IS_GANG(zio->io_bp));
1203 
1204 	zio_gang_byteswap(zio);
1205 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1206 
1207 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1208 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1209 		if (BP_IS_HOLE(gbp))
1210 			continue;
1211 		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
1212 		    gbp, NULL, NULL));
1213 	}
1214 
1215 	zio_buf_free(gbh, gbufsize);
1216 	zio_next_stage(zio);
1217 }
1218 
1219 static void
1220 zio_write_allocate_gang_member_done(zio_t *zio)
1221 {
1222 	zio_t *pio = zio->io_parent;
1223 	dva_t *cdva = zio->io_bp->blk_dva;
1224 	dva_t *pdva = pio->io_bp->blk_dva;
1225 	uint64_t asize;
1226 	int d;
1227 
1228 	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
1229 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1230 	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1231 	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1232 
1233 	mutex_enter(&pio->io_lock);
1234 	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
1235 		ASSERT(DVA_GET_GANG(&pdva[d]));
1236 		asize = DVA_GET_ASIZE(&pdva[d]);
1237 		asize += DVA_GET_ASIZE(&cdva[d]);
1238 		DVA_SET_ASIZE(&pdva[d], asize);
1239 	}
1240 	mutex_exit(&pio->io_lock);
1241 }
1242 
1243 static void
1244 zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
1245 {
1246 	blkptr_t *bp = zio->io_bp;
1247 	dva_t *dva = bp->blk_dva;
1248 	spa_t *spa = zio->io_spa;
1249 	zio_gbh_phys_t *gbh;
1250 	uint64_t txg = zio->io_txg;
1251 	uint64_t resid = zio->io_size;
1252 	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
1253 	uint64_t gsize, loff, lsize;
1254 	uint32_t gbps_left;
1255 	int ndvas = zio->io_ndvas;
1256 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1257 	int error;
1258 	int i, d;
1259 
1260 	gsize = SPA_GANGBLOCKSIZE;
1261 	gbps_left = SPA_GBH_NBLKPTRS;
1262 
1263 	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
1264 	    B_FALSE);
1265 	if (error == ENOSPC)
1266 		panic("can't allocate gang block header");
1267 	ASSERT(error == 0);
1268 
1269 	for (d = 0; d < gbh_ndvas; d++)
1270 		DVA_SET_GANG(&dva[d], 1);
1271 
1272 	bp->blk_birth = txg;
1273 
1274 	gbh = zio_buf_alloc(gsize);
1275 	bzero(gbh, gsize);
1276 
1277 	/* We need to test multi-level gang blocks */
1278 	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
1279 		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
1280 
1281 	for (loff = 0, i = 0; loff != zio->io_size;
1282 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
1283 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1284 		dva = gbp->blk_dva;
1285 
1286 		ASSERT(gbps_left != 0);
1287 		maxalloc = MIN(maxalloc, resid);
1288 
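		/*
		 * Shrink maxalloc until an allocation of that size succeeds
		 * or the remaining data no longer fits in the remaining
		 * gang block pointers at that size.
		 */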
1289 		while (resid <= maxalloc * gbps_left) {
1290 			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
1291 			    txg, bp, B_FALSE);
1292 			if (error == 0)
1293 				break;
1294 			ASSERT3U(error, ==, ENOSPC);
1295 			if (maxalloc == SPA_MINBLOCKSIZE)
1296 				panic("really out of space");
1297 			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
1298 		}
1299 
1300 		if (resid <= maxalloc * gbps_left) {
1301 			lsize = maxalloc;
1302 			BP_SET_LSIZE(gbp, lsize);
1303 			BP_SET_PSIZE(gbp, lsize);
1304 			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
1305 			gbp->blk_birth = txg;
1306 			zio_nowait(zio_rewrite(zio, spa,
1307 			    zio->io_checksum, txg, gbp,
1308 			    (char *)zio->io_data + loff, lsize,
1309 			    zio_write_allocate_gang_member_done, NULL,
1310 			    zio->io_priority, zio->io_flags,
1311 			    &zio->io_bookmark));
1312 		} else {
1313 			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
1314 			ASSERT(lsize != SPA_MINBLOCKSIZE);
1315 			zio_nowait(zio_write_allocate(zio, spa,
1316 			    zio->io_checksum, txg, gbp,
1317 			    (char *)zio->io_data + loff, lsize,
1318 			    zio_write_allocate_gang_member_done, NULL,
1319 			    zio->io_priority, zio->io_flags));
1320 		}
1321 	}
1322 
1323 	ASSERT(resid == 0 && loff == zio->io_size);
1324 
1325 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
1326 
1327 	zio_push_transform(zio, gbh, gsize, gsize);
1328 	/*
1329 	 * As much as we'd like this to be zio_wait_children_ready(),
1330 	 * updating our ASIZE doesn't happen until the io_done callback,
1331 	 * so we have to wait for that to finish in order for our BP
1332 	 * to be stable.
1333 	 */
1334 	zio_wait_children_done(zio);
1335 }
1336 
1337 /*
1338  * ==========================================================================
1339  * Allocate and free blocks
1340  * ==========================================================================
1341  */
1342 static void
1343 zio_dva_allocate(zio_t *zio)
1344 {
1345 	spa_t *spa = zio->io_spa;
1346 	metaslab_class_t *mc = spa->spa_normal_class;
1347 	blkptr_t *bp = zio->io_bp;
1348 	int error;
1349 
1350 	ASSERT(BP_IS_HOLE(bp));
1351 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1352 	ASSERT3U(zio->io_ndvas, >, 0);
1353 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
1354 
1355 	/* For testing, make some blocks above a certain size be gang blocks */
1356 	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
1357 		zio_write_allocate_gang_members(zio, mc);
1358 		return;
1359 	}
1360 
1361 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1362 
1363 	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
1364 	    zio->io_txg, NULL, B_FALSE);
1365 
1366 	if (error == 0) {
1367 		bp->blk_birth = zio->io_txg;
1368 	} else if (error == ENOSPC) {
1369 		if (zio->io_size == SPA_MINBLOCKSIZE)
1370 			panic("really, truly out of space");
1371 		zio_write_allocate_gang_members(zio, mc);
1372 		return;
1373 	} else {
1374 		zio->io_error = error;
1375 	}
1376 	zio_next_stage(zio);
1377 }
1378 
1379 static void
1380 zio_dva_free(zio_t *zio)
1381 {
1382 	blkptr_t *bp = zio->io_bp;
1383 
1384 	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
1385 
1386 	BP_ZERO(bp);
1387 
1388 	zio_next_stage(zio);
1389 }
1390 
1391 static void
1392 zio_dva_claim(zio_t *zio)
1393 {
1394 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1395 
1396 	zio_next_stage(zio);
1397 }
1398 
1399 /*
1400  * ==========================================================================
1401  * Read and write to physical devices
1402  * ==========================================================================
1403  */
1404 
1405 static void
1406 zio_vdev_io_start(zio_t *zio)
1407 {
1408 	vdev_t *vd = zio->io_vd;
1409 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1410 	blkptr_t *bp = zio->io_bp;
1411 	uint64_t align;
1412 
1413 	if (vd == NULL) {
1414 		/* The mirror_ops handle multiple DVAs in a single BP */
1415 		vdev_mirror_ops.vdev_op_io_start(zio);
1416 		return;
1417 	}
1418 
1419 	align = 1ULL << tvd->vdev_ashift;
1420 
1421 	if (zio->io_retries == 0 && vd == tvd)
1422 		zio->io_flags |= ZIO_FLAG_FAILFAST;
1423 
1424 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
1425 	    vd->vdev_children == 0) {
1426 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
1427 		zio->io_offset += VDEV_LABEL_START_SIZE;
1428 	}
1429 
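	/*
	 * If the I/O size isn't a multiple of the device's sector size
	 * (ashift), pad it to alignment in a scratch buffer; the SUBBLOCK
	 * flag tells zio_vdev_io_assess() to copy the result back (for
	 * reads) and free the scratch buffer.
	 */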
1430 	if (P2PHASE(zio->io_size, align) != 0) {
1431 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
1432 		char *abuf = zio_buf_alloc(asize);
1433 		ASSERT(vd == tvd);
1434 		if (zio->io_type == ZIO_TYPE_WRITE) {
1435 			bcopy(zio->io_data, abuf, zio->io_size);
1436 			bzero(abuf + zio->io_size, asize - zio->io_size);
1437 		}
1438 		zio_push_transform(zio, abuf, asize, asize);
1439 		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
1440 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
1441 	}
1442 
1443 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
1444 	ASSERT(P2PHASE(zio->io_size, align) == 0);
1445 	ASSERT(bp == NULL ||
1446 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
1447 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1448 
1449 	vdev_io_start(zio);
1450 
1451 	/* zio_next_stage_async() gets called from io completion interrupt */
1452 }
1453 
1454 static void
1455 zio_vdev_io_done(zio_t *zio)
1456 {
1457 	if (zio->io_vd == NULL)
1458 		/* The mirror_ops handle multiple DVAs in a single BP */
1459 		vdev_mirror_ops.vdev_op_io_done(zio);
1460 	else
1461 		vdev_io_done(zio);
1462 }
1463 
1464 /* XXPOLICY */
1465 boolean_t
1466 zio_should_retry(zio_t *zio)
1467 {
1468 	vdev_t *vd = zio->io_vd;
1469 
1470 	if (zio->io_error == 0)
1471 		return (B_FALSE);
1472 	if (zio->io_delegate_list != NULL)
1473 		return (B_FALSE);
1474 	if (vd && vd != vd->vdev_top)
1475 		return (B_FALSE);
1476 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
1477 		return (B_FALSE);
1478 	if (zio->io_retries > 0)
1479 		return (B_FALSE);
1480 
1481 	return (B_TRUE);
1482 }
1483 
1484 static void
1485 zio_vdev_io_assess(zio_t *zio)
1486 {
1487 	vdev_t *vd = zio->io_vd;
1488 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1489 
1490 	ASSERT(zio->io_vsd == NULL);
1491 
1492 	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
1493 		void *abuf;
1494 		uint64_t asize;
1495 		ASSERT(vd == tvd);
1496 		zio_pop_transform(zio, &abuf, &asize, &asize);
1497 		if (zio->io_type == ZIO_TYPE_READ)
1498 			bcopy(abuf, zio->io_data, zio->io_size);
1499 		zio_buf_free(abuf, asize);
1500 		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
1501 	}
1502 
1503 	if (zio_injection_enabled && !zio->io_error)
1504 		zio->io_error = zio_handle_fault_injection(zio, EIO);
1505 
1506 	/*
1507 	 * If the I/O failed, determine whether we should attempt to retry it.
1508 	 */
1509 	/* XXPOLICY */
1510 	if (zio_should_retry(zio)) {
1511 		ASSERT(tvd == vd);
1512 
1513 		zio->io_retries++;
1514 		zio->io_error = 0;
1515 		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
1516 		    ZIO_FLAG_CONFIG_GRABBED;
1517 		/* XXPOLICY */
1518 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
1519 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1520 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
1521 
1522 		dprintf("retry #%d for %s to %s offset %llx\n",
1523 		    zio->io_retries, zio_type_name[zio->io_type],
1524 		    vdev_description(vd), zio->io_offset);
1525 
1526 		zio_next_stage_async(zio);
1527 		return;
1528 	}
1529 
1530 	zio_next_stage(zio);
1531 }
1532 
1533 void
1534 zio_vdev_io_reissue(zio_t *zio)
1535 {
1536 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1537 	ASSERT(zio->io_error == 0);
1538 
1539 	zio->io_stage--;
1540 }
1541 
1542 void
1543 zio_vdev_io_redone(zio_t *zio)
1544 {
1545 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1546 
1547 	zio->io_stage--;
1548 }
1549 
1550 void
1551 zio_vdev_io_bypass(zio_t *zio)
1552 {
1553 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1554 	ASSERT(zio->io_error == 0);
1555 
1556 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1557 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1558 }
1559 
1560 /*
1561  * ==========================================================================
1562  * Generate and verify checksums
1563  * ==========================================================================
1564  */
1565 static void
1566 zio_checksum_generate(zio_t *zio)
1567 {
1568 	int checksum = zio->io_checksum;
1569 	blkptr_t *bp = zio->io_bp;
1570 
1571 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1572 
1573 	BP_SET_CHECKSUM(bp, checksum);
1574 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1575 
1576 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
1577 
1578 	zio_next_stage(zio);
1579 }
1580 
1581 static void
1582 zio_gang_checksum_generate(zio_t *zio)
1583 {
1584 	zio_cksum_t zc;
1585 	zio_gbh_phys_t *gbh = zio->io_data;
1586 
1587 	ASSERT(BP_IS_GANG(zio->io_bp));
1588 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1589 
1590 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
1591 
1592 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
1593 
1594 	zio_next_stage(zio);
1595 }
1596 
1597 static void
1598 zio_checksum_verify(zio_t *zio)
1599 {
1600 	if (zio->io_bp != NULL) {
1601 		zio->io_error = zio_checksum_error(zio);
1602 		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1603 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1604 			    zio->io_spa, zio->io_vd, zio, 0, 0);
1605 	}
1606 
1607 	zio_next_stage(zio);
1608 }
1609 
1610 /*
1611  * Called by RAID-Z to ensure we don't compute the checksum twice.
1612  */
1613 void
1614 zio_checksum_verified(zio_t *zio)
1615 {
1616 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
1617 }
1618 
1619 /*
1620  * Set the external verifier for a gang block based on stuff in the bp
1621  */
1622 void
1623 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
1624 {
1625 	blkptr_t *bp = zio->io_bp;
1626 
1627 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
1628 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
1629 	zcp->zc_word[2] = bp->blk_birth;
1630 	zcp->zc_word[3] = 0;
1631 }
1632 
1633 /*
1634  * ==========================================================================
1635  * Define the pipeline
1636  * ==========================================================================
1637  */
1638 typedef void zio_pipe_stage_t(zio_t *zio);
1639 
1640 static void
1641 zio_badop(zio_t *zio)
1642 {
1643 	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
1644 }
1645 
1646 zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
1647 	zio_badop,
1648 	zio_wait_children_ready,
1649 	zio_write_compress,
1650 	zio_checksum_generate,
1651 	zio_gang_pipeline,
1652 	zio_get_gang_header,
1653 	zio_rewrite_gang_members,
1654 	zio_free_gang_members,
1655 	zio_claim_gang_members,
1656 	zio_dva_allocate,
1657 	zio_dva_free,
1658 	zio_dva_claim,
1659 	zio_gang_checksum_generate,
1660 	zio_ready,
1661 	zio_vdev_io_start,
1662 	zio_vdev_io_done,
1663 	zio_vdev_io_assess,
1664 	zio_wait_children_done,
1665 	zio_checksum_verify,
1666 	zio_read_gang_members,
1667 	zio_read_decompress,
1668 	zio_done,
1669 	zio_badop
1670 };
1671 
1672 /*
1673  * Move an I/O to the next stage of the pipeline and execute that stage.
1674  * There's no locking on io_stage because there's no legitimate way for
1675  * multiple threads to be attempting to process the same I/O.
1676  */
1677 void
1678 zio_next_stage(zio_t *zio)
1679 {
1680 	uint32_t pipeline = zio->io_pipeline;
1681 
1682 	ASSERT(!MUTEX_HELD(&zio->io_lock));
1683 
1684 	if (zio->io_error) {
1685 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
1686 		    zio, vdev_description(zio->io_vd),
1687 		    zio->io_offset, zio->io_stage, zio->io_error);
1688 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
1689 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
1690 	}
1691 
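	/* Advance io_stage to the next stage enabled in the pipeline mask. */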
1692 	while (((1U << ++zio->io_stage) & pipeline) == 0)
1693 		continue;
1694 
1695 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
1696 	ASSERT(zio->io_stalled == 0);
1697 
1698 	/*
1699 	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
1700 	 */
1701 	if (((1U << zio->io_stage) & zio->io_async_stages) &&
1702 	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
1703 	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
1704 		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
1705 		(void) taskq_dispatch(tq,
1706 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
1707 	} else {
1708 		zio_pipeline[zio->io_stage](zio);
1709 	}
1710 }
1711 
1712 void
1713 zio_next_stage_async(zio_t *zio)
1714 {
1715 	taskq_t *tq;
1716 	uint32_t pipeline = zio->io_pipeline;
1717 
1718 	ASSERT(!MUTEX_HELD(&zio->io_lock));
1719 
1720 	if (zio->io_error) {
1721 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
1722 		    zio, vdev_description(zio->io_vd),
1723 		    zio->io_offset, zio->io_stage, zio->io_error);
1724 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
1725 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
1726 	}
1727 
1728 	while (((1U << ++zio->io_stage) & pipeline) == 0)
1729 		continue;
1730 
1731 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
1732 	ASSERT(zio->io_stalled == 0);
1733 
1734 	/*
1735 	 * For performance, we'll probably want two sets of task queues:
1736 	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
1737 	 * part is for read performance: since we have to make a pass over
1738 	 * the data to checksum it anyway, we want to do this on the same CPU
1739 	 * that issued the read, because (assuming CPU scheduling affinity)
1740 	 * that thread is probably still there.  Getting this optimization
1741 	 * right avoids performance-hostile cache-to-cache transfers.
1742 	 *
1743 	 * Note that having two sets of task queues is also necessary for
1744 	 * correctness: if all of the issue threads get bogged down waiting
1745 	 * for dependent reads (e.g. metaslab freelist) to complete, then
1746 	 * there won't be any threads available to service I/O completion
1747 	 * interrupts.
1748 	 */
1749 	if ((1U << zio->io_stage) & zio->io_async_stages) {
1750 		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
1751 			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
1752 		else
1753 			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
1754 		(void) taskq_dispatch(tq,
1755 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
1756 	} else {
1757 		zio_pipeline[zio->io_stage](zio);
1758 	}
1759 }
1760 
1761 static boolean_t
1762 zio_alloc_should_fail(void)
1763 {
1764 	static uint16_t	allocs = 0;
1765 
1766 	return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
1767 }
1768 
1769 /*
1770  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
1771  */
1772 int
1773 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
1774     uint64_t txg)
1775 {
1776 	int error;
1777 
1778 	spa_config_enter(spa, RW_READER, FTAG);
1779 
1780 	if (zio_zil_fail_shift && zio_alloc_should_fail()) {
1781 		spa_config_exit(spa, FTAG);
1782 		return (ENOSPC);
1783 	}
1784 
1785 	/*
1786 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
1787 	 * We use that as a hint for which vdev to allocate from next.
1788 	 */
1789 	error = metaslab_alloc(spa, spa->spa_log_class, size,
1790 	    new_bp, 1, txg, old_bp, B_TRUE);
1791 
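	/* If allocation from the log class fails, fall back to the normal class. */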
1792 	if (error)
1793 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
1794 		    new_bp, 1, txg, old_bp, B_TRUE);
1795 
1796 	if (error == 0) {
1797 		BP_SET_LSIZE(new_bp, size);
1798 		BP_SET_PSIZE(new_bp, size);
1799 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
1800 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
1801 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
1802 		BP_SET_LEVEL(new_bp, 0);
1803 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
1804 		new_bp->blk_birth = txg;
1805 	}
1806 
1807 	spa_config_exit(spa, FTAG);
1808 
1809 	return (error);
1810 }
1811 
1812 /*
1813  * Free an intent log block.  We know it can't be a gang block, so there's
1814  * nothing to do except metaslab_free() it.
1815  */
1816 void
1817 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
1818 {
1819 	ASSERT(!BP_IS_GANG(bp));
1820 
1821 	spa_config_enter(spa, RW_READER, FTAG);
1822 
1823 	metaslab_free(spa, bp, txg, B_FALSE);
1824 
1825 	spa_config_exit(spa, FTAG);
1826 }
1827 
1828 /*
1829  * start an async flush of the write cache for this vdev
1830  */
1831 void
1832 zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
1833 {
1834 	vdev_t *vd;
1835 
1836 	/*
1837 	 * Lock out configuration changes.
1838 	 */
1839 	spa_config_enter(spa, RW_READER, FTAG);
1840 
1841 	if (*zio == NULL)
1842 		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
1843 
1844 	vd = vdev_lookup_top(spa, vdev);
1845 	ASSERT(vd);
1846 
1847 	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
1848 	    NULL, NULL, ZIO_PRIORITY_NOW,
1849 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
1850 
1851 	spa_config_exit(spa, FTAG);
1852 }
1853