xref: /titanic_52/usr/src/uts/common/fs/zfs/zil.c (revision 5c88ba20fc79ecf19255b4a04f03d77630b6d0e7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/zap.h>
33 #include <sys/arc.h>
34 #include <sys/stat.h>
35 #include <sys/resource.h>
36 #include <sys/zil.h>
37 #include <sys/zil_impl.h>
38 #include <sys/dsl_dataset.h>
39 #include <sys/vdev.h>
40 
41 
42 /*
43  * The zfs intent log (ZIL) saves transaction records of system calls
44  * that change the file system in memory with enough information
45  * to be able to replay them. These are stored in memory until
46  * either the DMU transaction group (txg) commits them to the stable pool
47  * and they can be discarded, or they are flushed to the stable log
48  * (also in the pool) due to a fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
51  *
52  * There is one ZIL per file system. Its on-disk (pool) format consists
53  * of 3 parts:
54  *
55  * 	- ZIL header
56  * 	- ZIL blocks
57  * 	- ZIL records
58  *
59  * A log record holds a system call transaction. Log blocks can
60  * hold many log records and the blocks are chained together.
61  * Each ZIL block contains a block pointer (blkptr_t) to the next
62  * ZIL block in the chain. The ZIL header points to the first
63  * block in the chain. Note there is not a fixed place in the pool
64  * to hold blocks. They are dynamically allocated and freed as
65  * needed from the blocks available. Figure X shows the ZIL structure:
66  */
67 
/*
 * Global ZIL switches.  They are not per-pool: every pool on the
 * system sees the same values.
 */

/* disable intent logging */
int zil_disable = 0;

/* make every transaction synchronous */
int zil_always = 0;

/* at pool open, just throw everything away */
int zil_purge = 0;

/* don't flush write cache buffers on disks */
int zil_noflush = 0;
75 
/*
 * kmem cache backing log write buffer (lwb) headers.  It is created in
 * zil_init(), destroyed in zil_fini(), and every lwb in this file is
 * allocated from and returned to it.
 */
static kmem_cache_t *zil_lwb_cache;
77 
78 static int
79 zil_dva_compare(const void *x1, const void *x2)
80 {
81 	const dva_t *dva1 = x1;
82 	const dva_t *dva2 = x2;
83 
84 	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
85 		return (-1);
86 	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
87 		return (1);
88 
89 	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
90 		return (-1);
91 	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
92 		return (1);
93 
94 	return (0);
95 }
96 
97 static void
98 zil_dva_tree_init(avl_tree_t *t)
99 {
100 	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
101 	    offsetof(zil_dva_node_t, zn_node));
102 }
103 
104 static void
105 zil_dva_tree_fini(avl_tree_t *t)
106 {
107 	zil_dva_node_t *zn;
108 	void *cookie = NULL;
109 
110 	while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
111 		kmem_free(zn, sizeof (zil_dva_node_t));
112 
113 	avl_destroy(t);
114 }
115 
116 static int
117 zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
118 {
119 	zil_dva_node_t *zn;
120 	avl_index_t where;
121 
122 	if (avl_find(t, dva, &where) != NULL)
123 		return (EEXIST);
124 
125 	zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
126 	zn->zn_dva = *dva;
127 	avl_insert(t, zn, where);
128 
129 	return (0);
130 }
131 
132 /*
133  * Read a log block, make sure it's valid, and byteswap it if necessary.
134  */
135 static int
136 zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
137 {
138 	uint64_t blksz = BP_GET_LSIZE(bp);
139 	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
140 	zio_cksum_t cksum;
141 	int error;
142 
143 	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
144 	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
145 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
146 	if (error) {
147 		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
148 		    zilog, bp, error);
149 		return (error);
150 	}
151 
152 	if (BP_SHOULD_BYTESWAP(bp))
153 		byteswap_uint64_array(buf, blksz);
154 
155 	/*
156 	 * Sequence numbers should be... sequential.  The checksum verifier for
157 	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
158 	 */
159 	cksum = bp->blk_cksum;
160 	cksum.zc_word[3]++;
161 	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
162 		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
163 		return (ESTALE);
164 	}
165 
166 	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
167 		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
168 		return (ENOENT);
169 	}
170 
171 	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
172 		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
173 		return (EOVERFLOW);
174 	}
175 
176 	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
177 
178 	return (0);
179 }
180 
181 /*
182  * Parse the intent log, and call parse_func for each valid record within.
183  */
184 void
185 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
186     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
187 {
188 	blkptr_t blk;
189 	char *lrbuf, *lrp;
190 	zil_trailer_t *ztp;
191 	int reclen, error;
192 
193 	blk = zilog->zl_header->zh_log;
194 	if (BP_IS_HOLE(&blk))
195 		return;
196 
197 	/*
198 	 * Starting at the block pointed to by zh_log we read the log chain.
199 	 * For each block in the chain we strongly check that block to
200 	 * ensure its validity.  We stop when an invalid block is found.
201 	 * For each block pointer in the chain we call parse_blk_func().
202 	 * For each record in each valid block we call parse_lr_func().
203 	 */
204 	zil_dva_tree_init(&zilog->zl_dva_tree);
205 	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
206 	for (;;) {
207 		error = zil_read_log_block(zilog, &blk, lrbuf);
208 
209 		if (parse_blk_func != NULL)
210 			parse_blk_func(zilog, &blk, arg, txg);
211 
212 		if (error)
213 			break;
214 
215 		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
216 		blk = ztp->zit_next_blk;
217 
218 		if (parse_lr_func == NULL)
219 			continue;
220 
221 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
222 			lr_t *lr = (lr_t *)lrp;
223 			reclen = lr->lrc_reclen;
224 			ASSERT3U(reclen, >=, sizeof (lr_t));
225 			parse_lr_func(zilog, lr, arg, txg);
226 		}
227 	}
228 	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
229 	zil_dva_tree_fini(&zilog->zl_dva_tree);
230 }
231 
232 /* ARGSUSED */
233 static void
234 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
235 {
236 	spa_t *spa = zilog->zl_spa;
237 	int err;
238 
239 	dprintf_bp(bp, "first_txg %llu: ", first_txg);
240 
241 	/*
242 	 * Claim log block if not already committed and not already claimed.
243 	 */
244 	if (bp->blk_birth >= first_txg &&
245 	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
246 		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
247 		ASSERT(err == 0);
248 	}
249 }
250 
251 static void
252 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
253 {
254 	if (lrc->lrc_txtype == TX_WRITE) {
255 		lr_write_t *lr = (lr_write_t *)lrc;
256 		zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
257 	}
258 }
259 
260 /* ARGSUSED */
261 static void
262 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
263 {
264 	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
265 }
266 
267 static void
268 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
269 {
270 	/*
271 	 * If we previously claimed it, we need to free it.
272 	 */
273 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
274 		lr_write_t *lr = (lr_write_t *)lrc;
275 		blkptr_t *bp = &lr->lr_blkptr;
276 		if (bp->blk_birth >= claim_txg &&
277 		    !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
278 			(void) arc_free(NULL, zilog->zl_spa,
279 			    dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
280 		}
281 	}
282 }
283 
284 /*
285  * Create an on-disk intent log.
286  */
287 static void
288 zil_create(zilog_t *zilog)
289 {
290 	lwb_t *lwb;
291 	uint64_t txg;
292 	dmu_tx_t *tx;
293 	blkptr_t blk;
294 	int error;
295 
296 	ASSERT(zilog->zl_header->zh_claim_txg == 0);
297 	ASSERT(zilog->zl_header->zh_replay_seq == 0);
298 
299 	/*
300 	 * Initialize the log header block.
301 	 */
302 	tx = dmu_tx_create(zilog->zl_os);
303 	(void) dmu_tx_assign(tx, TXG_WAIT);
304 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
305 	txg = dmu_tx_get_txg(tx);
306 
307 	/*
308 	 * Allocate the first log block and assign its checksum verifier.
309 	 */
310 	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
311 	    ZIL_MIN_BLKSZ, &blk, txg);
312 	if (error == 0) {
313 		ZIO_SET_CHECKSUM(&blk.blk_cksum,
314 		    spa_get_random(-1ULL), spa_get_random(-1ULL),
315 		    dmu_objset_id(zilog->zl_os), 1ULL);
316 
317 		/*
318 		 * Allocate a log write buffer (lwb) for the first log block.
319 		 */
320 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
321 		lwb->lwb_zilog = zilog;
322 		lwb->lwb_blk = blk;
323 		lwb->lwb_nused = 0;
324 		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
325 		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
326 		lwb->lwb_max_txg = txg;
327 		lwb->lwb_seq = 0;
328 		lwb->lwb_state = UNWRITTEN;
329 		mutex_enter(&zilog->zl_lock);
330 		list_insert_tail(&zilog->zl_lwb_list, lwb);
331 		mutex_exit(&zilog->zl_lock);
332 	}
333 
334 	dmu_tx_commit(tx);
335 	txg_wait_synced(zilog->zl_dmu_pool, txg);
336 }
337 
338 /*
339  * In one tx, free all log blocks and clear the log header.
340  */
341 void
342 zil_destroy(zilog_t *zilog)
343 {
344 	dmu_tx_t *tx;
345 	uint64_t txg;
346 
347 	mutex_enter(&zilog->zl_destroy_lock);
348 
349 	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
350 		mutex_exit(&zilog->zl_destroy_lock);
351 		return;
352 	}
353 
354 	tx = dmu_tx_create(zilog->zl_os);
355 	(void) dmu_tx_assign(tx, TXG_WAIT);
356 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
357 	txg = dmu_tx_get_txg(tx);
358 
359 	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
360 	    zilog->zl_header->zh_claim_txg);
361 	zilog->zl_destroy_txg = txg;
362 
363 	dmu_tx_commit(tx);
364 	txg_wait_synced(zilog->zl_dmu_pool, txg);
365 
366 	mutex_exit(&zilog->zl_destroy_lock);
367 }
368 
369 void
370 zil_claim(char *osname, void *txarg)
371 {
372 	dmu_tx_t *tx = txarg;
373 	uint64_t first_txg = dmu_tx_get_txg(tx);
374 	zilog_t *zilog;
375 	zil_header_t *zh;
376 	objset_t *os;
377 	int error;
378 
379 	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
380 	if (error) {
381 		cmn_err(CE_WARN, "can't process intent log for %s", osname);
382 		return;
383 	}
384 
385 	zilog = dmu_objset_zil(os);
386 	zh = zilog->zl_header;
387 
388 	/*
389 	 * Claim all log blocks if we haven't already done so.
390 	 */
391 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
392 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
393 		zh->zh_claim_txg = first_txg;
394 		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
395 		    tx, first_txg);
396 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
397 	}
398 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
399 	dmu_objset_close(os);
400 }
401 
402 void
403 zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq)
404 {
405 	zil_vdev_t *zv;
406 
407 	if (zil_noflush)
408 		return;
409 
410 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
411 	zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
412 	zv->vdev = vdev;
413 	zv->seq = seq;
414 	list_insert_tail(&zilog->zl_vdev_list, zv);
415 }
416 
417 
418 void
419 zil_flush_vdevs(zilog_t *zilog, uint64_t seq)
420 {
421 	vdev_t *vd;
422 	zil_vdev_t *zv, *zv2;
423 	zio_t *zio;
424 	spa_t *spa;
425 	uint64_t vdev;
426 
427 	if (zil_noflush)
428 		return;
429 
430 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
431 
432 	spa = zilog->zl_spa;
433 	zio = NULL;
434 
435 	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL &&
436 	    zv->seq <= seq) {
437 		vdev = zv->vdev;
438 		list_remove(&zilog->zl_vdev_list, zv);
439 		kmem_free(zv, sizeof (zil_vdev_t));
440 
441 		/*
442 		 * remove all chained entries <= seq with same vdev
443 		 */
444 		zv = list_head(&zilog->zl_vdev_list);
445 		while (zv && zv->seq <= seq) {
446 			zv2 = list_next(&zilog->zl_vdev_list, zv);
447 			if (zv->vdev == vdev) {
448 				list_remove(&zilog->zl_vdev_list, zv);
449 				kmem_free(zv, sizeof (zil_vdev_t));
450 			}
451 			zv = zv2;
452 		}
453 
454 		/* flush the write cache for this vdev */
455 		mutex_exit(&zilog->zl_lock);
456 		if (zio == NULL)
457 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
458 		vd = vdev_lookup_top(spa, vdev);
459 		ASSERT(vd);
460 		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
461 		    NULL, NULL, ZIO_PRIORITY_NOW,
462 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
463 		mutex_enter(&zilog->zl_lock);
464 	}
465 
466 	/*
467 	 * Wait for all the flushes to complete.  Not all devices actually
468 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
469 	 */
470 	if (zio != NULL)
471 		(void) zio_wait(zio);
472 }
473 
474 /*
475  * Function called when a log block write completes
476  */
477 static void
478 zil_lwb_write_done(zio_t *zio)
479 {
480 	lwb_t *prev;
481 	lwb_t *lwb = zio->io_private;
482 	zilog_t *zilog = lwb->lwb_zilog;
483 	uint64_t max_seq;
484 
485 	/*
486 	 * Now that we've written this log block, we have a stable pointer
487 	 * to the next block in the chain, so it's OK to let the txg in
488 	 * which we allocated the next block sync.
489 	 */
490 	txg_rele_to_sync(&lwb->lwb_txgh);
491 
492 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
493 	mutex_enter(&zilog->zl_lock);
494 	lwb->lwb_buf = NULL;
495 	if (zio->io_error) {
496 		zilog->zl_log_error = B_TRUE;
497 		mutex_exit(&zilog->zl_lock);
498 		cv_broadcast(&zilog->zl_cv_seq);
499 		return;
500 	}
501 
502 	prev = list_prev(&zilog->zl_lwb_list, lwb);
503 	if (prev && prev->lwb_state != SEQ_COMPLETE) {
504 		/* There's an unwritten buffer in the chain before this one */
505 		lwb->lwb_state = SEQ_INCOMPLETE;
506 		mutex_exit(&zilog->zl_lock);
507 		return;
508 	}
509 
510 	max_seq = lwb->lwb_seq;
511 	lwb->lwb_state = SEQ_COMPLETE;
512 	/*
513 	 * We must also follow up the chain for already written buffers
514 	 * to see if we can set zl_ss_seq even higher.
515 	 */
516 	while (lwb = list_next(&zilog->zl_lwb_list, lwb)) {
517 		if (lwb->lwb_state != SEQ_INCOMPLETE)
518 			break;
519 		lwb->lwb_state = SEQ_COMPLETE;
520 		/* lwb_seq will be zero if we've written an empty buffer */
521 		if (lwb->lwb_seq) {
522 			ASSERT3U(max_seq, <, lwb->lwb_seq);
523 			max_seq = lwb->lwb_seq;
524 		}
525 	}
526 	zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
527 	mutex_exit(&zilog->zl_lock);
528 	cv_broadcast(&zilog->zl_cv_seq);
529 }
530 
531 /*
532  * Start a log block write and advance to the next log block.
533  * Calls are serialized.
534  */
535 static lwb_t *
536 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
537 {
538 	lwb_t *nlwb;
539 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
540 	uint64_t txg;
541 	uint64_t zil_blksz;
542 	int error;
543 
544 	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
545 
546 	/*
547 	 * Allocate the next block and save its address in this block
548 	 * before writing it in order to establish the log chain.
549 	 * Note that if the allocation of nlwb synced before we wrote
550 	 * the block that points at it (lwb), we'd leak it if we crashed.
551 	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
552 	 */
553 	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
554 	txg_rele_to_quiesce(&lwb->lwb_txgh);
555 
556 	/*
557 	 * Pick a ZIL blocksize based upon the size of the outstanding
558 	 * in-memory transactions, or if none the same size as the
559 	 * last block.
560 	 */
561 	if (zilog->zl_itx_list_sz) {
562 		zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp);
563 		zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
564 		if (zil_blksz > ZIL_MAX_BLKSZ)
565 			zil_blksz = ZIL_MAX_BLKSZ;
566 		zilog->zl_prev_blk_sz = zil_blksz;
567 	} else {
568 		zil_blksz = zilog->zl_prev_blk_sz;
569 	}
570 
571 	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
572 	    zil_blksz, &ztp->zit_next_blk, txg);
573 	if (error) {
574 		txg_rele_to_sync(&lwb->lwb_txgh);
575 		return (NULL);
576 	}
577 
578 	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
579 	ztp->zit_nused = lwb->lwb_nused;
580 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
581 	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
582 	ztp->zit_next_blk.blk_cksum.zc_word[3]++;
583 
584 	/*
585 	 * Allocate a new log write buffer (lwb).
586 	 */
587 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
588 
589 	nlwb->lwb_zilog = zilog;
590 	nlwb->lwb_blk = ztp->zit_next_blk;
591 	nlwb->lwb_nused = 0;
592 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
593 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
594 	nlwb->lwb_max_txg = txg;
595 	nlwb->lwb_seq = 0;
596 	nlwb->lwb_state = UNWRITTEN;
597 
598 	/*
599 	 * Put new lwb at the end of the log chain,
600 	 * and record the vdev for later flushing
601 	 */
602 	mutex_enter(&zilog->zl_lock);
603 	list_insert_tail(&zilog->zl_lwb_list, nlwb);
604 	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))),
605 	    lwb->lwb_seq);
606 	mutex_exit(&zilog->zl_lock);
607 
608 	/*
609 	 * write the old log block
610 	 */
611 	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
612 	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
613 	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
614 	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));
615 
616 	return (nlwb);
617 }
618 
619 static lwb_t *
620 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
621 {
622 	lr_t *lrc = &itx->itx_lr; /* common log record */
623 	uint64_t seq = lrc->lrc_seq;
624 	uint64_t txg = lrc->lrc_txg;
625 	uint64_t reclen = lrc->lrc_reclen;
626 	int error;
627 
628 	if (lwb == NULL)
629 		return (NULL);
630 	ASSERT(lwb->lwb_buf != NULL);
631 
632 	/*
633 	 * If it's a write, fetch the data or get its blkptr as appropriate.
634 	 */
635 	if (lrc->lrc_txtype == TX_WRITE) {
636 		lr_write_t *lr = (lr_write_t *)lrc;
637 		if (txg > spa_freeze_txg(zilog->zl_spa))
638 			txg_wait_synced(zilog->zl_dmu_pool, txg);
639 
640 		if (!itx->itx_data_copied &&
641 		    (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) {
642 			if (error != ENOENT && error != EALREADY) {
643 				txg_wait_synced(zilog->zl_dmu_pool, txg);
644 				mutex_enter(&zilog->zl_lock);
645 				zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
646 				zil_add_vdev(zilog,
647 				    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))),
648 				    seq);
649 				mutex_exit(&zilog->zl_lock);
650 				return (lwb);
651 			}
652 			mutex_enter(&zilog->zl_lock);
653 			zil_add_vdev(zilog,
654 			    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq);
655 			mutex_exit(&zilog->zl_lock);
656 			return (lwb);
657 		}
658 	}
659 
660 	/*
661 	 * If this record won't fit in the current log block, start a new one.
662 	 */
663 	if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
664 		lwb = zil_lwb_write_start(zilog, lwb);
665 		if (lwb == NULL)
666 			return (NULL);
667 		if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
668 			txg_wait_synced(zilog->zl_dmu_pool, txg);
669 			mutex_enter(&zilog->zl_lock);
670 			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
671 			mutex_exit(&zilog->zl_lock);
672 			return (lwb);
673 		}
674 	}
675 
676 	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
677 	lwb->lwb_nused += reclen;
678 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
679 	ASSERT3U(lwb->lwb_seq, <, seq);
680 	lwb->lwb_seq = seq;
681 	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
682 	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
683 
684 	return (lwb);
685 }
686 
687 itx_t *
688 zil_itx_create(int txtype, size_t lrsize)
689 {
690 	itx_t *itx;
691 
692 	lrsize = P2ROUNDUP(lrsize, sizeof (uint64_t));
693 
694 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
695 	itx->itx_lr.lrc_txtype = txtype;
696 	itx->itx_lr.lrc_reclen = lrsize;
697 	itx->itx_lr.lrc_seq = 0;	/* defensive */
698 
699 	return (itx);
700 }
701 
702 uint64_t
703 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
704 {
705 	uint64_t seq;
706 
707 	ASSERT(itx->itx_lr.lrc_seq == 0);
708 
709 	mutex_enter(&zilog->zl_lock);
710 	list_insert_tail(&zilog->zl_itx_list, itx);
711 	zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
712 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
713 	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
714 	mutex_exit(&zilog->zl_lock);
715 
716 	return (seq);
717 }
718 
719 /*
720  * Free up all in-memory intent log transactions that have now been synced.
721  */
722 static void
723 zil_itx_clean(zilog_t *zilog)
724 {
725 	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
726 	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
727 	uint64_t max_seq = 0;
728 	itx_t *itx;
729 
730 	mutex_enter(&zilog->zl_lock);
731 	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
732 	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
733 		list_remove(&zilog->zl_itx_list, itx);
734 		zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
735 		ASSERT3U(max_seq, <, itx->itx_lr.lrc_seq);
736 		max_seq = itx->itx_lr.lrc_seq;
737 		kmem_free(itx, offsetof(itx_t, itx_lr)
738 		    + itx->itx_lr.lrc_reclen);
739 	}
740 	if (max_seq > zilog->zl_ss_seq) {
741 		zilog->zl_ss_seq = max_seq;
742 		cv_broadcast(&zilog->zl_cv_seq);
743 	}
744 	mutex_exit(&zilog->zl_lock);
745 }
746 
747 void
748 zil_clean(zilog_t *zilog)
749 {
750 	/*
751 	 * Check for any log blocks that can be freed.
752 	 * Log blocks are only freed when the log block allocation and
753 	 * log records contained within are both known to be committed.
754 	 */
755 	mutex_enter(&zilog->zl_lock);
756 	if (list_head(&zilog->zl_itx_list) != NULL)
757 		(void) taskq_dispatch(zilog->zl_clean_taskq,
758 		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
759 	mutex_exit(&zilog->zl_lock);
760 }
761 
762 /*
763  * Push zfs transactions to stable storage up to the supplied sequence number.
764  */
765 void
766 zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
767 {
768 	uint64_t txg;
769 	uint64_t max_seq;
770 	uint64_t reclen;
771 	itx_t *itx;
772 	lwb_t *lwb;
773 	spa_t *spa;
774 
775 	if (zilog == NULL || seq == 0 ||
776 	    ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always))
777 		return;
778 
779 	spa = zilog->zl_spa;
780 	mutex_enter(&zilog->zl_lock);
781 
782 	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
783 
784 	for (;;) {
785 		if (zilog->zl_ss_seq >= seq) {	/* already on stable storage */
786 			cv_signal(&zilog->zl_cv_write);
787 			mutex_exit(&zilog->zl_lock);
788 			return;
789 		}
790 
791 		if (zilog->zl_writer == B_FALSE) /* no one writing, do it */
792 			break;
793 
794 		cv_wait(&zilog->zl_cv_write, &zilog->zl_lock);
795 	}
796 
797 	zilog->zl_writer = B_TRUE;
798 	max_seq = 0;
799 
800 	if (zilog->zl_suspend) {
801 		lwb = NULL;
802 	} else {
803 		lwb = list_tail(&zilog->zl_lwb_list);
804 		if (lwb == NULL) {
805 			mutex_exit(&zilog->zl_lock);
806 			zil_create(zilog);
807 			mutex_enter(&zilog->zl_lock);
808 			lwb = list_tail(&zilog->zl_lwb_list);
809 		}
810 	}
811 
812 	/*
813 	 * Loop through in-memory log transactions filling log blocks,
814 	 * until we reach the given sequence number and there's no more
815 	 * room in the write buffer.
816 	 */
817 	for (;;) {
818 		itx = list_head(&zilog->zl_itx_list);
819 		if (itx == NULL)
820 			break;
821 
822 		reclen = itx->itx_lr.lrc_reclen;
823 		if ((itx->itx_lr.lrc_seq > seq) &&
824 		    ((lwb == NULL) || (lwb->lwb_nused + reclen >
825 		    ZIL_BLK_DATA_SZ(lwb))))
826 			break;
827 
828 		list_remove(&zilog->zl_itx_list, itx);
829 		txg = itx->itx_lr.lrc_txg;
830 		ASSERT(txg);
831 
832 		mutex_exit(&zilog->zl_lock);
833 		if (txg > spa_last_synced_txg(spa) ||
834 		    txg > spa_freeze_txg(spa))
835 			lwb = zil_lwb_commit(zilog, itx, lwb);
836 		else
837 			max_seq = itx->itx_lr.lrc_seq;
838 		kmem_free(itx, offsetof(itx_t, itx_lr)
839 		    + itx->itx_lr.lrc_reclen);
840 		mutex_enter(&zilog->zl_lock);
841 		zilog->zl_itx_list_sz -= reclen;
842 	}
843 
844 	mutex_exit(&zilog->zl_lock);
845 
846 	/* write the last block out */
847 	if (lwb != NULL && lwb->lwb_nused != 0)
848 		lwb = zil_lwb_write_start(zilog, lwb);
849 
850 	/* wake up others waiting to start a write */
851 	mutex_enter(&zilog->zl_lock);
852 	zilog->zl_writer = B_FALSE;
853 	cv_signal(&zilog->zl_cv_write);
854 
855 	if (max_seq > zilog->zl_ss_seq) {
856 		zilog->zl_ss_seq = max_seq;
857 		cv_broadcast(&zilog->zl_cv_seq);
858 	}
859 	/*
860 	 * Wait if necessary for our seq to be committed.
861 	 */
862 	if (lwb) {
863 		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
864 			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
865 		zil_flush_vdevs(zilog, seq);
866 	}
867 	if (zilog->zl_log_error || lwb == NULL) {
868 		zilog->zl_log_error = 0;
869 		max_seq = zilog->zl_itx_seq;
870 		mutex_exit(&zilog->zl_lock);
871 		txg_wait_synced(zilog->zl_dmu_pool, 0);
872 		mutex_enter(&zilog->zl_lock);
873 		zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
874 		cv_broadcast(&zilog->zl_cv_seq);
875 	}
876 	mutex_exit(&zilog->zl_lock);
877 }
878 
879 /*
880  * Called in syncing context to free committed log blocks and update log header.
881  */
882 void
883 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
884 {
885 	uint64_t txg = dmu_tx_get_txg(tx);
886 	spa_t *spa = zilog->zl_spa;
887 	lwb_t *lwb;
888 
889 	ASSERT(zilog->zl_stop_sync == 0);
890 
891 	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
892 
893 	if (zilog->zl_destroy_txg == txg) {
894 		bzero(zilog->zl_header, sizeof (zil_header_t));
895 		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
896 		zilog->zl_destroy_txg = 0;
897 	}
898 
899 	mutex_enter(&zilog->zl_lock);
900 	for (;;) {
901 		lwb = list_head(&zilog->zl_lwb_list);
902 		if (lwb == NULL) {
903 			mutex_exit(&zilog->zl_lock);
904 			return;
905 		}
906 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
907 			break;
908 		list_remove(&zilog->zl_lwb_list, lwb);
909 		zio_free_blk(spa, &lwb->lwb_blk, txg);
910 		kmem_cache_free(zil_lwb_cache, lwb);
911 	}
912 	zilog->zl_header->zh_log = lwb->lwb_blk;
913 	mutex_exit(&zilog->zl_lock);
914 }
915 
916 void
917 zil_init(void)
918 {
919 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
920 	    sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0);
921 }
922 
/*
 * Module teardown: destroy the lwb kmem cache created by zil_init().
 */
void
zil_fini(void)
{
	kmem_cache_destroy(zil_lwb_cache);
}
928 
929 zilog_t *
930 zil_alloc(objset_t *os, zil_header_t *zh_phys)
931 {
932 	zilog_t *zilog;
933 
934 	zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
935 
936 	zilog->zl_header = zh_phys;
937 	zilog->zl_os = os;
938 	zilog->zl_spa = dmu_objset_spa(os);
939 	zilog->zl_dmu_pool = dmu_objset_pool(os);
940 	zilog->zl_prev_blk_sz = ZIL_MIN_BLKSZ;
941 
942 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
943 	    offsetof(itx_t, itx_node));
944 
945 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
946 	    offsetof(lwb_t, lwb_node));
947 
948 	list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
949 	    offsetof(zil_vdev_t, vdev_seq_node));
950 
951 	return (zilog);
952 }
953 
954 void
955 zil_free(zilog_t *zilog)
956 {
957 	lwb_t *lwb;
958 	zil_vdev_t *zv;
959 
960 	zilog->zl_stop_sync = 1;
961 
962 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
963 		list_remove(&zilog->zl_lwb_list, lwb);
964 		if (lwb->lwb_buf != NULL)
965 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
966 		kmem_cache_free(zil_lwb_cache, lwb);
967 	}
968 	list_destroy(&zilog->zl_lwb_list);
969 
970 	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
971 		list_remove(&zilog->zl_vdev_list, zv);
972 		kmem_free(zv, sizeof (zil_vdev_t));
973 	}
974 	list_destroy(&zilog->zl_vdev_list);
975 
976 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
977 	list_destroy(&zilog->zl_itx_list);
978 
979 	kmem_free(zilog, sizeof (zilog_t));
980 }
981 
982 /*
983  * Open an intent log.
984  */
985 zilog_t *
986 zil_open(objset_t *os, zil_get_data_t *get_data)
987 {
988 	zilog_t *zilog = dmu_objset_zil(os);
989 
990 	zilog->zl_get_data = get_data;
991 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
992 	    2, 2, TASKQ_PREPOPULATE);
993 
994 	return (zilog);
995 }
996 
997 /*
998  * Close an intent log.
999  */
1000 void
1001 zil_close(zilog_t *zilog)
1002 {
1003 	txg_wait_synced(zilog->zl_dmu_pool, 0);
1004 	taskq_destroy(zilog->zl_clean_taskq);
1005 	zilog->zl_clean_taskq = NULL;
1006 	zilog->zl_get_data = NULL;
1007 
1008 	zil_itx_clean(zilog);
1009 	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
1010 }
1011 
1012 /*
1013  * Suspend an intent log.  While in suspended mode, we still honor
1014  * synchronous semantics, but we rely on txg_wait_synced() to do it.
1015  * We suspend the log briefly when taking a snapshot so that the snapshot
1016  * contains all the data it's supposed to, and has an empty intent log.
1017  */
1018 int
1019 zil_suspend(zilog_t *zilog)
1020 {
1021 	lwb_t *lwb;
1022 
1023 	mutex_enter(&zilog->zl_lock);
1024 	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
1025 		mutex_exit(&zilog->zl_lock);
1026 		return (EBUSY);
1027 	}
1028 	zilog->zl_suspend++;
1029 	mutex_exit(&zilog->zl_lock);
1030 
1031 	zil_commit(zilog, UINT64_MAX, FSYNC);
1032 
1033 	mutex_enter(&zilog->zl_lock);
1034 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
1035 		if (lwb->lwb_buf != NULL) {
1036 			/*
1037 			 * Wait for the buffer if it's in the process of
1038 			 * being written.
1039 			 */
1040 			if ((lwb->lwb_seq != 0) &&
1041 			    (lwb->lwb_state != SEQ_COMPLETE)) {
1042 				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
1043 				continue;
1044 			}
1045 			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
1046 		}
1047 		list_remove(&zilog->zl_lwb_list, lwb);
1048 		kmem_cache_free(zil_lwb_cache, lwb);
1049 	}
1050 	mutex_exit(&zilog->zl_lock);
1051 
1052 	zil_destroy(zilog);
1053 
1054 	return (0);
1055 }
1056 
1057 void
1058 zil_resume(zilog_t *zilog)
1059 {
1060 	mutex_enter(&zilog->zl_lock);
1061 	ASSERT(zilog->zl_suspend != 0);
1062 	zilog->zl_suspend--;
1063 	mutex_exit(&zilog->zl_lock);
1064 }
1065 
/*
 * Argument bundle handed to zil_replay_log_record() while a log is
 * being replayed.
 */
typedef struct zil_replay_arg {
	objset_t	*zr_os;		/* presumably the objset being replayed - confirm */
	zil_replay_func_t **zr_replay;	/* presumably replay vectors indexed by txtype - confirm */
	void		*zr_arg;	/* opaque; presumably passed to the replay vectors - confirm */
	void		(*zr_rm_sync)(void *arg); /* callback; use not visible here - confirm */
	uint64_t	*zr_txgp;	/* use not visible here - confirm */
	boolean_t	zr_byteswap;	/* if set, the record copy is byteswapped back */
	char		*zr_lrbuf;	/* scratch buffer the record is copied into */
} zil_replay_arg_t;
1075 
1076 static void
1077 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
1078 {
1079 	zil_replay_arg_t *zr = zra;
1080 	zil_header_t *zh = zilog->zl_header;
1081 	uint64_t reclen = lr->lrc_reclen;
1082 	uint64_t txtype = lr->lrc_txtype;
1083 	int pass, error;
1084 
1085 	if (zilog->zl_stop_replay)
1086 		return;
1087 
1088 	if (lr->lrc_txg < claim_txg)		/* already committed */
1089 		return;
1090 
1091 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
1092 		return;
1093 
1094 	/*
1095 	 * Make a copy of the data so we can revise and extend it.
1096 	 */
1097 	bcopy(lr, zr->zr_lrbuf, reclen);
1098 
1099 	/*
1100 	 * The log block containing this lr may have been byteswapped
1101 	 * so that we can easily examine common fields like lrc_txtype.
1102 	 * However, the log is a mix of different data types, and only the
1103 	 * replay vectors know how to byteswap their records.  Therefore, if
1104 	 * the lr was byteswapped, undo it before invoking the replay vector.
1105 	 */
1106 	if (zr->zr_byteswap)
1107 		byteswap_uint64_array(zr->zr_lrbuf, reclen);
1108 
1109 	/*
1110 	 * If this is a TX_WRITE with a blkptr, suck in the data.
1111 	 */
1112 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
1113 		lr_write_t *lrw = (lr_write_t *)lr;
1114 		blkptr_t *wbp = &lrw->lr_blkptr;
1115 		uint64_t wlen = lrw->lr_length;
1116 		char *wbuf = zr->zr_lrbuf + reclen;
1117 
1118 		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
1119 			bzero(wbuf, wlen);
1120 		} else {
1121 			/*
1122 			 * A subsequent write may have overwritten this block,
1123 			 * in which case wbp may have been been freed and
1124 			 * reallocated, and our read of wbp may fail with a
1125 			 * checksum error.  We can safely ignore this because
1126 			 * the later write will provide the correct data.
1127 			 */
1128 			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
1129 			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
1130 			    ZIO_PRIORITY_SYNC_READ,
1131 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
1132 			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
1133 		}
1134 	}
1135 
1136 	/*
1137 	 * We must now do two things atomically: replay this log record,
1138 	 * and update the log header to reflect the fact that we did so.
1139 	 * We use the DMU's ability to assign into a specific txg to do this.
1140 	 */
1141 	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
1142 		uint64_t replay_txg;
1143 		dmu_tx_t *replay_tx;
1144 
1145 		replay_tx = dmu_tx_create(zr->zr_os);
1146 		error = dmu_tx_assign(replay_tx, TXG_WAIT);
1147 		if (error) {
1148 			dmu_tx_abort(replay_tx);
1149 			break;
1150 		}
1151 
1152 		replay_txg = dmu_tx_get_txg(replay_tx);
1153 
1154 		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
1155 			error = EINVAL;
1156 		} else {
1157 			/*
1158 			 * On the first pass, arrange for the replay vector
1159 			 * to fail its dmu_tx_assign().  That's the only way
1160 			 * to ensure that those code paths remain well tested.
1161 			 */
1162 			*zr->zr_txgp = replay_txg - (pass == 1);
1163 			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
1164 			    zr->zr_byteswap);
1165 			*zr->zr_txgp = TXG_NOWAIT;
1166 		}
1167 
1168 		if (error == 0) {
1169 			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
1170 			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
1171 			    lr->lrc_seq;
1172 		}
1173 
1174 		dmu_tx_commit(replay_tx);
1175 
1176 		if (error != ERESTART)
1177 			break;
1178 
1179 		if (pass != 1)
1180 			txg_wait_open(spa_get_dsl(zilog->zl_spa),
1181 			    replay_txg + 1);
1182 
1183 		dprintf("pass %d, retrying\n", pass);
1184 	}
1185 
1186 	if (error) {
1187 		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
1188 		dmu_objset_name(zr->zr_os, name);
1189 		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
1190 		    "dataset %s, seq 0x%llx, txtype %llu\n",
1191 		    error, name,
1192 		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
1193 		zilog->zl_stop_replay = 1;
1194 		kmem_free(name, MAXNAMELEN);
1195 	}
1196 
1197 	/*
1198 	 * The DMU's dnode layer doesn't see removes until the txg commits,
1199 	 * so a subsequent claim can spuriously fail with EEXIST.
1200 	 * To prevent this, if we might have removed an object,
1201 	 * wait for the delete thread to delete it, and then
1202 	 * wait for the transaction group to sync.
1203 	 */
1204 	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
1205 		if (zr->zr_rm_sync != NULL)
1206 			zr->zr_rm_sync(zr->zr_arg);
1207 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
1208 	}
1209 }
1210 
1211 /*
1212  * If this dataset has an intent log, replay it and destroy it.
1213  */
1214 void
1215 zil_replay(objset_t *os, void *arg, uint64_t *txgp,
1216 	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
1217 {
1218 	zilog_t *zilog = dmu_objset_zil(os);
1219 	zil_replay_arg_t zr;
1220 
1221 	zr.zr_os = os;
1222 	zr.zr_replay = replay_func;
1223 	zr.zr_arg = arg;
1224 	zr.zr_rm_sync = rm_sync;
1225 	zr.zr_txgp = txgp;
1226 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
1227 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
1228 
1229 	/*
1230 	 * Wait for in-progress removes to sync before starting replay.
1231 	 */
1232 	if (rm_sync != NULL)
1233 		rm_sync(arg);
1234 	txg_wait_synced(zilog->zl_dmu_pool, 0);
1235 
1236 	zilog->zl_stop_replay = 0;
1237 	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
1238 	    zilog->zl_header->zh_claim_txg);
1239 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
1240 
1241 	zil_destroy(zilog);
1242 }
1243