xref: /titanic_41/usr/src/uts/common/fs/ufs/lufs_log.c (revision 7535ae1914017b0e648abd7a139aca709fa82be3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
29 #include <sys/errno.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/kmem.h>
33 #include <sys/conf.h>
34 #include <sys/proc.h>
35 #include <sys/cmn_err.h>
36 #include <sys/fssnap_if.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/fs/ufs_bio.h>
41 #include <sys/atomic.h>
42 
43 extern int		maxphys;
44 extern uint_t		bypass_snapshot_throttle_key;
45 
46 extern struct kmem_cache	*lufs_sv;
47 extern struct kmem_cache	*lufs_bp;
48 
49 static void
50 makebusy(ml_unit_t *ul, buf_t *bp)
51 {
52 	sema_p(&bp->b_sem);
53 	if ((bp->b_flags & B_ERROR) == 0)
54 		return;
55 	if (bp->b_flags & B_READ)
56 		ldl_seterror(ul, "Error reading ufs log");
57 	else
58 		ldl_seterror(ul, "Error writing ufs log");
59 }
60 
61 static int
62 logdone(buf_t *bp)
63 {
64 	bp->b_flags |= B_DONE;
65 
66 	if (bp->b_flags & B_WRITE)
67 		sema_v(&bp->b_sem);
68 	else
69 		/* wakeup the thread waiting on this buf */
70 		sema_v(&bp->b_io);
71 	return (0);
72 }
73 
74 static int
75 ldl_strategy_done(buf_t *cb)
76 {
77 	lufs_save_t	*sv;
78 	lufs_buf_t	*lbp;
79 	buf_t		*bp;
80 
81 	ASSERT(SEMA_HELD(&cb->b_sem));
82 	ASSERT((cb->b_flags & B_DONE) == 0);
83 
84 	/*
85 	 * Compute address of the ``save'' struct
86 	 */
87 	lbp = (lufs_buf_t *)cb;
88 	sv = (lufs_save_t *)lbp->lb_ptr;
89 
90 	if (cb->b_flags & B_ERROR)
91 		sv->sv_error = 1;
92 
93 	/*
94 	 * If this is the last request, release the resources and
95 	 * ``done'' the original buffer header.
96 	 */
97 	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
98 		kmem_cache_free(lufs_bp, lbp);
99 		return (1);
100 	}
101 	/* Propagate any errors back to the original buffer header */
102 	bp = sv->sv_bp;
103 	if (sv->sv_error)
104 		bp->b_flags |= B_ERROR;
105 	kmem_cache_free(lufs_bp, lbp);
106 	kmem_cache_free(lufs_sv, sv);
107 
108 	biodone(bp);
109 	return (0);
110 }
111 
112 /*
113  * Map the log logical block number to a physical disk block number
114  */
115 static int
116 map_frag(
117 	ml_unit_t	*ul,
118 	daddr_t		lblkno,
119 	size_t		bcount,
120 	daddr_t		*pblkno,
121 	size_t		*pbcount)
122 {
123 	ic_extent_t	*ext = ul->un_ebp->ic_extents;
124 	uint32_t	e = ul->un_ebp->ic_nextents;
125 	uint32_t	s = 0;
126 	uint32_t	i = e >> 1;
127 	uint32_t	lasti = i;
128 	uint32_t	bno_off;
129 
130 again:
131 	if (ext[i].ic_lbno <= lblkno) {
132 		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
133 			/* FOUND IT */
134 			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
135 			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
136 			*pblkno = ext[i].ic_pbno + bno_off;
137 			return (0);
138 		} else
139 			s = i;
140 	} else
141 		e = i;
142 	i = s + ((e - s) >> 1);
143 
144 	if (i == lasti) {
145 		*pbcount = bcount;
146 		return (ENOENT);
147 	}
148 	lasti = i;
149 
150 	goto again;
151 }
152 
153 /*
154  * The log is a set of extents (which typically will be only one, but
155  * may be more if the disk was close to full when the log was created)
156  * and hence the logical offsets into the log
157  * have to be translated into their real device locations before
158  * calling the device's strategy routine. The translation may result
159  * in several IO requests if this request spans extents.
160  */
161 void
162 ldl_strategy(ml_unit_t *ul, buf_t *pb)
163 {
164 	lufs_save_t	*sv;
165 	lufs_buf_t	*lbp;
166 	buf_t		*cb;
167 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
168 	daddr_t		lblkno, pblkno;
169 	size_t		nb_left, pbcount;
170 	off_t		offset;
171 	dev_t		dev	= ul->un_dev;
172 	int		error;
173 	int		read = pb->b_flags & B_READ;
174 
175 	/*
176 	 * Allocate and initialise the save stucture,
177 	 */
178 	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
179 	sv->sv_error = 0;
180 	sv->sv_bp = pb;
181 	nb_left = pb->b_bcount;
182 	sv->sv_nb_left = nb_left;
183 
184 	lblkno = pb->b_blkno;
185 	offset = 0;
186 
187 	do {
188 		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
189 
190 		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
191 		bioinit(&lbp->lb_buf);
192 		lbp->lb_ptr = sv;
193 
194 		cb = bioclone(pb, offset, pbcount, dev,
195 		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
196 
197 		offset += pbcount;
198 		lblkno += btodb(pbcount);
199 		nb_left -= pbcount;
200 
201 		if (error) {
202 			cb->b_flags |= B_ERROR;
203 			cb->b_resid = cb->b_bcount;
204 			biodone(cb);
205 		} else {
206 			if (read) {
207 				logstats.ls_ldlreads.value.ui64++;
208 				ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
209 				lwp_stat_update(LWP_STAT_INBLK, 1);
210 			} else {
211 				logstats.ls_ldlwrites.value.ui64++;
212 				lwp_stat_update(LWP_STAT_OUBLK, 1);
213 			}
214 
215 			/*
216 			 * write through the snapshot driver if necessary
217 			 * We do not want this write to be throttled because
218 			 * we are holding the un_log mutex here. If we
219 			 * are throttled in fssnap_translate, the fssnap_taskq
220 			 * thread which can wake us up can get blocked on
221 			 * the un_log mutex resulting in a deadlock.
222 			 */
223 			if (ufsvfsp->vfs_snapshot) {
224 				(void) tsd_set(bypass_snapshot_throttle_key,
225 				    (void *)1);
226 				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
227 
228 				(void) tsd_set(bypass_snapshot_throttle_key,
229 				    (void *)0);
230 			} else {
231 				(void) bdev_strategy(cb);
232 			}
233 		}
234 
235 	} while (nb_left);
236 }
237 
238 static void
239 writelog(ml_unit_t *ul, buf_t *bp)
240 {
241 	ASSERT(SEMA_HELD(&bp->b_sem));
242 
243 	/*
244 	 * This is really an B_ASYNC write but we want Presto to
245 	 * cache this write.  The iodone routine, logdone, processes
246 	 * the buf correctly.
247 	 */
248 	bp->b_flags = B_WRITE;
249 	bp->b_edev = ul->un_dev;
250 	bp->b_iodone = logdone;
251 
252 	/*
253 	 * return EIO for every IO if in hard error state
254 	 */
255 	if (ul->un_flags & LDL_ERROR) {
256 		bp->b_flags |= B_ERROR;
257 		bp->b_error = EIO;
258 		biodone(bp);
259 		return;
260 	}
261 
262 	ldl_strategy(ul, bp);
263 }
264 
265 static void
266 readlog(ml_unit_t *ul, buf_t *bp)
267 {
268 	ASSERT(SEMA_HELD(&bp->b_sem));
269 	ASSERT(bp->b_bcount);
270 
271 	bp->b_flags = B_READ;
272 	bp->b_edev = ul->un_dev;
273 	bp->b_iodone = logdone;
274 
275 	/* all IO returns errors when in error state */
276 	if (ul->un_flags & LDL_ERROR) {
277 		bp->b_flags |= B_ERROR;
278 		bp->b_error = EIO;
279 		biodone(bp);
280 		(void) trans_wait(bp);
281 		return;
282 	}
283 
284 	ldl_strategy(ul, bp);
285 
286 	if (trans_wait(bp))
287 		ldl_seterror(ul, "Error reading ufs log");
288 }
289 
290 /*
291  * NOTE: writers are single threaded thru the log layer.
292  * This means we can safely reference and change the cb and bp fields
293  * that ldl_read does not reference w/o holding the cb_rwlock or
294  * the bp makebusy lock.
295  */
296 static void
297 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
298 {
299 	buf_t		*newbp;
300 	cirbuf_t	*cb		= &ul->un_wrbuf;
301 
302 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
303 	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
304 
305 	/*
306 	 * async write the buf
307 	 */
308 	writelog(ul, bp);
309 
310 	/*
311 	 * no longer filling any buf
312 	 */
313 	cb->cb_dirty = NULL;
314 
315 	/*
316 	 * no extra buffer space; all done
317 	 */
318 	if (bp->b_bcount == bp->b_bufsize)
319 		return;
320 
321 	/*
322 	 * give extra buffer space to a new bp
323 	 * 	try to take buf off of free list
324 	 */
325 	if ((newbp = cb->cb_free) != NULL) {
326 		cb->cb_free = newbp->b_forw;
327 	} else {
328 		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
329 		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
330 		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
331 	}
332 	newbp->b_flags = 0;
333 	newbp->b_bcount = 0;
334 	newbp->b_file = NULL;
335 	newbp->b_offset = -1;
336 	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
337 	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
338 	bp->b_bufsize = bp->b_bcount;
339 
340 	/*
341 	 * lock out readers and put new buf at LRU position
342 	 */
343 	rw_enter(&cb->cb_rwlock, RW_WRITER);
344 	newbp->b_forw = bp->b_forw;
345 	newbp->b_back = bp;
346 	bp->b_forw->b_back = newbp;
347 	bp->b_forw = newbp;
348 	rw_exit(&cb->cb_rwlock);
349 }
350 
351 static void
352 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
353 {
354 	buf_t		*bp;
355 	off_t		elof	= lof + nb;
356 	off_t		buflof;
357 	off_t		bufelof;
358 
359 	/*
360 	 * discard all bufs that overlap the range (lof, lof + nb)
361 	 */
362 	rw_enter(&cb->cb_rwlock, RW_WRITER);
363 	bp = cb->cb_bp;
364 	do {
365 		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
366 			bp = bp->b_forw;
367 			continue;
368 		}
369 		buflof = dbtob(bp->b_blkno);
370 		bufelof = buflof + bp->b_bcount;
371 		if ((buflof < lof && bufelof <= lof) ||
372 		    (buflof >= elof && bufelof > elof)) {
373 			bp = bp->b_forw;
374 			continue;
375 		}
376 		makebusy(ul, bp);
377 		bp->b_flags = 0;
378 		bp->b_bcount = 0;
379 		sema_v(&bp->b_sem);
380 		bp = bp->b_forw;
381 	} while (bp != cb->cb_bp);
382 	rw_exit(&cb->cb_rwlock);
383 }
384 
385 /*
386  * NOTE: writers are single threaded thru the log layer.
387  * This means we can safely reference and change the cb and bp fields
388  * that ldl_read does not reference w/o holding the cb_rwlock or
389  * the bp makebusy lock.
390  */
391 static buf_t *
392 get_write_bp(ml_unit_t *ul)
393 {
394 	cirbuf_t	*cb = &ul->un_wrbuf;
395 	buf_t		*bp;
396 
397 	/*
398 	 * cb_dirty is the buffer we are currently filling; if any
399 	 */
400 	if ((bp = cb->cb_dirty) != NULL) {
401 		makebusy(ul, bp);
402 		return (bp);
403 	}
404 	/*
405 	 * discard any bp that overlaps the current tail since we are
406 	 * about to overwrite it.
407 	 */
408 	inval_range(ul, cb, ul->un_tail_lof, 1);
409 
410 	/*
411 	 * steal LRU buf
412 	 */
413 	rw_enter(&cb->cb_rwlock, RW_WRITER);
414 	bp = cb->cb_bp->b_forw;
415 	makebusy(ul, bp);
416 
417 	cb->cb_dirty = bp;
418 	cb->cb_bp = bp;
419 
420 	bp->b_flags = 0;
421 	bp->b_bcount = 0;
422 	bp->b_blkno = btodb(ul->un_tail_lof);
423 	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
424 	rw_exit(&cb->cb_rwlock);
425 
426 	/*
427 	 * NOTE:
428 	 *	1. un_tail_lof never addresses >= un_eol_lof
429 	 *	2. b_blkno + btodb(b_bufsize) may > un_eol_lof
430 	 *		this case is handled in storebuf
431 	 */
432 	return (bp);
433 }
434 
435 void
436 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
437 {
438 	int	i;
439 	buf_t	*bp;
440 
441 	/*
442 	 * Clear previous allocation
443 	 */
444 	if (cb->cb_nb)
445 		free_cirbuf(cb);
446 
447 	bzero(cb, sizeof (*cb));
448 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
449 
450 	rw_enter(&cb->cb_rwlock, RW_WRITER);
451 
452 	/*
453 	 * preallocate 3 bp's and put them on the free list.
454 	 */
455 	for (i = 0; i < 3; ++i) {
456 		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
457 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
458 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
459 		bp->b_offset = -1;
460 		bp->b_forw = cb->cb_free;
461 		cb->cb_free = bp;
462 	}
463 
464 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
465 	cb->cb_nb = bufsize;
466 
467 	/*
468 	 * first bp claims entire write buffer
469 	 */
470 	bp = cb->cb_free;
471 	cb->cb_free = bp->b_forw;
472 
473 	bp->b_forw = bp;
474 	bp->b_back = bp;
475 	cb->cb_bp = bp;
476 	bp->b_un.b_addr = cb->cb_va;
477 	bp->b_bufsize = cb->cb_nb;
478 
479 	rw_exit(&cb->cb_rwlock);
480 }
481 
482 void
483 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
484 {
485 	caddr_t	va;
486 	size_t	nb;
487 	buf_t	*bp;
488 
489 	/*
490 	 * Clear previous allocation
491 	 */
492 	if (cb->cb_nb)
493 		free_cirbuf(cb);
494 
495 	bzero(cb, sizeof (*cb));
496 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
497 
498 	rw_enter(&cb->cb_rwlock, RW_WRITER);
499 
500 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
501 	cb->cb_nb = bufsize;
502 
503 	/*
504 	 * preallocate N bufs that are hard-sized to blksize
505 	 *	in other words, the read buffer pool is a linked list
506 	 *	of statically sized bufs.
507 	 */
508 	va = cb->cb_va;
509 	while ((nb = bufsize) != 0) {
510 		if (nb > blksize)
511 			nb = blksize;
512 		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
513 		bzero(bp, sizeof (buf_t));
514 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
515 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
516 		bp->b_un.b_addr = va;
517 		bp->b_bufsize = nb;
518 		if (cb->cb_bp) {
519 			bp->b_forw = cb->cb_bp->b_forw;
520 			bp->b_back = cb->cb_bp;
521 			cb->cb_bp->b_forw->b_back = bp;
522 			cb->cb_bp->b_forw = bp;
523 		} else
524 			bp->b_forw = bp->b_back = bp;
525 		cb->cb_bp = bp;
526 		bufsize -= nb;
527 		va += nb;
528 	}
529 
530 	rw_exit(&cb->cb_rwlock);
531 }
532 
533 void
534 free_cirbuf(cirbuf_t *cb)
535 {
536 	buf_t	*bp;
537 
538 	if (cb->cb_nb == 0)
539 		return;
540 
541 	rw_enter(&cb->cb_rwlock, RW_WRITER);
542 	ASSERT(cb->cb_dirty == NULL);
543 
544 	/*
545 	 * free the active bufs
546 	 */
547 	while ((bp = cb->cb_bp) != NULL) {
548 		if (bp == bp->b_forw)
549 			cb->cb_bp = NULL;
550 		else
551 			cb->cb_bp = bp->b_forw;
552 		bp->b_back->b_forw = bp->b_forw;
553 		bp->b_forw->b_back = bp->b_back;
554 		sema_destroy(&bp->b_sem);
555 		sema_destroy(&bp->b_io);
556 		kmem_free(bp, sizeof (buf_t));
557 	}
558 
559 	/*
560 	 * free the free bufs
561 	 */
562 	while ((bp = cb->cb_free) != NULL) {
563 		cb->cb_free = bp->b_forw;
564 		sema_destroy(&bp->b_sem);
565 		sema_destroy(&bp->b_io);
566 		kmem_free(bp, sizeof (buf_t));
567 	}
568 	kmem_free(cb->cb_va, cb->cb_nb);
569 	cb->cb_va = NULL;
570 	cb->cb_nb = 0;
571 	rw_exit(&cb->cb_rwlock);
572 	rw_destroy(&cb->cb_rwlock);
573 }
574 
575 static int
576 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
577 {
578 	off_t	blof	= dbtob(blkno);
579 
580 	return ((lof >= blof) && (lof < (blof + bcount)));
581 }
582 
583 static buf_t *
584 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
585 {
586 	buf_t *bp;
587 
588 	/*
589 	 * find a buf that contains the offset lof
590 	 */
591 	rw_enter(&cb->cb_rwlock, RW_READER);
592 	bp = cb->cb_bp;
593 	do {
594 		if (bp->b_bcount &&
595 		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
596 			makebusy(ul, bp);
597 			rw_exit(&cb->cb_rwlock);
598 			return (bp);
599 		}
600 		bp = bp->b_forw;
601 	} while (bp != cb->cb_bp);
602 	rw_exit(&cb->cb_rwlock);
603 
604 	return (NULL);
605 }
606 
607 static off_t
608 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
609 {
610 	buf_t	*bp, *bpend;
611 	off_t	rlof;
612 
613 	/*
614 	 * we mustn't:
615 	 *	o read past eol
616 	 *	o read past the tail
617 	 *	o read data that may be being written.
618 	 */
619 	rw_enter(&cb->cb_rwlock, RW_READER);
620 	bpend = bp = cb->cb_bp->b_forw;
621 	rlof = ul->un_tail_lof;
622 	do {
623 		if (bp->b_bcount) {
624 			rlof = dbtob(bp->b_blkno);
625 			break;
626 		}
627 		bp = bp->b_forw;
628 	} while (bp != bpend);
629 	rw_exit(&cb->cb_rwlock);
630 
631 	if (lof <= rlof)
632 		/* lof is prior to the range represented by the write buf */
633 		return (rlof);
634 	else
635 		/* lof follows the range represented by the write buf */
636 		return ((off_t)ul->un_eol_lof);
637 }
638 
639 static buf_t *
640 get_read_bp(ml_unit_t *ul, off_t lof)
641 {
642 	cirbuf_t	*cb;
643 	buf_t		*bp;
644 	off_t		rlof;
645 
646 	/*
647 	 * retrieve as much data as possible from the incore buffers
648 	 */
649 	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
650 		logstats.ls_lreadsinmem.value.ui64++;
651 		return (bp);
652 	}
653 	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
654 		logstats.ls_lreadsinmem.value.ui64++;
655 		return (bp);
656 	}
657 
658 	/*
659 	 * steal the LRU buf
660 	 */
661 	cb = &ul->un_rdbuf;
662 	rw_enter(&cb->cb_rwlock, RW_WRITER);
663 	bp = cb->cb_bp->b_forw;
664 	makebusy(ul, bp);
665 	bp->b_flags = 0;
666 	bp->b_bcount = 0;
667 	cb->cb_bp = bp;
668 	rw_exit(&cb->cb_rwlock);
669 
670 	/*
671 	 * don't read past the tail or the end-of-log
672 	 */
673 	bp->b_blkno = btodb(lof);
674 	lof = dbtob(bp->b_blkno);
675 	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
676 	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
677 	readlog(ul, bp);
678 	return (bp);
679 }
680 
681 /*
682  * NOTE: writers are single threaded thru the log layer.
683  * This means we can safely reference and change the cb and bp fields
684  * that ldl_read does not reference w/o holding the cb_rwlock or
685  * the bp makebusy lock.
686  */
687 static int
688 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
689 {
690 	buf_t	*bpforw	= bp->b_forw;
691 
692 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
693 
694 	/*
695 	 * there is no `next' bp; do nothing
696 	 */
697 	if (bpforw == bp)
698 		return (0);
699 
700 	/*
701 	 * buffer space is not adjacent; do nothing
702 	 */
703 	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
704 		return (0);
705 
706 	/*
707 	 * locking protocol requires giving up any bp locks before
708 	 * acquiring cb_rwlock.  This is okay because we hold
709 	 * un_log_mutex.
710 	 */
711 	sema_v(&bp->b_sem);
712 
713 	/*
714 	 * lock out ldl_read
715 	 */
716 	rw_enter(&cb->cb_rwlock, RW_WRITER);
717 
718 	/*
719 	 * wait for current IO to finish w/next bp; if necessary
720 	 */
721 	makebusy(ul, bpforw);
722 
723 	/*
724 	 * free the next bp and steal its space
725 	 */
726 	bp->b_forw = bpforw->b_forw;
727 	bpforw->b_forw->b_back = bp;
728 	bp->b_bufsize += bpforw->b_bufsize;
729 	sema_v(&bpforw->b_sem);
730 	bpforw->b_forw = cb->cb_free;
731 	cb->cb_free = bpforw;
732 	makebusy(ul, bp);
733 	rw_exit(&cb->cb_rwlock);
734 
735 	return (1);
736 }
737 
738 static size_t
739 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
740 {
741 	size_t		copy_nb;
742 	size_t		nb_in_sec;
743 	sect_trailer_t	*st;
744 	size_t		nb_left = nb;
745 	cirbuf_t	*cb	= &ul->un_wrbuf;
746 
747 again:
748 	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
749 	copy_nb = MIN(nb_left, nb_in_sec);
750 
751 	ASSERT(copy_nb);
752 
753 	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
754 	bp->b_bcount += copy_nb;
755 	va += copy_nb;
756 	nb_left -= copy_nb;
757 	ul->un_tail_lof += copy_nb;
758 
759 	if ((nb_in_sec -= copy_nb) == 0) {
760 		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
761 
762 		st->st_tid = ul->un_logmap->mtm_tid;
763 		st->st_ident = ul->un_tail_ident++;
764 		bp->b_bcount += sizeof (sect_trailer_t);
765 		ul->un_tail_lof += sizeof (sect_trailer_t);
766 		/*
767 		 * log wrapped; async write this bp
768 		 */
769 		if (ul->un_tail_lof == ul->un_eol_lof) {
770 			ul->un_tail_lof = ul->un_bol_lof;
771 			push_dirty_bp(ul, bp);
772 			return (nb - nb_left);
773 		}
774 		/*
775 		 * out of bp space; get more or async write buf
776 		 */
777 		if (bp->b_bcount == bp->b_bufsize) {
778 			if (!extend_write_bp(ul, cb, bp)) {
779 				push_dirty_bp(ul, bp);
780 				return (nb - nb_left);
781 			}
782 		}
783 	}
784 	if (nb_left)
785 		goto again;
786 
787 	sema_v(&bp->b_sem);
788 	return (nb);
789 }
790 
791 static void
792 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
793 {
794 	offset_t	src_mof	= me->me_mof;
795 	size_t		src_nb	= me->me_nb;
796 
797 	if (src_mof > dst_mof) {
798 		ASSERT(src_mof < (dst_mof + dst_nb));
799 		dst_va += (src_mof - dst_mof);
800 		dst_nb -= (src_mof - dst_mof);
801 	} else {
802 		ASSERT(dst_mof < (src_mof + src_nb));
803 		src_nb -= (dst_mof - src_mof);
804 	}
805 
806 	src_nb = MIN(src_nb, dst_nb);
807 	ASSERT(src_nb);
808 	bzero(dst_va, src_nb);
809 }
810 
811 /*
812  * dst_va == NULL means don't copy anything
813  */
814 static ulong_t
815 fetchbuf(
816 	ml_unit_t *ul,
817 	buf_t *bp,
818 	caddr_t dst_va,
819 	size_t dst_nb,
820 	off_t *dst_lofp)
821 {
822 	caddr_t	copy_va;
823 	size_t	copy_nb;
824 	size_t	nb_sec;
825 	off_t	dst_lof		= *dst_lofp;
826 	ulong_t	sav_dst_nb	= dst_nb;
827 	ulong_t	src_nb		= bp->b_bcount;
828 	off_t	src_lof		= dbtob(bp->b_blkno);
829 	off_t	src_elof	= src_lof + src_nb;
830 	caddr_t	src_va		= bp->b_un.b_addr;
831 
832 	/*
833 	 * copy from bp to dst_va
834 	 */
835 	while (dst_nb) {
836 		/*
837 		 * compute address within bp
838 		 */
839 		copy_va = src_va + (dst_lof - src_lof);
840 
841 		/*
842 		 * adjust copy size to amount of data in bp
843 		 */
844 		copy_nb = MIN(dst_nb, src_elof - dst_lof);
845 
846 		/*
847 		 * adjust copy size to amount of data in sector
848 		 */
849 		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
850 		copy_nb = MIN(copy_nb, nb_sec);
851 
852 		/*
853 		 * dst_va == NULL means don't do copy (see logseek())
854 		 */
855 		if (dst_va) {
856 			bcopy(copy_va, dst_va, copy_nb);
857 			dst_va += copy_nb;
858 		}
859 		dst_lof += copy_nb;
860 		dst_nb -= copy_nb;
861 		nb_sec -= copy_nb;
862 
863 		/*
864 		 * advance over sector trailer
865 		 */
866 		if (nb_sec == 0)
867 			dst_lof += sizeof (sect_trailer_t);
868 
869 		/*
870 		 * exhausted buffer
871 		 *	return current lof for next read
872 		 */
873 		if (dst_lof == src_elof) {
874 			sema_v(&bp->b_sem);
875 			if (dst_lof == ul->un_eol_lof)
876 				dst_lof = ul->un_bol_lof;
877 			*dst_lofp = dst_lof;
878 			return (sav_dst_nb - dst_nb);
879 		}
880 	}
881 
882 	/*
883 	 * copy complete - return current lof
884 	 */
885 	sema_v(&bp->b_sem);
886 	*dst_lofp = dst_lof;
887 	return (sav_dst_nb);
888 }
889 
890 void
891 ldl_round_commit(ml_unit_t *ul)
892 {
893 	int		wrapped;
894 	buf_t		*bp;
895 	sect_trailer_t	*st;
896 	size_t		bcount;
897 	cirbuf_t	*cb	= &ul->un_wrbuf;
898 
899 	/*
900 	 * if nothing to write; then do nothing
901 	 */
902 	if ((bp = cb->cb_dirty) == NULL)
903 		return;
904 	makebusy(ul, bp);
905 
906 	/*
907 	 * round up to sector boundary and set new tail
908 	 *	don't readjust st_ident if buf is already rounded
909 	 */
910 	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
911 	if (bcount == bp->b_bcount) {
912 		sema_v(&bp->b_sem);
913 		return;
914 	}
915 	bp->b_bcount = bcount;
916 	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
917 	wrapped = 0;
918 	if (ul->un_tail_lof == ul->un_eol_lof) {
919 		ul->un_tail_lof = ul->un_bol_lof;
920 		++wrapped;
921 	}
922 	ASSERT(ul->un_tail_lof != ul->un_head_lof);
923 
924 	/*
925 	 * fix up the sector trailer
926 	 */
927 	/* LINTED */
928 	st = (sect_trailer_t *)
929 	    ((bp->b_un.b_addr + bcount) - sizeof (*st));
930 	st->st_tid = ul->un_logmap->mtm_tid;
931 	st->st_ident = ul->un_tail_ident++;
932 
933 	/*
934 	 * if tail wrapped or we have exhausted this buffer
935 	 *	async write the buffer
936 	 */
937 	if (wrapped || bcount == bp->b_bufsize)
938 		push_dirty_bp(ul, bp);
939 	else
940 		sema_v(&bp->b_sem);
941 }
942 
943 void
944 ldl_push_commit(ml_unit_t *ul)
945 {
946 	buf_t		*bp;
947 	cirbuf_t	*cb	= &ul->un_wrbuf;
948 
949 	/*
950 	 * if nothing to write; then do nothing
951 	 */
952 	if ((bp = cb->cb_dirty) == NULL)
953 		return;
954 	makebusy(ul, bp);
955 	push_dirty_bp(ul, bp);
956 }
957 
958 int
959 ldl_need_commit(ml_unit_t *ul)
960 {
961 	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
962 }
963 
964 int
965 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
966 {
967 	off_t	nfb;
968 	off_t	nb;
969 
970 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
971 
972 	/*
973 	 * Add up the size used by the deltas
974 	 * round nb up to a sector length plus an extra sector
975 	 *	w/o the extra sector we couldn't distinguish
976 	 *	a full log (head == tail) from an empty log (head == tail)
977 	 */
978 	for (nb = DEV_BSIZE; me; me = me->me_hash) {
979 		nb += sizeof (struct delta);
980 		if (me->me_dt != DT_CANCEL)
981 			nb += me->me_nb;
982 	}
983 	nb = P2ROUNDUP(nb, DEV_BSIZE);
984 
985 	if (ul->un_head_lof <= ul->un_tail_lof)
986 		nfb = (ul->un_head_lof - ul->un_bol_lof) +
987 		    (ul->un_eol_lof - ul->un_tail_lof);
988 	else
989 		nfb = ul->un_head_lof - ul->un_tail_lof;
990 
991 	return (nb < nfb);
992 }
993 
994 void
995 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
996 {
997 	buf_t		*bp;
998 	caddr_t		va;
999 	size_t		nb;
1000 	size_t		actual;
1001 
1002 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1003 
1004 	/* Write the delta */
1005 
1006 	nb = sizeof (struct delta);
1007 	va = (caddr_t)&me->me_delta;
1008 	bp = get_write_bp(ul);
1009 
1010 	while (nb) {
1011 		if (ul->un_flags & LDL_ERROR) {
1012 			sema_v(&bp->b_sem);
1013 			return;
1014 		}
1015 		actual = storebuf(ul, bp, va, nb);
1016 		ASSERT(actual);
1017 		va += actual;
1018 		nb -= actual;
1019 		if (nb)
1020 			bp = get_write_bp(ul);
1021 	}
1022 
1023 	/* If a commit, cancel, or 0's; we're almost done */
1024 	switch (me->me_dt) {
1025 		case DT_COMMIT:
1026 		case DT_CANCEL:
1027 		case DT_ABZERO:
1028 			/* roll needs to know where the next delta will go */
1029 			me->me_lof = ul->un_tail_lof;
1030 			return;
1031 		default:
1032 			break;
1033 	}
1034 
1035 	/* Now write the data */
1036 
1037 	ASSERT(me->me_nb != 0);
1038 
1039 	nb = me->me_nb;
1040 	va = (me->me_mof - bufmof) + bufp;
1041 	bp = get_write_bp(ul);
1042 
1043 	/* Save where we will put the data */
1044 	me->me_lof = ul->un_tail_lof;
1045 
1046 	while (nb) {
1047 		if (ul->un_flags & LDL_ERROR) {
1048 			sema_v(&bp->b_sem);
1049 			return;
1050 		}
1051 		actual = storebuf(ul, bp, va, nb);
1052 		ASSERT(actual);
1053 		va += actual;
1054 		nb -= actual;
1055 		if (nb)
1056 			bp = get_write_bp(ul);
1057 	}
1058 }
1059 
1060 void
1061 ldl_waito(ml_unit_t *ul)
1062 {
1063 	buf_t		*bp;
1064 	cirbuf_t	*cb	= &ul->un_wrbuf;
1065 
1066 	rw_enter(&cb->cb_rwlock, RW_WRITER);
1067 	/*
1068 	 * wait on them
1069 	 */
1070 	bp = cb->cb_bp;
1071 	do {
1072 		if ((bp->b_flags & B_DONE) == 0) {
1073 			makebusy(ul, bp);
1074 			sema_v(&bp->b_sem);
1075 		}
1076 		bp = bp->b_forw;
1077 	} while (bp != cb->cb_bp);
1078 	rw_exit(&cb->cb_rwlock);
1079 }
1080 
1081 /*
1082  * seek nb bytes from location lof
1083  */
1084 static int
1085 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1086 {
1087 	buf_t	*bp;
1088 	ulong_t	actual;
1089 
1090 	while (nb) {
1091 		bp = get_read_bp(ul, lof);
1092 		if (bp->b_flags & B_ERROR) {
1093 			sema_v(&bp->b_sem);
1094 			return (EIO);
1095 		}
1096 		actual = fetchbuf(ul, bp, NULL, nb, &lof);
1097 		ASSERT(actual);
1098 		nb -= actual;
1099 	}
1100 	*lofp = lof;
1101 	ASSERT(nb == 0);
1102 	return (0);
1103 }
1104 
1105 int
1106 ldl_read(
1107 	ml_unit_t *ul,		/* Log unit */
1108 	caddr_t va,		/* address of buffer to read into */
1109 	offset_t mof,		/* mof of buffer */
1110 	off_t nb,		/* length of buffer */
1111 	mapentry_t *me)		/* Map entry list */
1112 {
1113 	buf_t	*bp;
1114 	crb_t   *crb;
1115 	caddr_t	rva;			/* address to read into */
1116 	size_t	rnb;			/* # of bytes to read */
1117 	off_t	lof;			/* log device offset to read from */
1118 	off_t   skip;
1119 	ulong_t	actual;
1120 	int	error;
1121 	caddr_t	eva	= va + nb;	/* end of buffer */
1122 
1123 	for (; me; me = me->me_agenext) {
1124 		ASSERT(me->me_dt != DT_CANCEL);
1125 
1126 		/*
1127 		 * check for an cached roll buffer
1128 		 */
1129 		crb = me->me_crb;
1130 		if (crb) {
1131 			if (mof > crb->c_mof) {
1132 				/*
1133 				 * This mapentry overlaps with the beginning of
1134 				 * the supplied buffer
1135 				 */
1136 				skip = mof - crb->c_mof;
1137 				bcopy(crb->c_buf + skip, va,
1138 				    MIN(nb, crb->c_nb - skip));
1139 			} else {
1140 				/*
1141 				 * This mapentry starts at or after
1142 				 * the supplied buffer.
1143 				 */
1144 				skip = crb->c_mof - mof;
1145 				bcopy(crb->c_buf, va + skip,
1146 				    MIN(crb->c_nb, nb - skip));
1147 			}
1148 			logstats.ls_lreadsinmem.value.ui64++;
1149 			continue;
1150 		}
1151 
1152 		/*
1153 		 * check for a delta full of zeroes - there's no log data
1154 		 */
1155 		if (me->me_dt == DT_ABZERO) {
1156 			fetchzeroes(va, mof, nb, me);
1157 			continue;
1158 		}
1159 
1160 		if (mof > me->me_mof) {
1161 			rnb = (size_t)(mof - me->me_mof);
1162 			error = logseek(ul, me->me_lof, rnb, &lof);
1163 			if (error)
1164 				return (EIO);
1165 			rva = va;
1166 			rnb = me->me_nb - rnb;
1167 			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1168 		} else {
1169 			lof = me->me_lof;
1170 			rva = (me->me_mof - mof) + va;
1171 			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1172 		}
1173 
1174 		while (rnb) {
1175 			bp = get_read_bp(ul, lof);
1176 			if (bp->b_flags & B_ERROR) {
1177 				sema_v(&bp->b_sem);
1178 				return (EIO);
1179 			}
1180 			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1181 			    (bp != ul->un_wrbuf.cb_dirty));
1182 			actual = fetchbuf(ul, bp, rva, rnb, &lof);
1183 			ASSERT(actual);
1184 			rva += actual;
1185 			rnb -= actual;
1186 		}
1187 	}
1188 	return (0);
1189 }
1190 
1191 void
1192 ldl_savestate(ml_unit_t *ul)
1193 {
1194 	int		error;
1195 	buf_t		*bp	= ul->un_bp;
1196 	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
1197 	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);
1198 
1199 #if	DEBUG
1200 	/*
1201 	 * Scan test is running; don't update intermediate state
1202 	 */
1203 	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1204 		return;
1205 #endif	/* DEBUG */
1206 
1207 	mutex_enter(&ul->un_state_mutex);
1208 	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1209 	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1210 	bcopy(ud, ud2, sizeof (*ud));
1211 
1212 	/* If a snapshot is enabled write through the shapshot driver. */
1213 	if (ul->un_ufsvfs->vfs_snapshot)
1214 		UFS_BWRITE2(ul->un_ufsvfs, bp);
1215 	else
1216 		BWRITE2(bp);
1217 	logstats.ls_ldlwrites.value.ui64++;
1218 	error = bp->b_flags & B_ERROR;
1219 	mutex_exit(&ul->un_state_mutex);
1220 	if (error)
1221 		ldl_seterror(ul, "Error writing ufs log state");
1222 }
1223 
1224 /*
1225  * The head will be set to (new_lof - header) since ldl_sethead is
1226  * called with the new_lof of the data portion of a delta.
1227  */
1228 void
1229 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1230 {
1231 	off_t		nb;
1232 	off_t		new_lof;
1233 	uint32_t	new_ident;
1234 	daddr_t		beg_blkno;
1235 	daddr_t		end_blkno;
1236 
1237 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1238 
1239 	if (data_lof == -1) {
1240 		/* log is empty */
1241 		new_ident = lufs_hd_genid(ul);
1242 		new_lof = ul->un_tail_lof;
1243 
1244 	} else {
1245 		/* compute header's lof */
1246 		new_ident = ul->un_head_ident;
1247 		new_lof = data_lof - sizeof (struct delta);
1248 
1249 		/* whoops, header spans sectors; subtract out sector trailer */
1250 		if (btodb(new_lof) != btodb(data_lof))
1251 			new_lof -= sizeof (sect_trailer_t);
1252 
1253 		/* whoops, header wrapped the log; go to last sector */
1254 		if (new_lof < ul->un_bol_lof) {
1255 			/* sector offset */
1256 			new_lof -= dbtob(btodb(new_lof));
1257 			/* add to last sector's lof */
1258 			new_lof += (ul->un_eol_lof - DEV_BSIZE);
1259 		}
1260 		ul->un_head_tid = tid;
1261 	}
1262 
1263 	/*
1264 	 * check for nop
1265 	 */
1266 	if (new_lof == ul->un_head_lof)
1267 		return;
1268 
1269 	/*
1270 	 * invalidate the affected bufs and calculate new ident
1271 	 */
1272 	if (new_lof > ul->un_head_lof) {
1273 		nb = new_lof - ul->un_head_lof;
1274 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1275 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1276 
1277 		end_blkno = btodb(new_lof);
1278 		beg_blkno = btodb(ul->un_head_lof);
1279 		new_ident += (end_blkno - beg_blkno);
1280 	} else {
1281 		nb = ul->un_eol_lof - ul->un_head_lof;
1282 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1283 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1284 
1285 		end_blkno = btodb(ul->un_eol_lof);
1286 		beg_blkno = btodb(ul->un_head_lof);
1287 		new_ident += (end_blkno - beg_blkno);
1288 
1289 		nb = new_lof - ul->un_bol_lof;
1290 		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1291 		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1292 
1293 		end_blkno = btodb(new_lof);
1294 		beg_blkno = btodb(ul->un_bol_lof);
1295 		new_ident += (end_blkno - beg_blkno);
1296 	}
1297 	/*
1298 	 * don't update the head if there has been an error
1299 	 */
1300 	if (ul->un_flags & LDL_ERROR)
1301 		return;
1302 
1303 	/* Fix up the head and ident */
1304 	ASSERT(new_lof >= ul->un_bol_lof);
1305 	ul->un_head_lof = new_lof;
1306 	ul->un_head_ident = new_ident;
1307 	if (data_lof == -1) {
1308 		ul->un_tail_ident = ul->un_head_ident;
1309 	}
1310 
1311 
1312 	/* Commit to the database */
1313 	ldl_savestate(ul);
1314 
1315 	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1316 	    ldl_sethead_debug(ul));
1317 }
1318 
1319 /*
1320  * The tail will be set to the sector following lof+nb
1321  *	lof + nb == size of the last delta + commit record
1322  *	this function is called once after the log scan has completed.
1323  */
1324 void
1325 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1326 {
1327 	off_t		new_lof;
1328 	uint32_t	new_ident;
1329 	daddr_t		beg_blkno;
1330 	daddr_t		end_blkno;
1331 
1332 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1333 
1334 	if (lof == -1) {
1335 		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1336 		ul->un_head_lof = ul->un_tail_lof;
1337 		ul->un_head_ident = lufs_hd_genid(ul);
1338 		ul->un_tail_ident = ul->un_head_ident;
1339 
1340 		/* Commit to the database */
1341 		ldl_savestate(ul);
1342 
1343 		return;
1344 	}
1345 
1346 	/*
1347 	 * new_lof is the offset of the sector following the last commit
1348 	 */
1349 	(void) logseek(ul, lof, nb, &new_lof);
1350 	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1351 
1352 	/*
1353 	 * calculate new ident
1354 	 */
1355 	if (new_lof > ul->un_head_lof) {
1356 		end_blkno = btodb(new_lof);
1357 		beg_blkno = btodb(ul->un_head_lof);
1358 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1359 	} else {
1360 		end_blkno = btodb(ul->un_eol_lof);
1361 		beg_blkno = btodb(ul->un_head_lof);
1362 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1363 
1364 		end_blkno = btodb(new_lof);
1365 		beg_blkno = btodb(ul->un_bol_lof);
1366 		new_ident += (end_blkno - beg_blkno);
1367 	}
1368 
1369 	/* Fix up the tail and ident */
1370 	ul->un_tail_lof = new_lof;
1371 	ul->un_tail_ident = new_ident;
1372 
1373 	/* Commit to the database */
1374 	ldl_savestate(ul);
1375 }
1376 
1377 /*
1378  * LOGSCAN STUFF
1379  */
1380 static int
1381 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1382 {
1383 	ulong_t		ident;
1384 	size_t		nblk, i;
1385 	sect_trailer_t	*st;
1386 
1387 	/*
1388 	 * compute ident for first sector in the buffer
1389 	 */
1390 	ident = ul->un_head_ident;
1391 	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1392 		ident += (bp->b_blkno - btodb(ul->un_head_lof));
1393 	} else {
1394 		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1395 		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1396 	}
1397 	/*
1398 	 * truncate the buffer down to the last valid sector
1399 	 */
1400 	nblk = btodb(bp->b_bcount);
1401 	bp->b_bcount = 0;
1402 	/* LINTED */
1403 	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1404 	for (i = 0; i < nblk; ++i) {
1405 		if (st->st_ident != ident)
1406 			break;
1407 
1408 		/* remember last valid tid for ldl_logscan_error() */
1409 		ul->un_tid = st->st_tid;
1410 
1411 		/* LINTED */
1412 		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1413 		++ident;
1414 		bp->b_bcount += DEV_BSIZE;
1415 	}
1416 	/*
1417 	 * make sure that lof is still within range
1418 	 */
1419 	return (within_range(lof, bp->b_blkno, bp->b_bcount));
1420 }
1421 
1422 ulong_t
1423 ldl_logscan_nbcommit(off_t lof)
1424 {
1425 	/*
1426 	 * lof is the offset following the commit header.  However,
1427 	 * if the commit header fell on the end-of-sector, then lof
1428 	 * has already been advanced to the beginning of the next
1429 	 * sector.  So do nothing.  Otherwise, return the remaining
1430 	 * bytes in the sector.
1431 	 */
1432 	if ((lof & (DEV_BSIZE - 1)) == 0)
1433 		return (0);
1434 	return (NB_LEFT_IN_SECTOR(lof));
1435 }
1436 
1437 int
1438 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1439 {
1440 	buf_t	*bp;
1441 	ulong_t	actual;
1442 
1443 	ASSERT(ul->un_head_lof != ul->un_tail_lof);
1444 
1445 	/*
1446 	 * Check the log data doesn't go out of bounds
1447 	 */
1448 	if (ul->un_head_lof < ul->un_tail_lof) {
1449 		if (!WITHIN(*lofp, nb, ul->un_head_lof,
1450 		    (ul->un_tail_lof - ul->un_head_lof))) {
1451 			return (EIO);
1452 		}
1453 	} else {
1454 		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1455 		    (ul->un_head_lof - ul->un_tail_lof))) {
1456 			return (EIO);
1457 		}
1458 	}
1459 
1460 	while (nb) {
1461 		bp = get_read_bp(ul, *lofp);
1462 		if (bp->b_flags & B_ERROR) {
1463 			sema_v(&bp->b_sem);
1464 			return (EIO);
1465 		}
1466 		/*
1467 		 * out-of-seq idents means partial transaction
1468 		 *	panic, non-corrupting powerfail, ...
1469 		 */
1470 		if (!ldl_logscan_ident(ul, bp, *lofp)) {
1471 			sema_v(&bp->b_sem);
1472 			return (EIO);
1473 		}
1474 		/*
1475 		 * copy the header into the caller's buf
1476 		 */
1477 		actual = fetchbuf(ul, bp, va, nb, lofp);
1478 		if (va)
1479 			va += actual;
1480 		nb -= actual;
1481 	}
1482 	return (0);
1483 }
1484 
1485 void
1486 ldl_logscan_begin(ml_unit_t *ul)
1487 {
1488 	size_t	bufsize;
1489 
1490 	ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1491 
1492 	/*
1493 	 * logscan has begun
1494 	 */
1495 	ul->un_flags |= LDL_SCAN;
1496 
1497 	/*
1498 	 * reset the circular bufs
1499 	 */
1500 	bufsize = ldl_bufsize(ul);
1501 	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1502 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1503 
1504 	/*
1505 	 * set the tail to reflect a full log
1506 	 */
1507 	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1508 
1509 	if (ul->un_tail_lof < ul->un_bol_lof)
1510 		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1511 	if (ul->un_tail_lof >= ul->un_eol_lof)
1512 		ul->un_tail_lof = ul->un_bol_lof;
1513 
1514 	/*
1515 	 * un_tid is used during error processing; it is initialized to
1516 	 * the tid of the delta at un_head_lof;
1517 	 */
1518 	ul->un_tid = ul->un_head_tid;
1519 }
1520 
1521 void
1522 ldl_logscan_end(ml_unit_t *ul)
1523 {
1524 	size_t	bufsize;
1525 
1526 	/*
1527 	 * reset the circular bufs
1528 	 */
1529 	bufsize = ldl_bufsize(ul);
1530 	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1531 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1532 
1533 	/*
1534 	 * Done w/scan
1535 	 */
1536 	ul->un_flags &= ~LDL_SCAN;
1537 }
1538 
1539 int
1540 ldl_need_roll(ml_unit_t *ul)
1541 {
1542 	off_t	busybytes;
1543 	off_t	head;
1544 	off_t	tail;
1545 	off_t	bol;
1546 	off_t	eol;
1547 	off_t	nb;
1548 
1549 	/*
1550 	 * snapshot the log state
1551 	 */
1552 	head = ul->un_head_lof;
1553 	tail = ul->un_tail_lof;
1554 	bol = ul->un_bol_lof;
1555 	eol = ul->un_eol_lof;
1556 	nb = ul->un_logsize;
1557 
1558 	/*
1559 	 * compute number of busy (inuse) bytes
1560 	 */
1561 	if (head <= tail)
1562 		busybytes = tail - head;
1563 	else
1564 		busybytes = (eol - head) + (tail - bol);
1565 
1566 	/*
1567 	 * return TRUE if > 75% full
1568 	 */
1569 	return (busybytes > (nb - (nb >> 2)));
1570 }
1571 
1572 void
1573 ldl_seterror(ml_unit_t *ul, char *why)
1574 {
1575 	/*
1576 	 * already in error state; do nothing
1577 	 */
1578 	if (ul->un_flags & LDL_ERROR)
1579 		return;
1580 
1581 	ul->un_flags |= LDL_ERROR;	/* incore */
1582 	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */
1583 
1584 	/*
1585 	 * Commit to state sectors
1586 	 */
1587 	uniqtime(&ul->un_timestamp);
1588 	ldl_savestate(ul);
1589 
1590 	/* Pretty print */
1591 	cmn_err(CE_WARN, "%s", why);
1592 	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1593 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1594 	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1595 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1596 
1597 	/*
1598 	 * If we aren't in the middle of scan (aka snarf); tell ufs
1599 	 * to hard lock itself.
1600 	 */
1601 	if ((ul->un_flags & LDL_SCAN) == 0)
1602 		ufs_trans_onerror();
1603 }
1604 
1605 size_t
1606 ldl_bufsize(ml_unit_t *ul)
1607 {
1608 	size_t		bufsize;
1609 	extern uint32_t	ldl_minbufsize;
1610 
1611 	/*
1612 	 * initial guess is the maxtransfer value for this log device
1613 	 * 	increase if too small
1614 	 * 	decrease if too large
1615 	 */
1616 	bufsize = dbtob(btod(ul->un_maxtransfer));
1617 	if (bufsize < ldl_minbufsize)
1618 		bufsize = ldl_minbufsize;
1619 	if (bufsize > maxphys)
1620 		bufsize = maxphys;
1621 	if (bufsize > ul->un_maxtransfer)
1622 		bufsize = ul->un_maxtransfer;
1623 	return (bufsize);
1624 }
1625