xref: /titanic_41/usr/src/uts/common/fs/ufs/lufs_log.c (revision 4a7ceb24cfcc0a97f96d86cfe5852ae445b50e57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/vnode.h>
31 #include <sys/errno.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/kmem.h>
35 #include <sys/conf.h>
36 #include <sys/proc.h>
37 #include <sys/cmn_err.h>
38 #include <sys/fssnap_if.h>
39 #include <sys/fs/ufs_inode.h>
40 #include <sys/fs/ufs_filio.h>
41 #include <sys/fs/ufs_log.h>
42 #include <sys/fs/ufs_bio.h>
43 #include <sys/atomic.h>
44 
45 extern int		maxphys;
46 extern uint_t		bypass_snapshot_throttle_key;
47 
48 extern struct kmem_cache	*lufs_sv;
49 extern struct kmem_cache	*lufs_bp;
50 
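/*
 * Wait for exclusive access to a log buf.  If the buf completed with
 * B_ERROR, put the log unit into the error state with a message that
 * reflects whether it was a read or a write.
 */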
51 static void
52 makebusy(ml_unit_t *ul, buf_t *bp)
53 {
54 	sema_p(&bp->b_sem);
55 	if ((bp->b_flags & B_ERROR) == 0)
56 		return;
57 	if (bp->b_flags & B_READ)
58 		ldl_seterror(ul, "Error reading ufs log");
59 	else
60 		ldl_seterror(ul, "Error writing ufs log");
61 }
62 
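/*
 * b_iodone routine for log bufs.  Marks the buf B_DONE and then posts
 * b_sem for writes, or b_io to wake the thread blocked in trans_wait()
 * for reads.
 */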
63 static int
64 logdone(buf_t *bp)
65 {
66 	bp->b_flags |= B_DONE;
67 
68 	if (bp->b_flags & B_WRITE)
69 		sema_v(&bp->b_sem);
70 	else
71 		/* wakeup the thread waiting on this buf */
72 		sema_v(&bp->b_io);
73 	return (0);
74 }
75 
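/*
 * Completion routine for each child buf cloned by ldl_strategy().
 * atomic_add_long_nv() subtracts this child's byte count from the
 * parent's sv_nb_left; a non-zero result means sibling requests are
 * still outstanding, so only the child is freed.  The last child to
 * finish propagates any error and biodone()s the original buf.
 */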
76 static int
77 ldl_strategy_done(buf_t *cb)
78 {
79 	lufs_save_t	*sv;
80 	lufs_buf_t	*lbp;
81 	buf_t		*bp;
82 
83 	ASSERT(SEMA_HELD(&cb->b_sem));
84 	ASSERT((cb->b_flags & B_DONE) == 0);
85 
86 	/*
87 	 * Compute address of the ``save'' struct
88 	 */
89 	lbp = (lufs_buf_t *)cb;
90 	sv = (lufs_save_t *)lbp->lb_ptr;
91 
92 	if (cb->b_flags & B_ERROR)
93 		sv->sv_error = 1;
94 
95 	/*
96 	 * If this is the last request, release the resources and
97 	 * ``done'' the original buffer header.
98 	 */
99 	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
100 		kmem_cache_free(lufs_bp, lbp);
101 		return (1);
102 	}
103 	/* Propagate any errors back to the original buffer header */
104 	bp = sv->sv_bp;
105 	if (sv->sv_error)
106 		bp->b_flags |= B_ERROR;
107 	kmem_cache_free(lufs_bp, lbp);
108 	kmem_cache_free(lufs_sv, sv);
109 
110 	biodone(bp);
111 	return (0);
112 }
113 
114 /*
115  * Map the log logical block number to a physical disk block number
116  */
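/*
 * map_frag() binary searches the in-core extent array (un_ebp).  On a
 * hit it returns 0 with *pblkno set to the physical block and *pbcount
 * limited to the part of the request that fits within that extent; if
 * lblkno lies outside every extent it returns ENOENT with *pbcount set
 * to the full remaining count, and the caller errors that piece of the
 * I/O.
 */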
117 static int
118 map_frag(
119 	ml_unit_t	*ul,
120 	daddr_t		lblkno,
121 	size_t		bcount,
122 	daddr_t		*pblkno,
123 	size_t		*pbcount)
124 {
125 	ic_extent_t	*ext = ul->un_ebp->ic_extents;
126 	uint32_t	e = ul->un_ebp->ic_nextents;
127 	uint32_t	s = 0;
128 	uint32_t	i = e >> 1;
129 	uint32_t	lasti = i;
130 	uint32_t	bno_off;
131 
132 again:
133 	if (ext[i].ic_lbno <= lblkno) {
134 		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
135 			/* FOUND IT */
136 			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
137 			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
138 			*pblkno = ext[i].ic_pbno + bno_off;
139 			return (0);
140 		} else
141 			s = i;
142 	} else
143 		e = i;
144 	i = s + ((e - s) >> 1);
145 
146 	if (i == lasti) {
147 		*pbcount = bcount;
148 		return (ENOENT);
149 	}
150 	lasti = i;
151 
152 	goto again;
153 }
154 
155 /*
156  * The log is a set of extents (which typically will be only one, but
157  * may be more if the disk was close to full when the log was created)
158  * and hence the logical offsets into the log
159  * have to be translated into their real device locations before
160  * calling the device's strategy routine. The translation may result
161  * in several IO requests if this request spans extents.
162  */
163 void
164 ldl_strategy(ml_unit_t *ul, buf_t *pb)
165 {
166 	lufs_save_t	*sv;
167 	lufs_buf_t	*lbp;
168 	buf_t		*cb;
169 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
170 	daddr_t		lblkno, pblkno;
171 	size_t		nb_left, pbcount;
172 	off_t		offset;
173 	dev_t		dev	= ul->un_dev;
174 	int		error;
175 	int		read = pb->b_flags & B_READ;
176 
177 	/*
178 	 * Allocate and initialise the save structure.
179 	 */
180 	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
181 	sv->sv_error = 0;
182 	sv->sv_bp = pb;
183 	nb_left = pb->b_bcount;
184 	sv->sv_nb_left = nb_left;
185 
186 	lblkno = pb->b_blkno;
187 	offset = 0;
188 
189 	do {
190 		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
191 
192 		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
193 		bioinit(&lbp->lb_buf);
194 		lbp->lb_ptr = sv;
195 
196 		cb = bioclone(pb, offset, pbcount, dev,
197 		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
198 
199 		offset += pbcount;
200 		lblkno += btodb(pbcount);
201 		nb_left -= pbcount;
202 
203 		if (error) {
204 			cb->b_flags |= B_ERROR;
205 			cb->b_resid = cb->b_bcount;
206 			biodone(cb);
207 		} else {
208 			if (read) {
209 				logstats.ls_ldlreads.value.ui64++;
210 				ufsvfsp->vfs_iotstamp = lbolt;
211 				lwp_stat_update(LWP_STAT_INBLK, 1);
212 			} else {
213 				logstats.ls_ldlwrites.value.ui64++;
214 				lwp_stat_update(LWP_STAT_OUBLK, 1);
215 			}
216 
217 			/*
218 			 * write through the snapshot driver if necessary
219 			 * We do not want this write to be throttled because
220 			 * we are holding the un_log mutex here. If we
221 			 * are throttled in fssnap_translate, the fssnap_taskq
222 			 * thread which can wake us up can get blocked on
223 			 * the un_log mutex resulting in a deadlock.
224 			 */
225 			if (ufsvfsp->vfs_snapshot) {
226 				(void) tsd_set(bypass_snapshot_throttle_key,
227 				    (void *)1);
228 				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
229 
230 				(void) tsd_set(bypass_snapshot_throttle_key,
231 				    (void *)0);
232 			} else {
233 				(void) bdev_strategy(cb);
234 			}
235 		}
236 
237 	} while (nb_left);
238 }
239 
240 static void
241 writelog(ml_unit_t *ul, buf_t *bp)
242 {
243 	ASSERT(SEMA_HELD(&bp->b_sem));
244 
245 	/*
246 	 * This is really a B_ASYNC write but we want Presto to
247 	 * cache this write.  The iodone routine, logdone, processes
248 	 * the buf correctly.
249 	 */
250 	bp->b_flags = B_WRITE;
251 	bp->b_edev = ul->un_dev;
252 	bp->b_iodone = logdone;
253 
254 	/*
255 	 * return EIO for every IO if in hard error state
256 	 */
257 	if (ul->un_flags & LDL_ERROR) {
258 		bp->b_flags |= B_ERROR;
259 		bp->b_error = EIO;
260 		biodone(bp);
261 		return;
262 	}
263 
264 	ldl_strategy(ul, bp);
265 }
266 
267 static void
268 readlog(ml_unit_t *ul, buf_t *bp)
269 {
270 	ASSERT(SEMA_HELD(&bp->b_sem));
271 	ASSERT(bp->b_bcount);
272 
273 	bp->b_flags = B_READ;
274 	bp->b_edev = ul->un_dev;
275 	bp->b_iodone = logdone;
276 
277 	/* all IO returns errors when in error state */
278 	if (ul->un_flags & LDL_ERROR) {
279 		bp->b_flags |= B_ERROR;
280 		bp->b_error = EIO;
281 		biodone(bp);
282 		(void) trans_wait(bp);
283 		return;
284 	}
285 
286 	ldl_strategy(ul, bp);
287 
288 	if (trans_wait(bp))
289 		ldl_seterror(ul, "Error reading ufs log");
290 }
291 
292 /*
293  * NOTE: writers are single threaded thru the log layer.
294  * This means we can safely reference and change the cb and bp fields
295  * that ldl_read does not reference w/o holding the cb_rwlock or
296  * the bp makebusy lock.
297  */
298 static void
299 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
300 {
301 	buf_t		*newbp;
302 	cirbuf_t	*cb		= &ul->un_wrbuf;
303 
304 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
305 	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
306 
307 	/*
308 	 * async write the buf
309 	 */
310 	writelog(ul, bp);
311 
312 	/*
313 	 * no longer filling any buf
314 	 */
315 	cb->cb_dirty = NULL;
316 
317 	/*
318 	 * no extra buffer space; all done
319 	 */
320 	if (bp->b_bcount == bp->b_bufsize)
321 		return;
322 
323 	/*
324 	 * give extra buffer space to a new bp
325 	 * 	try to take buf off of free list
326 	 */
327 	if ((newbp = cb->cb_free) != NULL) {
328 		cb->cb_free = newbp->b_forw;
329 	} else {
330 		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
331 		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
332 		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
333 	}
334 	newbp->b_flags = 0;
335 	newbp->b_bcount = 0;
336 	newbp->b_file = NULL;
337 	newbp->b_offset = -1;
338 	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
339 	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
340 	bp->b_bufsize = bp->b_bcount;
341 
342 	/*
343 	 * lock out readers and put new buf at LRU position
344 	 */
345 	rw_enter(&cb->cb_rwlock, RW_WRITER);
346 	newbp->b_forw = bp->b_forw;
347 	newbp->b_back = bp;
348 	bp->b_forw->b_back = newbp;
349 	bp->b_forw = newbp;
350 	rw_exit(&cb->cb_rwlock);
351 }
352 
353 static void
354 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
355 {
356 	buf_t		*bp;
357 	off_t		elof	= lof + nb;
358 	off_t		buflof;
359 	off_t		bufelof;
360 
361 	/*
362 	 * discard all bufs that overlap the range (lof, lof + nb)
363 	 */
364 	rw_enter(&cb->cb_rwlock, RW_WRITER);
365 	bp = cb->cb_bp;
366 	do {
367 		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
368 			bp = bp->b_forw;
369 			continue;
370 		}
371 		buflof = dbtob(bp->b_blkno);
372 		bufelof = buflof + bp->b_bcount;
373 		if ((buflof < lof && bufelof <= lof) ||
374 		    (buflof >= elof && bufelof > elof)) {
375 			bp = bp->b_forw;
376 			continue;
377 		}
378 		makebusy(ul, bp);
379 		bp->b_flags = 0;
380 		bp->b_bcount = 0;
381 		sema_v(&bp->b_sem);
382 		bp = bp->b_forw;
383 	} while (bp != cb->cb_bp);
384 	rw_exit(&cb->cb_rwlock);
385 }
386 
387 /*
388  * NOTE: writers are single threaded thru the log layer.
389  * This means we can safely reference and change the cb and bp fields
390  * that ldl_read does not reference w/o holding the cb_rwlock or
391  * the bp makebusy lock.
392  */
393 static buf_t *
394 get_write_bp(ml_unit_t *ul)
395 {
396 	cirbuf_t	*cb = &ul->un_wrbuf;
397 	buf_t		*bp;
398 
399 	/*
400 	 * cb_dirty is the buffer we are currently filling; if any
401 	 * cb_dirty is the buffer we are currently filling, if any
402 	if ((bp = cb->cb_dirty) != NULL) {
403 		makebusy(ul, bp);
404 		return (bp);
405 	}
406 	/*
407 	 * discard any bp that overlaps the current tail since we are
408 	 * about to overwrite it.
409 	 */
410 	inval_range(ul, cb, ul->un_tail_lof, 1);
411 
412 	/*
413 	 * steal LRU buf
414 	 */
415 	rw_enter(&cb->cb_rwlock, RW_WRITER);
416 	bp = cb->cb_bp->b_forw;
417 	makebusy(ul, bp);
418 
419 	cb->cb_dirty = bp;
420 	cb->cb_bp = bp;
421 
422 	bp->b_flags = 0;
423 	bp->b_bcount = 0;
424 	bp->b_blkno = btodb(ul->un_tail_lof);
425 	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
426 	rw_exit(&cb->cb_rwlock);
427 
428 	/*
429 	 * NOTE:
430 	 *	1. un_tail_lof never addresses >= un_eol_lof
431 	 *	2. b_blkno + btodb(b_bufsize) may > un_eol_lof
432 	 *		this case is handled in storebuf
433 	 */
434 	return (bp);
435 }
436 
437 void
438 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
439 {
440 	int	i;
441 	buf_t	*bp;
442 
443 	/*
444 	 * Clear previous allocation
445 	 */
446 	if (cb->cb_nb)
447 		free_cirbuf(cb);
448 
449 	bzero(cb, sizeof (*cb));
450 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
451 
452 	rw_enter(&cb->cb_rwlock, RW_WRITER);
453 
454 	/*
455 	 * preallocate 3 bp's and put them on the free list.
456 	 */
457 	for (i = 0; i < 3; ++i) {
458 		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
459 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
460 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
461 		bp->b_offset = -1;
462 		bp->b_forw = cb->cb_free;
463 		cb->cb_free = bp;
464 	}
465 
466 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
467 	cb->cb_nb = bufsize;
468 
469 	/*
470 	 * first bp claims entire write buffer
471 	 */
472 	bp = cb->cb_free;
473 	cb->cb_free = bp->b_forw;
474 
475 	bp->b_forw = bp;
476 	bp->b_back = bp;
477 	cb->cb_bp = bp;
478 	bp->b_un.b_addr = cb->cb_va;
479 	bp->b_bufsize = cb->cb_nb;
480 
481 	rw_exit(&cb->cb_rwlock);
482 }
483 
484 void
485 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
486 {
487 	caddr_t	va;
488 	size_t	nb;
489 	buf_t	*bp;
490 
491 	/*
492 	 * Clear previous allocation
493 	 */
494 	if (cb->cb_nb)
495 		free_cirbuf(cb);
496 
497 	bzero(cb, sizeof (*cb));
498 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
499 
500 	rw_enter(&cb->cb_rwlock, RW_WRITER);
501 
502 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
503 	cb->cb_nb = bufsize;
504 
505 	/*
506 	 * preallocate N bufs that are hard-sized to blksize
507 	 *	in other words, the read buffer pool is a linked list
508 	 *	of statically sized bufs.
509 	 */
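	/*
	 * For illustration only: a bufsize of 96K with a blksize of 32K
	 * would produce three 32K bufs chained into the circular list
	 * built below.
	 */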
510 	va = cb->cb_va;
511 	while ((nb = bufsize) != 0) {
512 		if (nb > blksize)
513 			nb = blksize;
514 		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
515 		bzero(bp, sizeof (buf_t));
516 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
517 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
518 		bp->b_un.b_addr = va;
519 		bp->b_bufsize = nb;
520 		if (cb->cb_bp) {
521 			bp->b_forw = cb->cb_bp->b_forw;
522 			bp->b_back = cb->cb_bp;
523 			cb->cb_bp->b_forw->b_back = bp;
524 			cb->cb_bp->b_forw = bp;
525 		} else
526 			bp->b_forw = bp->b_back = bp;
527 		cb->cb_bp = bp;
528 		bufsize -= nb;
529 		va += nb;
530 	}
531 
532 	rw_exit(&cb->cb_rwlock);
533 }
534 
535 void
536 free_cirbuf(cirbuf_t *cb)
537 {
538 	buf_t	*bp;
539 
540 	if (cb->cb_nb == 0)
541 		return;
542 
543 	rw_enter(&cb->cb_rwlock, RW_WRITER);
544 	ASSERT(cb->cb_dirty == NULL);
545 
546 	/*
547 	 * free the active bufs
548 	 */
549 	while ((bp = cb->cb_bp) != NULL) {
550 		if (bp == bp->b_forw)
551 			cb->cb_bp = NULL;
552 		else
553 			cb->cb_bp = bp->b_forw;
554 		bp->b_back->b_forw = bp->b_forw;
555 		bp->b_forw->b_back = bp->b_back;
556 		sema_destroy(&bp->b_sem);
557 		sema_destroy(&bp->b_io);
558 		kmem_free(bp, sizeof (buf_t));
559 	}
560 
561 	/*
562 	 * free the free bufs
563 	 */
564 	while ((bp = cb->cb_free) != NULL) {
565 		cb->cb_free = bp->b_forw;
566 		sema_destroy(&bp->b_sem);
567 		sema_destroy(&bp->b_io);
568 		kmem_free(bp, sizeof (buf_t));
569 	}
570 	kmem_free(cb->cb_va, cb->cb_nb);
571 	cb->cb_va = NULL;
572 	cb->cb_nb = 0;
573 	rw_exit(&cb->cb_rwlock);
574 	rw_destroy(&cb->cb_rwlock);
575 }
576 
577 static int
578 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
579 {
580 	off_t	blof	= dbtob(blkno);
581 
582 	return ((lof >= blof) && (lof < (blof + bcount)));
583 }
584 
585 static buf_t *
586 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
587 {
588 	buf_t *bp;
589 
590 	/*
591 	 * find a buf that contains the offset lof
592 	 */
593 	rw_enter(&cb->cb_rwlock, RW_READER);
594 	bp = cb->cb_bp;
595 	do {
596 		if (bp->b_bcount &&
597 		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
598 			makebusy(ul, bp);
599 			rw_exit(&cb->cb_rwlock);
600 			return (bp);
601 		}
602 		bp = bp->b_forw;
603 	} while (bp != cb->cb_bp);
604 	rw_exit(&cb->cb_rwlock);
605 
606 	return (NULL);
607 }
608 
609 static off_t
610 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
611 {
612 	buf_t	*bp, *bpend;
613 	off_t	rlof;
614 
615 	/*
616 	 * we mustn't:
617 	 *	o read past eol
618 	 *	o read past the tail
619 	 *	o read data that may be being written.
620 	 */
621 	rw_enter(&cb->cb_rwlock, RW_READER);
622 	bpend = bp = cb->cb_bp->b_forw;
623 	rlof = ul->un_tail_lof;
624 	do {
625 		if (bp->b_bcount) {
626 			rlof = dbtob(bp->b_blkno);
627 			break;
628 		}
629 		bp = bp->b_forw;
630 	} while (bp != bpend);
631 	rw_exit(&cb->cb_rwlock);
632 
633 	if (lof <= rlof)
634 		/* lof is prior to the range represented by the write buf */
635 		return (rlof);
636 	else
637 		/* lof follows the range represented by the write buf */
638 		return ((off_t)ul->un_eol_lof);
639 }
640 
641 static buf_t *
642 get_read_bp(ml_unit_t *ul, off_t lof)
643 {
644 	cirbuf_t	*cb;
645 	buf_t		*bp;
646 	off_t		rlof;
647 
648 	/*
649 	 * retrieve as much data as possible from the incore buffers
650 	 */
651 	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
652 		logstats.ls_lreadsinmem.value.ui64++;
653 		return (bp);
654 	}
655 	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
656 		logstats.ls_lreadsinmem.value.ui64++;
657 		return (bp);
658 	}
659 
660 	/*
661 	 * steal the LRU buf
662 	 */
663 	cb = &ul->un_rdbuf;
664 	rw_enter(&cb->cb_rwlock, RW_WRITER);
665 	bp = cb->cb_bp->b_forw;
666 	makebusy(ul, bp);
667 	bp->b_flags = 0;
668 	bp->b_bcount = 0;
669 	cb->cb_bp = bp;
670 	rw_exit(&cb->cb_rwlock);
671 
672 	/*
673 	 * don't read past the tail or the end-of-log
674 	 */
675 	bp->b_blkno = btodb(lof);
676 	lof = dbtob(bp->b_blkno);
677 	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
678 	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
679 	readlog(ul, bp);
680 	return (bp);
681 }
682 
683 /*
684  * NOTE: writers are single threaded thru the log layer.
685  * This means we can safely reference and change the cb and bp fields
686  * that ldl_read does not reference w/o holding the cb_rwlock or
687  * the bp makebusy lock.
688  */
689 static int
690 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
691 {
692 	buf_t	*bpforw	= bp->b_forw;
693 
694 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
695 
696 	/*
697 	 * there is no `next' bp; do nothing
698 	 */
699 	if (bpforw == bp)
700 		return (0);
701 
702 	/*
703 	 * buffer space is not adjacent; do nothing
704 	 */
705 	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
706 		return (0);
707 
708 	/*
709 	 * locking protocol requires giving up any bp locks before
710 	 * acquiring cb_rwlock.  This is okay because we hold
711 	 * un_log_mutex.
712 	 */
713 	sema_v(&bp->b_sem);
714 
715 	/*
716 	 * lock out ldl_read
717 	 */
718 	rw_enter(&cb->cb_rwlock, RW_WRITER);
719 
720 	/*
721 	 * wait for current IO to finish w/next bp, if necessary
722 	 */
723 	makebusy(ul, bpforw);
724 
725 	/*
726 	 * free the next bp and steal its space
727 	 */
728 	bp->b_forw = bpforw->b_forw;
729 	bpforw->b_forw->b_back = bp;
730 	bp->b_bufsize += bpforw->b_bufsize;
731 	sema_v(&bpforw->b_sem);
732 	bpforw->b_forw = cb->cb_free;
733 	cb->cb_free = bpforw;
734 	makebusy(ul, bp);
735 	rw_exit(&cb->cb_rwlock);
736 
737 	return (1);
738 }
739 
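/*
 * Copy nb bytes at va into the current write buf.  Every DEV_BSIZE
 * log sector ends with a sect_trailer_t (transaction id and sector
 * ident), so data is copied in NB_LEFT_IN_SECTOR()-sized pieces and a
 * trailer is appended each time a sector fills.  The tail wraps back
 * to un_bol_lof at end-of-log, and the buf is pushed (async written)
 * when the log wraps or the buf is full and cannot be extended.
 * Returns the number of bytes actually stored.
 */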
740 static size_t
741 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
742 {
743 	size_t		copy_nb;
744 	size_t		nb_in_sec;
745 	sect_trailer_t	*st;
746 	size_t		nb_left = nb;
747 	cirbuf_t	*cb	= &ul->un_wrbuf;
748 
749 again:
750 	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
751 	copy_nb = MIN(nb_left, nb_in_sec);
752 
753 	ASSERT(copy_nb);
754 
755 	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
756 	bp->b_bcount += copy_nb;
757 	va += copy_nb;
758 	nb_left -= copy_nb;
759 	ul->un_tail_lof += copy_nb;
760 
761 	if ((nb_in_sec -= copy_nb) == 0) {
762 		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
763 
764 		st->st_tid = ul->un_logmap->mtm_tid;
765 		st->st_ident = ul->un_tail_ident++;
766 		bp->b_bcount += sizeof (sect_trailer_t);
767 		ul->un_tail_lof += sizeof (sect_trailer_t);
768 		/*
769 		 * log wrapped; async write this bp
770 		 */
771 		if (ul->un_tail_lof == ul->un_eol_lof) {
772 			ul->un_tail_lof = ul->un_bol_lof;
773 			push_dirty_bp(ul, bp);
774 			return (nb - nb_left);
775 		}
776 		/*
777 		 * out of bp space; get more or async write buf
778 		 */
779 		if (bp->b_bcount == bp->b_bufsize) {
780 			if (!extend_write_bp(ul, cb, bp)) {
781 				push_dirty_bp(ul, bp);
782 				return (nb - nb_left);
783 			}
784 		}
785 	}
786 	if (nb_left)
787 		goto again;
788 
789 	sema_v(&bp->b_sem);
790 	return (nb);
791 }
792 
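/*
 * Zero-fill the part of the caller's buffer that overlaps an all-zero
 * (DT_ABZERO) delta; such deltas carry no data in the log (see
 * ldl_read()).
 */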
793 static void
794 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
795 {
796 	offset_t	src_mof	= me->me_mof;
797 	size_t		src_nb	= me->me_nb;
798 
799 	if (src_mof > dst_mof) {
800 		ASSERT(src_mof < (dst_mof + dst_nb));
801 		dst_va += (src_mof - dst_mof);
802 		dst_nb -= (src_mof - dst_mof);
803 	} else {
804 		ASSERT(dst_mof < (src_mof + src_nb));
805 		src_nb -= (dst_mof - src_mof);
806 	}
807 
808 	src_nb = MIN(src_nb, dst_nb);
809 	ASSERT(src_nb);
810 	bzero(dst_va, src_nb);
811 }
812 
813 /*
814  * dst_va == NULL means don't copy anything
815  */
816 static ulong_t
817 fetchbuf(
818 	ml_unit_t *ul,
819 	buf_t *bp,
820 	caddr_t dst_va,
821 	size_t dst_nb,
822 	off_t *dst_lofp)
823 {
824 	caddr_t	copy_va;
825 	size_t	copy_nb;
826 	size_t	nb_sec;
827 	off_t	dst_lof		= *dst_lofp;
828 	ulong_t	sav_dst_nb	= dst_nb;
829 	ulong_t	src_nb		= bp->b_bcount;
830 	off_t	src_lof		= dbtob(bp->b_blkno);
831 	off_t	src_elof	= src_lof + src_nb;
832 	caddr_t	src_va		= bp->b_un.b_addr;
833 
834 	/*
835 	 * copy from bp to dst_va
836 	 */
837 	while (dst_nb) {
838 		/*
839 		 * compute address within bp
840 		 */
841 		copy_va = src_va + (dst_lof - src_lof);
842 
843 		/*
844 		 * adjust copy size to amount of data in bp
845 		 */
846 		copy_nb = MIN(dst_nb, src_elof - dst_lof);
847 
848 		/*
849 		 * adjust copy size to amount of data in sector
850 		 */
851 		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
852 		copy_nb = MIN(copy_nb, nb_sec);
853 
854 		/*
855 		 * dst_va == NULL means don't do copy (see logseek())
856 		 */
857 		if (dst_va) {
858 			bcopy(copy_va, dst_va, copy_nb);
859 			dst_va += copy_nb;
860 		}
861 		dst_lof += copy_nb;
862 		dst_nb -= copy_nb;
863 		nb_sec -= copy_nb;
864 
865 		/*
866 		 * advance over sector trailer
867 		 */
868 		if (nb_sec == 0)
869 			dst_lof += sizeof (sect_trailer_t);
870 
871 		/*
872 		 * exhausted buffer
873 		 *	return current lof for next read
874 		 */
875 		if (dst_lof == src_elof) {
876 			sema_v(&bp->b_sem);
877 			if (dst_lof == ul->un_eol_lof)
878 				dst_lof = ul->un_bol_lof;
879 			*dst_lofp = dst_lof;
880 			return (sav_dst_nb - dst_nb);
881 		}
882 	}
883 
884 	/*
885 	 * copy complete - return current lof
886 	 */
887 	sema_v(&bp->b_sem);
888 	*dst_lofp = dst_lof;
889 	return (sav_dst_nb);
890 }
891 
892 void
893 ldl_round_commit(ml_unit_t *ul)
894 {
895 	int		wrapped;
896 	buf_t		*bp;
897 	sect_trailer_t	*st;
898 	size_t		bcount;
899 	cirbuf_t	*cb	= &ul->un_wrbuf;
900 
901 	/*
902 	 * if nothing to write; then do nothing
903 	 */
904 	if ((bp = cb->cb_dirty) == NULL)
905 		return;
906 	makebusy(ul, bp);
907 
908 	/*
909 	 * round up to sector boundary and set new tail
910 	 *	don't readjust st_ident if buf is already rounded
911 	 */
912 	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
913 	if (bcount == bp->b_bcount) {
914 		sema_v(&bp->b_sem);
915 		return;
916 	}
917 	bp->b_bcount = bcount;
918 	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
919 	wrapped = 0;
920 	if (ul->un_tail_lof == ul->un_eol_lof) {
921 		ul->un_tail_lof = ul->un_bol_lof;
922 		++wrapped;
923 	}
924 	ASSERT(ul->un_tail_lof != ul->un_head_lof);
925 
926 	/*
927 	 * fix up the sector trailer
928 	 */
929 	/* LINTED */
930 	st = (sect_trailer_t *)
931 	    ((bp->b_un.b_addr + bcount) - sizeof (*st));
932 	st->st_tid = ul->un_logmap->mtm_tid;
933 	st->st_ident = ul->un_tail_ident++;
934 
935 	/*
936 	 * if tail wrapped or we have exhausted this buffer
937 	 *	async write the buffer
938 	 */
939 	if (wrapped || bcount == bp->b_bufsize)
940 		push_dirty_bp(ul, bp);
941 	else
942 		sema_v(&bp->b_sem);
943 }
944 
945 void
946 ldl_push_commit(ml_unit_t *ul)
947 {
948 	buf_t		*bp;
949 	cirbuf_t	*cb	= &ul->un_wrbuf;
950 
951 	/*
952 	 * if nothing to write; then do nothing
953 	 */
954 	if ((bp = cb->cb_dirty) == NULL)
955 		return;
956 	makebusy(ul, bp);
957 	push_dirty_bp(ul, bp);
958 }
959 
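/*
 * Return non-zero when the outstanding reservation exceeds roughly 75%
 * of un_maxresv, i.e. when the caller should force a commit.
 */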
960 int
961 ldl_need_commit(ml_unit_t *ul)
962 {
963 	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
964 }
965 
966 int
967 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
968 {
969 	off_t	nfb;
970 	off_t	nb;
971 
972 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
973 
974 	/*
975 	 * Add up the size used by the deltas
976 	 * round nb up to a sector length plus an extra sector
977 	 *	w/o the extra sector we couldn't distinguish
978 	 *	a full log from an empty log (both would have head == tail)
979 	 */
980 	for (nb = DEV_BSIZE; me; me = me->me_hash) {
981 		nb += sizeof (struct delta);
982 		if (me->me_dt != DT_CANCEL)
983 			nb += me->me_nb;
984 	}
985 	nb = P2ROUNDUP(nb, DEV_BSIZE);
986 
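	/*
	 * compute the number of free bytes: when head <= tail the free
	 * space is the two pieces outside [head, tail) (bol..head and
	 * tail..eol); otherwise it is the single span tail..head
	 */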
987 	if (ul->un_head_lof <= ul->un_tail_lof)
988 		nfb = (ul->un_head_lof - ul->un_bol_lof) +
989 		    (ul->un_eol_lof - ul->un_tail_lof);
990 	else
991 		nfb = ul->un_head_lof - ul->un_tail_lof;
992 
993 	return (nb < nfb);
994 }
995 
996 void
997 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
998 {
999 	buf_t		*bp;
1000 	caddr_t		va;
1001 	size_t		nb;
1002 	size_t		actual;
1003 
1004 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1005 
1006 	/* Write the delta */
1007 
1008 	nb = sizeof (struct delta);
1009 	va = (caddr_t)&me->me_delta;
1010 	bp = get_write_bp(ul);
1011 
1012 	while (nb) {
1013 		if (ul->un_flags & LDL_ERROR) {
1014 			sema_v(&bp->b_sem);
1015 			return;
1016 		}
1017 		actual = storebuf(ul, bp, va, nb);
1018 		ASSERT(actual);
1019 		va += actual;
1020 		nb -= actual;
1021 		if (nb)
1022 			bp = get_write_bp(ul);
1023 	}
1024 
1025 	/* If a commit, cancel, or 0's; we're almost done */
1026 	switch (me->me_dt) {
1027 		case DT_COMMIT:
1028 		case DT_CANCEL:
1029 		case DT_ABZERO:
1030 			/* roll needs to know where the next delta will go */
1031 			me->me_lof = ul->un_tail_lof;
1032 			return;
1033 		default:
1034 			break;
1035 	}
1036 
1037 	/* Now write the data */
1038 
1039 	ASSERT(me->me_nb != 0);
1040 
1041 	nb = me->me_nb;
1042 	va = (me->me_mof - bufmof) + bufp;
1043 	bp = get_write_bp(ul);
1044 
1045 	/* Save where we will put the data */
1046 	me->me_lof = ul->un_tail_lof;
1047 
1048 	while (nb) {
1049 		if (ul->un_flags & LDL_ERROR) {
1050 			sema_v(&bp->b_sem);
1051 			return;
1052 		}
1053 		actual = storebuf(ul, bp, va, nb);
1054 		ASSERT(actual);
1055 		va += actual;
1056 		nb -= actual;
1057 		if (nb)
1058 			bp = get_write_bp(ul);
1059 	}
1060 }
1061 
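/*
 * Wait for all outstanding log writes to complete.  Any buf that is
 * not yet B_DONE is acquired with makebusy(), which blocks until
 * logdone() posts b_sem, and is then released.
 */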
1062 void
1063 ldl_waito(ml_unit_t *ul)
1064 {
1065 	buf_t		*bp;
1066 	cirbuf_t	*cb	= &ul->un_wrbuf;
1067 
1068 	rw_enter(&cb->cb_rwlock, RW_WRITER);
1069 	/*
1070 	 * wait on them
1071 	 */
1072 	bp = cb->cb_bp;
1073 	do {
1074 		if ((bp->b_flags & B_DONE) == 0) {
1075 			makebusy(ul, bp);
1076 			sema_v(&bp->b_sem);
1077 		}
1078 		bp = bp->b_forw;
1079 	} while (bp != cb->cb_bp);
1080 	rw_exit(&cb->cb_rwlock);
1081 }
1082 
1083 /*
1084  * seek nb bytes from location lof
1085  */
1086 static int
1087 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1088 {
1089 	buf_t	*bp;
1090 	ulong_t	actual;
1091 
1092 	while (nb) {
1093 		bp = get_read_bp(ul, lof);
1094 		if (bp->b_flags & B_ERROR) {
1095 			sema_v(&bp->b_sem);
1096 			return (EIO);
1097 		}
1098 		actual = fetchbuf(ul, bp, NULL, nb, &lof);
1099 		ASSERT(actual);
1100 		nb -= actual;
1101 	}
1102 	*lofp = lof;
1103 	ASSERT(nb == 0);
1104 	return (0);
1105 }
1106 
1107 int
1108 ldl_read(
1109 	ml_unit_t *ul,		/* Log unit */
1110 	caddr_t va,		/* address of buffer to read into */
1111 	offset_t mof,		/* mof of buffer */
1112 	off_t nb,		/* length of buffer */
1113 	mapentry_t *me)		/* Map entry list */
1114 {
1115 	buf_t	*bp;
1116 	crb_t   *crb;
1117 	caddr_t	rva;			/* address to read into */
1118 	size_t	rnb;			/* # of bytes to read */
1119 	off_t	lof;			/* log device offset to read from */
1120 	off_t   skip;
1121 	ulong_t	actual;
1122 	int	error;
1123 	caddr_t	eva	= va + nb;	/* end of buffer */
1124 
1125 	for (; me; me = me->me_agenext) {
1126 		ASSERT(me->me_dt != DT_CANCEL);
1127 
1128 		/*
1129 		 * check for a cached roll buffer
1130 		 */
1131 		crb = me->me_crb;
1132 		if (crb) {
1133 			if (mof > crb->c_mof) {
1134 				/*
1135 				 * This mapentry overlaps with the beginning of
1136 				 * the supplied buffer
1137 				 */
1138 				skip = mof - crb->c_mof;
1139 				bcopy(crb->c_buf + skip, va,
1140 				    MIN(nb, crb->c_nb - skip));
1141 			} else {
1142 				/*
1143 				 * This mapentry starts at or after
1144 				 * the supplied buffer.
1145 				 */
1146 				skip = crb->c_mof - mof;
1147 				bcopy(crb->c_buf, va + skip,
1148 				    MIN(crb->c_nb, nb - skip));
1149 			}
1150 			logstats.ls_lreadsinmem.value.ui64++;
1151 			continue;
1152 		}
1153 
1154 		/*
1155 		 * check for a delta full of zeroes - there's no log data
1156 		 */
1157 		if (me->me_dt == DT_ABZERO) {
1158 			fetchzeroes(va, mof, nb, me);
1159 			continue;
1160 		}
1161 
1162 		if (mof > me->me_mof) {
1163 			rnb = (size_t)(mof - me->me_mof);
1164 			error = logseek(ul, me->me_lof, rnb, &lof);
1165 			if (error)
1166 				return (EIO);
1167 			rva = va;
1168 			rnb = me->me_nb - rnb;
1169 			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1170 		} else {
1171 			lof = me->me_lof;
1172 			rva = (me->me_mof - mof) + va;
1173 			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1174 		}
1175 
1176 		while (rnb) {
1177 			bp = get_read_bp(ul, lof);
1178 			if (bp->b_flags & B_ERROR) {
1179 				sema_v(&bp->b_sem);
1180 				return (EIO);
1181 			}
1182 			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1183 			    (bp != ul->un_wrbuf.cb_dirty));
1184 			actual = fetchbuf(ul, bp, rva, rnb, &lof);
1185 			ASSERT(actual);
1186 			rva += actual;
1187 			rnb -= actual;
1188 		}
1189 	}
1190 	return (0);
1191 }
1192 
1193 void
1194 ldl_savestate(ml_unit_t *ul)
1195 {
1196 	int		error;
1197 	buf_t		*bp	= ul->un_bp;
1198 	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
1199 	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);
1200 
1201 #if	DEBUG
1202 	/*
1203 	 * Scan test is running; don't update intermediate state
1204 	 */
1205 	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1206 		return;
1207 #endif	/* DEBUG */
1208 
1209 	mutex_enter(&ul->un_state_mutex);
1210 	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1211 	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1212 	bcopy(ud, ud2, sizeof (*ud));
1213 
1214 	/* If a snapshot is enabled, write through the snapshot driver. */
1215 	if (ul->un_ufsvfs->vfs_snapshot)
1216 		UFS_BWRITE2(ul->un_ufsvfs, bp);
1217 	else
1218 		BWRITE2(bp);
1219 	logstats.ls_ldlwrites.value.ui64++;
1220 	error = bp->b_flags & B_ERROR;
1221 	mutex_exit(&ul->un_state_mutex);
1222 	if (error)
1223 		ldl_seterror(ul, "Error writing ufs log state");
1224 }
1225 
1226 /*
1227  * The head will be set to (new_lof - header) since ldl_sethead is
1228  * called with the new_lof of the data portion of a delta.
1229  */
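/*
 * Head and tail idents count sectors: each sector trailer carries an
 * st_ident that increases by one per DEV_BSIZE sector from the head
 * (wrapping with the log), so the code below advances new_ident by the
 * number of sectors the head has moved.
 */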
1230 void
1231 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1232 {
1233 	off_t		nb;
1234 	off_t		new_lof;
1235 	uint32_t	new_ident;
1236 	daddr_t		beg_blkno;
1237 	daddr_t		end_blkno;
1238 	struct timeval	tv;
1239 
1240 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1241 
1242 	if (data_lof == -1) {
1243 		/* log is empty */
1244 		uniqtime(&tv);
1245 		if (tv.tv_usec == ul->un_head_ident) {
1246 			tv.tv_usec++;
1247 		}
1248 		last_loghead_ident = tv.tv_usec;
1249 		new_ident = tv.tv_usec;
1250 		new_lof = ul->un_tail_lof;
1251 
1252 	} else {
1253 		/* compute header's lof */
1254 		new_ident = ul->un_head_ident;
1255 		new_lof = data_lof - sizeof (struct delta);
1256 
1257 		/* whoops, header spans sectors; subtract out sector trailer */
1258 		if (btodb(new_lof) != btodb(data_lof))
1259 			new_lof -= sizeof (sect_trailer_t);
1260 
1261 		/* whoops, header wrapped the log; go to last sector */
1262 		if (new_lof < ul->un_bol_lof) {
1263 			/* sector offset */
1264 			new_lof -= dbtob(btodb(new_lof));
1265 			/* add to last sector's lof */
1266 			new_lof += (ul->un_eol_lof - DEV_BSIZE);
1267 		}
1268 		ul->un_head_tid = tid;
1269 	}
1270 
1271 	/*
1272 	 * check for nop
1273 	 */
1274 	if (new_lof == ul->un_head_lof)
1275 		return;
1276 
1277 	/*
1278 	 * invalidate the affected bufs and calculate new ident
1279 	 */
1280 	if (new_lof > ul->un_head_lof) {
1281 		nb = new_lof - ul->un_head_lof;
1282 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1283 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1284 
1285 		end_blkno = btodb(new_lof);
1286 		beg_blkno = btodb(ul->un_head_lof);
1287 		new_ident += (end_blkno - beg_blkno);
1288 	} else {
1289 		nb = ul->un_eol_lof - ul->un_head_lof;
1290 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1291 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1292 
1293 		end_blkno = btodb(ul->un_eol_lof);
1294 		beg_blkno = btodb(ul->un_head_lof);
1295 		new_ident += (end_blkno - beg_blkno);
1296 
1297 		nb = new_lof - ul->un_bol_lof;
1298 		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1299 		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1300 
1301 		end_blkno = btodb(new_lof);
1302 		beg_blkno = btodb(ul->un_bol_lof);
1303 		new_ident += (end_blkno - beg_blkno);
1304 	}
1305 	/*
1306 	 * don't update the head if there has been an error
1307 	 */
1308 	if (ul->un_flags & LDL_ERROR)
1309 		return;
1310 
1311 	/* Fix up the head and ident */
1312 	ASSERT(new_lof >= ul->un_bol_lof);
1313 	ul->un_head_lof = new_lof;
1314 	ul->un_head_ident = new_ident;
1315 	if (data_lof == -1) {
1316 		ul->un_tail_ident = ul->un_head_ident;
1317 	}
1318 
1319 
1320 	/* Commit to the database */
1321 	ldl_savestate(ul);
1322 
1323 	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1324 	    ldl_sethead_debug(ul));
1325 }
1326 
1327 /*
1328  * The tail will be set to the sector following lof+nb
1329  *	lof + nb == size of the last delta + commit record
1330  *	this function is called once after the log scan has completed.
1331  */
1332 void
1333 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1334 {
1335 	off_t		new_lof;
1336 	uint32_t	new_ident;
1337 	daddr_t		beg_blkno;
1338 	daddr_t		end_blkno;
1339 	struct timeval	tv;
1340 
1341 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1342 
1343 	if (lof == -1) {
1344 		uniqtime(&tv);
1345 		if (tv.tv_usec == ul->un_head_ident) {
1346 			tv.tv_usec++;
1347 		}
1348 		last_loghead_ident = tv.tv_usec;
1349 		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1350 		ul->un_head_lof = ul->un_tail_lof;
1351 		ul->un_head_ident = tv.tv_usec;
1352 		ul->un_tail_ident = ul->un_head_ident;
1353 
1354 		/* Commit to the database */
1355 		ldl_savestate(ul);
1356 
1357 		return;
1358 	}
1359 
1360 	/*
1361 	 * new_lof is the offset of the sector following the last commit
1362 	 */
1363 	(void) logseek(ul, lof, nb, &new_lof);
1364 	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1365 
1366 	/*
1367 	 * calculate new ident
1368 	 */
1369 	if (new_lof > ul->un_head_lof) {
1370 		end_blkno = btodb(new_lof);
1371 		beg_blkno = btodb(ul->un_head_lof);
1372 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1373 	} else {
1374 		end_blkno = btodb(ul->un_eol_lof);
1375 		beg_blkno = btodb(ul->un_head_lof);
1376 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1377 
1378 		end_blkno = btodb(new_lof);
1379 		beg_blkno = btodb(ul->un_bol_lof);
1380 		new_ident += (end_blkno - beg_blkno);
1381 	}
1382 
1383 	/* Fix up the tail and ident */
1384 	ul->un_tail_lof = new_lof;
1385 	ul->un_tail_ident = new_ident;
1386 
1387 	/* Commit to the database */
1388 	ldl_savestate(ul);
1389 }
1390 
1391 /*
1392  * LOGSCAN STUFF
1393  */
1394 static int
1395 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1396 {
1397 	ulong_t		ident;
1398 	size_t		nblk, i;
1399 	sect_trailer_t	*st;
1400 
1401 	/*
1402 	 * compute ident for first sector in the buffer
1403 	 */
1404 	ident = ul->un_head_ident;
1405 	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1406 		ident += (bp->b_blkno - btodb(ul->un_head_lof));
1407 	} else {
1408 		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1409 		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1410 	}
1411 	/*
1412 	 * truncate the buffer down to the last valid sector
1413 	 */
1414 	nblk = btodb(bp->b_bcount);
1415 	bp->b_bcount = 0;
1416 	/* LINTED */
1417 	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1418 	for (i = 0; i < nblk; ++i) {
1419 		if (st->st_ident != ident)
1420 			break;
1421 
1422 		/* remember last valid tid for ldl_logscan_error() */
1423 		ul->un_tid = st->st_tid;
1424 
1425 		/* LINTED */
1426 		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1427 		++ident;
1428 		bp->b_bcount += DEV_BSIZE;
1429 	}
1430 	/*
1431 	 * make sure that lof is still within range
1432 	 */
1433 	return (within_range(lof, bp->b_blkno, bp->b_bcount));
1434 }
1435 
1436 ulong_t
1437 ldl_logscan_nbcommit(off_t lof)
1438 {
1439 	/*
1440 	 * lof is the offset following the commit header.  However,
1441 	 * if the commit header fell on the end-of-sector, then lof
1442 	 * has already been advanced to the beginning of the next
1443 	 * sector.  So do nothing.  Otherwise, return the remaining
1444 	 * bytes in the sector.
1445 	 */
1446 	if ((lof & (DEV_BSIZE - 1)) == 0)
1447 		return (0);
1448 	return (NB_LEFT_IN_SECTOR(lof));
1449 }
1450 
1451 int
1452 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1453 {
1454 	buf_t	*bp;
1455 	ulong_t	actual;
1456 
1457 	ASSERT(ul->un_head_lof != ul->un_tail_lof);
1458 
1459 	/*
1460 	 * Check the log data doesn't go out of bounds
1461 	 */
1462 	if (ul->un_head_lof < ul->un_tail_lof) {
1463 		if (!WITHIN(*lofp, nb, ul->un_head_lof,
1464 		    (ul->un_tail_lof - ul->un_head_lof))) {
1465 			return (EIO);
1466 		}
1467 	} else {
1468 		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1469 		    (ul->un_head_lof - ul->un_tail_lof))) {
1470 			return (EIO);
1471 		}
1472 	}
1473 
1474 	while (nb) {
1475 		bp = get_read_bp(ul, *lofp);
1476 		if (bp->b_flags & B_ERROR) {
1477 			sema_v(&bp->b_sem);
1478 			return (EIO);
1479 		}
1480 		/*
1481 		 * out-of-seq idents means partial transaction
1482 		 *	panic, non-corrupting powerfail, ...
1483 		 */
1484 		if (!ldl_logscan_ident(ul, bp, *lofp)) {
1485 			sema_v(&bp->b_sem);
1486 			return (EIO);
1487 		}
1488 		/*
1489 		 * copy the header into the caller's buf
1490 		 */
1491 		actual = fetchbuf(ul, bp, va, nb, lofp);
1492 		if (va)
1493 			va += actual;
1494 		nb -= actual;
1495 	}
1496 	return (0);
1497 }
1498 
1499 void
1500 ldl_logscan_begin(ml_unit_t *ul)
1501 {
1502 	size_t	bufsize;
1503 
1504 	ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1505 
1506 	/*
1507 	 * logscan has begun
1508 	 */
1509 	ul->un_flags |= LDL_SCAN;
1510 
1511 	/*
1512 	 * reset the circular bufs
1513 	 */
1514 	bufsize = ldl_bufsize(ul);
1515 	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1516 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1517 
1518 	/*
1519 	 * set the tail to reflect a full log
1520 	 */
1521 	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1522 
1523 	if (ul->un_tail_lof < ul->un_bol_lof)
1524 		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1525 	if (ul->un_tail_lof >= ul->un_eol_lof)
1526 		ul->un_tail_lof = ul->un_bol_lof;
1527 
1528 	/*
1529 	 * un_tid is used during error processing; it is initialized to
1530 	 * the tid of the delta at un_head_lof.
1531 	 */
1532 	ul->un_tid = ul->un_head_tid;
1533 }
1534 
1535 void
1536 ldl_logscan_end(ml_unit_t *ul)
1537 {
1538 	size_t	bufsize;
1539 
1540 	/*
1541 	 * reset the circular bufs
1542 	 */
1543 	bufsize = ldl_bufsize(ul);
1544 	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1545 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1546 
1547 	/*
1548 	 * Done w/scan
1549 	 */
1550 	ul->un_flags &= ~LDL_SCAN;
1551 }
1552 
1553 int
1554 ldl_need_roll(ml_unit_t *ul)
1555 {
1556 	off_t	busybytes;
1557 	off_t	head;
1558 	off_t	tail;
1559 	off_t	bol;
1560 	off_t	eol;
1561 	off_t	nb;
1562 
1563 	/*
1564 	 * snapshot the log state
1565 	 */
1566 	head = ul->un_head_lof;
1567 	tail = ul->un_tail_lof;
1568 	bol = ul->un_bol_lof;
1569 	eol = ul->un_eol_lof;
1570 	nb = ul->un_logsize;
1571 
1572 	/*
1573 	 * compute number of busy (inuse) bytes
1574 	 */
1575 	if (head <= tail)
1576 		busybytes = tail - head;
1577 	else
1578 		busybytes = (eol - head) + (tail - bol);
1579 
1580 	/*
1581 	 * return TRUE if > 75% full
1582 	 */
1583 	return (busybytes > (nb - (nb >> 2)));
1584 }
1585 
1586 void
1587 ldl_seterror(ml_unit_t *ul, char *why)
1588 {
1589 	/*
1590 	 * already in error state; do nothing
1591 	 */
1592 	if (ul->un_flags & LDL_ERROR)
1593 		return;
1594 
1595 	ul->un_flags |= LDL_ERROR;	/* incore */
1596 	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */
1597 
1598 	/*
1599 	 * Commit to state sectors
1600 	 */
1601 	uniqtime(&ul->un_timestamp);
1602 	ldl_savestate(ul);
1603 
1604 	/* Pretty print */
1605 	cmn_err(CE_WARN, "%s", why);
1606 	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1607 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1608 	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1609 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1610 
1611 	/*
1612 	 * If we aren't in the middle of scan (aka snarf), tell ufs
1613 	 * to hard lock itself.
1614 	 */
1615 	if ((ul->un_flags & LDL_SCAN) == 0)
1616 		ufs_trans_onerror();
1617 }
1618 
1619 size_t
1620 ldl_bufsize(ml_unit_t *ul)
1621 {
1622 	size_t		bufsize;
1623 	extern uint32_t	ldl_minbufsize;
1624 
1625 	/*
1626 	 * initial guess is the maxtransfer value for this log device
1627 	 * 	increase if too small
1628 	 * 	decrease if too large
1629 	 */
1630 	bufsize = dbtob(btod(ul->un_maxtransfer));
1631 	if (bufsize < ldl_minbufsize)
1632 		bufsize = ldl_minbufsize;
1633 	if (bufsize > maxphys)
1634 		bufsize = maxphys;
1635 	if (bufsize > ul->un_maxtransfer)
1636 		bufsize = ul->un_maxtransfer;
1637 	return (bufsize);
1638 }
1639