xref: /titanic_41/usr/src/uts/common/fs/ufs/lufs_log.c (revision 45916cd2fec6e79bca5dee0421bd39e3c2910d1e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/types.h>
31 #include <sys/vnode.h>
32 #include <sys/errno.h>
33 #include <sys/sysmacros.h>
34 #include <sys/debug.h>
35 #include <sys/kmem.h>
36 #include <sys/conf.h>
37 #include <sys/proc.h>
38 #include <sys/cmn_err.h>
39 #include <sys/fssnap_if.h>
40 #include <sys/fs/ufs_inode.h>
41 #include <sys/fs/ufs_filio.h>
42 #include <sys/fs/ufs_log.h>
43 #include <sys/fs/ufs_bio.h>
44 #include <sys/atomic.h>
45 
46 extern int		maxphys;
47 extern uint_t		bypass_snapshot_throttle_key;
48 
49 extern struct kmem_cache	*lufs_sv;
50 extern struct kmem_cache	*lufs_bp;
51 
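/*
 * Acquire a log buf's b_sem.  If the buf's last I/O failed, drop the
 * whole log unit into the error state as a side effect.
 */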
52 static void
53 makebusy(ml_unit_t *ul, buf_t *bp)
54 {
55 	sema_p(&bp->b_sem);
56 	if ((bp->b_flags & B_ERROR) == 0)
57 		return;
58 	if (bp->b_flags & B_READ)
59 		ldl_seterror(ul, "Error reading ufs log");
60 	else
61 		ldl_seterror(ul, "Error writing ufs log");
62 }
63 
64 static int
65 logdone(buf_t *bp)
66 {
67 	bp->b_flags |= B_DONE;
68 
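	/*
	 * Log writes are asynchronous: completion just releases b_sem so
	 * the buf can be reused.  Log reads are synchronous: the issuing
	 * thread waits on b_io, so post that instead.
	 */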
69 	if (bp->b_flags & B_WRITE)
70 		sema_v(&bp->b_sem);
71 	else
72 		/* wakeup the thread waiting on this buf */
73 		sema_v(&bp->b_io);
74 	return (0);
75 }
76 
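/*
 * Completion routine for each buf cloned by ldl_strategy().  Every clone
 * charges its byte count against the parent's outstanding total; the clone
 * that drives the total to zero propagates any error and biodone()s the
 * original buf.
 */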
77 static int
78 ldl_strategy_done(buf_t *cb)
79 {
80 	lufs_save_t	*sv;
81 	lufs_buf_t	*lbp;
82 	buf_t		*bp;
83 
84 	ASSERT(SEMA_HELD(&cb->b_sem));
85 	ASSERT((cb->b_flags & B_DONE) == 0);
86 
87 	/*
88 	 * Compute address of the ``save'' struct
89 	 */
90 	lbp = (lufs_buf_t *)cb;
91 	sv = (lufs_save_t *)lbp->lb_ptr;
92 
93 	if (cb->b_flags & B_ERROR)
94 		sv->sv_error = 1;
95 
96 	/*
97 	 * If other cloned requests are still outstanding, just release this
98 	 * clone; the last one to finish ``done''s the original buf below.
99 	 */
100 	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
101 		kmem_cache_free(lufs_bp, lbp);
102 		return (1);
103 	}
104 	/* Propagate any errors back to the original buffer header */
105 	bp = sv->sv_bp;
106 	if (sv->sv_error)
107 		bp->b_flags |= B_ERROR;
108 	kmem_cache_free(lufs_bp, lbp);
109 	kmem_cache_free(lufs_sv, sv);
110 
111 	biodone(bp);
112 	return (0);
113 }
114 
115 /*
116  * Map the log logical block number to a physical disk block number
117  */
118 static int
119 map_frag(
120 	ml_unit_t	*ul,
121 	daddr_t		lblkno,
122 	size_t		bcount,
123 	daddr_t		*pblkno,
124 	size_t		*pbcount)
125 {
126 	ic_extent_t	*ext = ul->un_ebp->ic_extents;
127 	uint32_t	e = ul->un_ebp->ic_nextents;
128 	uint32_t	s = 0;
129 	uint32_t	i = e >> 1;
130 	uint32_t	lasti = i;
131 	uint32_t	bno_off;
132 
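	/*
	 * Binary search of the in-core extent list: s and e bracket the
	 * candidate extents and i is the probe.  If the probe neither
	 * contains lblkno nor narrows the interval further (i == lasti),
	 * the block is not mapped and ENOENT is returned.
	 */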
133 again:
134 	if (ext[i].ic_lbno <= lblkno) {
135 		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
136 			/* FOUND IT */
137 			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
138 			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
139 			*pblkno = ext[i].ic_pbno + bno_off;
140 			return (0);
141 		} else
142 			s = i;
143 	} else
144 		e = i;
145 	i = s + ((e - s) >> 1);
146 
147 	if (i == lasti) {
148 		*pbcount = bcount;
149 		return (ENOENT);
150 	}
151 	lasti = i;
152 
153 	goto again;
154 }
155 
156 /*
157  * The log is a set of extents (typically just one, but possibly more
158  * if the disk was close to full when the log was created), so logical
159  * offsets into the log have to be translated into their real device
160  * locations before calling the device's strategy routine.  The
161  * translation may result in several IO requests if this request
162  * spans extents.
163  */
164 void
165 ldl_strategy(ml_unit_t *ul, buf_t *pb)
166 {
167 	lufs_save_t	*sv;
168 	lufs_buf_t	*lbp;
169 	buf_t		*cb;
170 	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
171 	daddr_t		lblkno, pblkno;
172 	size_t		nb_left, pbcount;
173 	off_t		offset;
174 	dev_t		dev	= ul->un_dev;
175 	int		error;
176 	int		read = pb->b_flags & B_READ;
177 
178 	/*
179 	 * Allocate and initialise the save structure.
180 	 */
181 	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
182 	sv->sv_error = 0;
183 	sv->sv_bp = pb;
184 	nb_left = pb->b_bcount;
185 	sv->sv_nb_left = nb_left;
186 
187 	lblkno = pb->b_blkno;
188 	offset = 0;
189 
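	/*
	 * Carve the request into per-extent fragments: each pass maps as
	 * much of the remaining range as fits in one extent, clones the
	 * original buf for that fragment and sends it down (or fails the
	 * clone if the range is unmapped).  ldl_strategy_done() completes
	 * the original buf once the last fragment finishes.
	 */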
190 	do {
191 		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
192 
193 		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
194 		bioinit(&lbp->lb_buf);
195 		lbp->lb_ptr = sv;
196 
197 		cb = bioclone(pb, offset, pbcount, dev,
198 		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
199 
200 		offset += pbcount;
201 		lblkno += btodb(pbcount);
202 		nb_left -= pbcount;
203 
204 		if (error) {
205 			cb->b_flags |= B_ERROR;
206 			cb->b_resid = cb->b_bcount;
207 			biodone(cb);
208 		} else {
209 			if (read) {
210 				logstats.ls_ldlreads.value.ui64++;
211 				ufsvfsp->vfs_iotstamp = lbolt;
212 				lwp_stat_update(LWP_STAT_INBLK, 1);
213 			} else {
214 				logstats.ls_ldlwrites.value.ui64++;
215 				lwp_stat_update(LWP_STAT_OUBLK, 1);
216 			}
217 
218 			/*
219 			 * write through the snapshot driver if necessary
220 			 * We do not want this write to be throttled because
221 			 * we are holding the un_log mutex here. If we
222 			 * are throttled in fssnap_translate, the fssnap_taskq
223 			 * thread which can wake us up can get blocked on
224 			 * the un_log mutex resulting in a deadlock.
225 			 */
226 			if (ufsvfsp->vfs_snapshot) {
227 				(void) tsd_set(bypass_snapshot_throttle_key,
228 				    (void *)1);
229 				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
230 
231 				(void) tsd_set(bypass_snapshot_throttle_key,
232 				    (void *)0);
233 			} else {
234 				(void) bdev_strategy(cb);
235 			}
236 		}
237 
238 	} while (nb_left);
239 }
240 
241 static void
242 writelog(ml_unit_t *ul, buf_t *bp)
243 {
244 	ASSERT(SEMA_HELD(&bp->b_sem));
245 
246 	/*
247 	 * This is really a B_ASYNC write but we want Presto to
248 	 * cache this write.  The iodone routine, logdone, processes
249 	 * the buf correctly.
250 	 */
251 	bp->b_flags = B_WRITE;
252 	bp->b_edev = ul->un_dev;
253 	bp->b_iodone = logdone;
254 
255 	/*
256 	 * return EIO for every IO if in hard error state
257 	 */
258 	if (ul->un_flags & LDL_ERROR) {
259 		bp->b_flags |= B_ERROR;
260 		bp->b_error = EIO;
261 		biodone(bp);
262 		return;
263 	}
264 
265 	ldl_strategy(ul, bp);
266 }
267 
268 static void
269 readlog(ml_unit_t *ul, buf_t *bp)
270 {
271 	ASSERT(SEMA_HELD(&bp->b_sem));
272 	ASSERT(bp->b_bcount);
273 
274 	bp->b_flags = B_READ;
275 	bp->b_edev = ul->un_dev;
276 	bp->b_iodone = logdone;
277 
278 	/* all IO returns errors when in error state */
279 	if (ul->un_flags & LDL_ERROR) {
280 		bp->b_flags |= B_ERROR;
281 		bp->b_error = EIO;
282 		biodone(bp);
283 		(void) trans_wait(bp);
284 		return;
285 	}
286 
287 	ldl_strategy(ul, bp);
288 
289 	if (trans_wait(bp))
290 		ldl_seterror(ul, "Error reading ufs log");
291 }
292 
293 /*
294  * NOTE: writers are single threaded thru the log layer.
295  * This means we can safely reference and change the cb and bp fields
296  * that ldl_read does not reference w/o holding the cb_rwlock or
297  * the bp makebusy lock.
298  */
299 static void
300 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
301 {
302 	buf_t		*newbp;
303 	cirbuf_t	*cb		= &ul->un_wrbuf;
304 
305 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
306 	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
307 
308 	/*
309 	 * async write the buf
310 	 */
311 	writelog(ul, bp);
312 
313 	/*
314 	 * no longer filling any buf
315 	 */
316 	cb->cb_dirty = NULL;
317 
318 	/*
319 	 * no extra buffer space; all done
320 	 */
321 	if (bp->b_bcount == bp->b_bufsize)
322 		return;
323 
324 	/*
325 	 * give extra buffer space to a new bp
326 	 * 	try to take buf off of free list
327 	 */
328 	if ((newbp = cb->cb_free) != NULL) {
329 		cb->cb_free = newbp->b_forw;
330 	} else {
331 		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
332 		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
333 		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
334 	}
335 	newbp->b_flags = 0;
336 	newbp->b_bcount = 0;
337 	newbp->b_file = NULL;
338 	newbp->b_offset = -1;
339 	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
340 	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
341 	bp->b_bufsize = bp->b_bcount;
342 
343 	/*
344 	 * lock out readers and put new buf at LRU position
345 	 */
346 	rw_enter(&cb->cb_rwlock, RW_WRITER);
347 	newbp->b_forw = bp->b_forw;
348 	newbp->b_back = bp;
349 	bp->b_forw->b_back = newbp;
350 	bp->b_forw = newbp;
351 	rw_exit(&cb->cb_rwlock);
352 }
353 
354 static void
355 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
356 {
357 	buf_t		*bp;
358 	off_t		elof	= lof + nb;
359 	off_t		buflof;
360 	off_t		bufelof;
361 
362 	/*
363 	 * discard all bufs that overlap the range (lof, lof + nb)
364 	 */
365 	rw_enter(&cb->cb_rwlock, RW_WRITER);
366 	bp = cb->cb_bp;
367 	do {
368 		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
369 			bp = bp->b_forw;
370 			continue;
371 		}
372 		buflof = dbtob(bp->b_blkno);
373 		bufelof = buflof + bp->b_bcount;
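		/* leave bufs that lie entirely outside [lof, elof) alone */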
374 		if ((buflof < lof && bufelof <= lof) ||
375 		    (buflof >= elof && bufelof > elof)) {
376 			bp = bp->b_forw;
377 			continue;
378 		}
379 		makebusy(ul, bp);
380 		bp->b_flags = 0;
381 		bp->b_bcount = 0;
382 		sema_v(&bp->b_sem);
383 		bp = bp->b_forw;
384 	} while (bp != cb->cb_bp);
385 	rw_exit(&cb->cb_rwlock);
386 }
387 
388 /*
389  * NOTE: writers are single threaded thru the log layer.
390  * This means we can safely reference and change the cb and bp fields
391  * that ldl_read does not reference w/o holding the cb_rwlock or
392  * the bp makebusy lock.
393  */
394 static buf_t *
395 get_write_bp(ml_unit_t *ul)
396 {
397 	cirbuf_t	*cb = &ul->un_wrbuf;
398 	buf_t		*bp;
399 
400 	/*
401 	 * cb_dirty is the buffer we are currently filling, if any
402 	 */
403 	if ((bp = cb->cb_dirty) != NULL) {
404 		makebusy(ul, bp);
405 		return (bp);
406 	}
407 	/*
408 	 * discard any bp that overlaps the current tail since we are
409 	 * about to overwrite it.
410 	 */
411 	inval_range(ul, cb, ul->un_tail_lof, 1);
412 
413 	/*
414 	 * steal LRU buf
415 	 */
416 	rw_enter(&cb->cb_rwlock, RW_WRITER);
417 	bp = cb->cb_bp->b_forw;
418 	makebusy(ul, bp);
419 
420 	cb->cb_dirty = bp;
421 	cb->cb_bp = bp;
422 
423 	bp->b_flags = 0;
424 	bp->b_bcount = 0;
425 	bp->b_blkno = btodb(ul->un_tail_lof);
426 	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
427 	rw_exit(&cb->cb_rwlock);
428 
429 	/*
430 	 * NOTE:
431 	 *	1. un_tail_lof never addresses >= un_eol_lof
432 	 *	2. b_blkno + btodb(b_bufsize) may > un_eol_lof
433 	 *		this case is handled in storebuf
434 	 */
435 	return (bp);
436 }
437 
438 void
439 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
440 {
441 	int	i;
442 	buf_t	*bp;
443 
444 	/*
445 	 * Clear previous allocation
446 	 */
447 	if (cb->cb_nb)
448 		free_cirbuf(cb);
449 
450 	bzero(cb, sizeof (*cb));
451 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
452 
453 	rw_enter(&cb->cb_rwlock, RW_WRITER);
454 
455 	/*
456 	 * preallocate 3 bp's and put them on the free list.
457 	 */
458 	for (i = 0; i < 3; ++i) {
459 		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
460 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
461 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
462 		bp->b_offset = -1;
463 		bp->b_forw = cb->cb_free;
464 		cb->cb_free = bp;
465 	}
466 
467 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
468 	cb->cb_nb = bufsize;
469 
470 	/*
471 	 * first bp claims entire write buffer
472 	 */
473 	bp = cb->cb_free;
474 	cb->cb_free = bp->b_forw;
475 
476 	bp->b_forw = bp;
477 	bp->b_back = bp;
478 	cb->cb_bp = bp;
479 	bp->b_un.b_addr = cb->cb_va;
480 	bp->b_bufsize = cb->cb_nb;
481 
482 	rw_exit(&cb->cb_rwlock);
483 }
484 
485 void
486 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
487 {
488 	caddr_t	va;
489 	size_t	nb;
490 	buf_t	*bp;
491 
492 	/*
493 	 * Clear previous allocation
494 	 */
495 	if (cb->cb_nb)
496 		free_cirbuf(cb);
497 
498 	bzero(cb, sizeof (*cb));
499 	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
500 
501 	rw_enter(&cb->cb_rwlock, RW_WRITER);
502 
503 	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
504 	cb->cb_nb = bufsize;
505 
506 	/*
507 	 * preallocate N bufs that are hard-sized to blksize
508 	 *	in other words, the read buffer pool is a linked list
509 	 *	of statically sized bufs.
510 	 */
511 	va = cb->cb_va;
512 	while ((nb = bufsize) != 0) {
513 		if (nb > blksize)
514 			nb = blksize;
515 		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
516 		bzero(bp, sizeof (buf_t));
517 		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
518 		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
519 		bp->b_un.b_addr = va;
520 		bp->b_bufsize = nb;
521 		if (cb->cb_bp) {
522 			bp->b_forw = cb->cb_bp->b_forw;
523 			bp->b_back = cb->cb_bp;
524 			cb->cb_bp->b_forw->b_back = bp;
525 			cb->cb_bp->b_forw = bp;
526 		} else
527 			bp->b_forw = bp->b_back = bp;
528 		cb->cb_bp = bp;
529 		bufsize -= nb;
530 		va += nb;
531 	}
532 
533 	rw_exit(&cb->cb_rwlock);
534 }
535 
536 void
537 free_cirbuf(cirbuf_t *cb)
538 {
539 	buf_t	*bp;
540 
541 	if (cb->cb_nb == 0)
542 		return;
543 
544 	rw_enter(&cb->cb_rwlock, RW_WRITER);
545 	ASSERT(cb->cb_dirty == NULL);
546 
547 	/*
548 	 * free the active bufs
549 	 */
550 	while ((bp = cb->cb_bp) != NULL) {
551 		if (bp == bp->b_forw)
552 			cb->cb_bp = NULL;
553 		else
554 			cb->cb_bp = bp->b_forw;
555 		bp->b_back->b_forw = bp->b_forw;
556 		bp->b_forw->b_back = bp->b_back;
557 		sema_destroy(&bp->b_sem);
558 		sema_destroy(&bp->b_io);
559 		kmem_free(bp, sizeof (buf_t));
560 	}
561 
562 	/*
563 	 * free the free bufs
564 	 */
565 	while ((bp = cb->cb_free) != NULL) {
566 		cb->cb_free = bp->b_forw;
567 		sema_destroy(&bp->b_sem);
568 		sema_destroy(&bp->b_io);
569 		kmem_free(bp, sizeof (buf_t));
570 	}
571 	kmem_free(cb->cb_va, cb->cb_nb);
572 	cb->cb_va = NULL;
573 	cb->cb_nb = 0;
574 	rw_exit(&cb->cb_rwlock);
575 	rw_destroy(&cb->cb_rwlock);
576 }
577 
578 static int
579 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
580 {
581 	off_t	blof	= dbtob(blkno);
582 
583 	return ((lof >= blof) && (lof < (blof + bcount)));
584 }
585 
586 static buf_t *
587 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
588 {
589 	buf_t *bp;
590 
591 	/*
592 	 * find a buf that contains the offset lof
593 	 */
594 	rw_enter(&cb->cb_rwlock, RW_READER);
595 	bp = cb->cb_bp;
596 	do {
597 		if (bp->b_bcount &&
598 		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
599 			makebusy(ul, bp);
600 			rw_exit(&cb->cb_rwlock);
601 			return (bp);
602 		}
603 		bp = bp->b_forw;
604 	} while (bp != cb->cb_bp);
605 	rw_exit(&cb->cb_rwlock);
606 
607 	return (NULL);
608 }
609 
610 static off_t
611 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
612 {
613 	buf_t	*bp, *bpend;
614 	off_t	rlof;
615 
616 	/*
617 	 * we mustn't:
618 	 *	o read past eol
619 	 *	o read past the tail
620 	 *	o read data that may be being written.
621 	 */
622 	rw_enter(&cb->cb_rwlock, RW_READER);
623 	bpend = bp = cb->cb_bp->b_forw;
624 	rlof = ul->un_tail_lof;
625 	do {
626 		if (bp->b_bcount) {
627 			rlof = dbtob(bp->b_blkno);
628 			break;
629 		}
630 		bp = bp->b_forw;
631 	} while (bp != bpend);
632 	rw_exit(&cb->cb_rwlock);
633 
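	/*
	 * rlof is where the oldest data still held in the write buffers
	 * begins (or the tail, if the write buffers hold nothing).  A read
	 * starting at or before rlof must stop there; otherwise it may
	 * extend to end-of-log.
	 */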
634 	if (lof <= rlof)
635 		/* lof is prior to the range represented by the write buf */
636 		return (rlof);
637 	else
638 		/* lof follows the range represented by the write buf */
639 		return ((off_t)ul->un_eol_lof);
640 }
641 
642 static buf_t *
643 get_read_bp(ml_unit_t *ul, off_t lof)
644 {
645 	cirbuf_t	*cb;
646 	buf_t		*bp;
647 	off_t		rlof;
648 
649 	/*
650 	 * retrieve as much data as possible from the incore buffers
651 	 */
652 	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
653 		logstats.ls_lreadsinmem.value.ui64++;
654 		return (bp);
655 	}
656 	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
657 		logstats.ls_lreadsinmem.value.ui64++;
658 		return (bp);
659 	}
660 
661 	/*
662 	 * steal the LRU buf
663 	 */
664 	cb = &ul->un_rdbuf;
665 	rw_enter(&cb->cb_rwlock, RW_WRITER);
666 	bp = cb->cb_bp->b_forw;
667 	makebusy(ul, bp);
668 	bp->b_flags = 0;
669 	bp->b_bcount = 0;
670 	cb->cb_bp = bp;
671 	rw_exit(&cb->cb_rwlock);
672 
673 	/*
674 	 * don't read past the tail or the end-of-log
675 	 */
676 	bp->b_blkno = btodb(lof);
677 	lof = dbtob(bp->b_blkno);
678 	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
679 	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
680 	readlog(ul, bp);
681 	return (bp);
682 }
683 
684 /*
685  * NOTE: writers are single threaded thru the log layer.
686  * This means we can safely reference and change the cb and bp fields
687  * that ldl_read does not reference w/o holding the cb_rwlock or
688  * the bp makebusy lock.
689  */
690 static int
691 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
692 {
693 	buf_t	*bpforw	= bp->b_forw;
694 
695 	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
696 
697 	/*
698 	 * there is no `next' bp; do nothing
699 	 */
700 	if (bpforw == bp)
701 		return (0);
702 
703 	/*
704 	 * buffer space is not adjacent; do nothing
705 	 */
706 	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
707 		return (0);
708 
709 	/*
710 	 * locking protocol requires giving up any bp locks before
711 	 * acquiring cb_rwlock.  This is okay because we hold
712 	 * un_log_mutex.
713 	 */
714 	sema_v(&bp->b_sem);
715 
716 	/*
717 	 * lock out ldl_read
718 	 */
719 	rw_enter(&cb->cb_rwlock, RW_WRITER);
720 
721 	/*
722 	 * wait for current IO to finish with the next bp, if necessary
723 	 */
724 	makebusy(ul, bpforw);
725 
726 	/*
727 	 * free the next bp and steal its space
728 	 */
729 	bp->b_forw = bpforw->b_forw;
730 	bpforw->b_forw->b_back = bp;
731 	bp->b_bufsize += bpforw->b_bufsize;
732 	sema_v(&bpforw->b_sem);
733 	bpforw->b_forw = cb->cb_free;
734 	cb->cb_free = bpforw;
735 	makebusy(ul, bp);
736 	rw_exit(&cb->cb_rwlock);
737 
738 	return (1);
739 }
740 
741 static size_t
742 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
743 {
744 	size_t		copy_nb;
745 	size_t		nb_in_sec;
746 	sect_trailer_t	*st;
747 	size_t		nb_left = nb;
748 	cirbuf_t	*cb	= &ul->un_wrbuf;
749 
750 again:
751 	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
752 	copy_nb = MIN(nb_left, nb_in_sec);
753 
754 	ASSERT(copy_nb);
755 
756 	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
757 	bp->b_bcount += copy_nb;
758 	va += copy_nb;
759 	nb_left -= copy_nb;
760 	ul->un_tail_lof += copy_nb;
761 
762 	if ((nb_in_sec -= copy_nb) == 0) {
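		/*
		 * The sector's data area is full.  Stamp its trailer with
		 * the current transaction id and the running tail ident so
		 * that log scan can later tell where valid data ends.
		 */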
763 		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
764 
765 		st->st_tid = ul->un_logmap->mtm_tid;
766 		st->st_ident = ul->un_tail_ident++;
767 		bp->b_bcount += sizeof (sect_trailer_t);
768 		ul->un_tail_lof += sizeof (sect_trailer_t);
769 		/*
770 		 * log wrapped; async write this bp
771 		 */
772 		if (ul->un_tail_lof == ul->un_eol_lof) {
773 			ul->un_tail_lof = ul->un_bol_lof;
774 			push_dirty_bp(ul, bp);
775 			return (nb - nb_left);
776 		}
777 		/*
778 		 * out of bp space; get more or async write buf
779 		 */
780 		if (bp->b_bcount == bp->b_bufsize) {
781 			if (!extend_write_bp(ul, cb, bp)) {
782 				push_dirty_bp(ul, bp);
783 				return (nb - nb_left);
784 			}
785 		}
786 	}
787 	if (nb_left)
788 		goto again;
789 
790 	sema_v(&bp->b_sem);
791 	return (nb);
792 }
793 
794 static void
795 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
796 {
797 	offset_t	src_mof	= me->me_mof;
798 	size_t		src_nb	= me->me_nb;
799 
800 	if (src_mof > dst_mof) {
801 		ASSERT(src_mof < (dst_mof + dst_nb));
802 		dst_va += (src_mof - dst_mof);
803 		dst_nb -= (src_mof - dst_mof);
804 	} else {
805 		ASSERT(dst_mof < (src_mof + src_nb));
806 		src_nb -= (dst_mof - src_mof);
807 	}
808 
809 	src_nb = MIN(src_nb, dst_nb);
810 	ASSERT(src_nb);
811 	bzero(dst_va, src_nb);
812 }
813 
814 /*
815  * dst_va == NULL means don't copy anything
816  */
817 static ulong_t
818 fetchbuf(
819 	ml_unit_t *ul,
820 	buf_t *bp,
821 	caddr_t dst_va,
822 	size_t dst_nb,
823 	off_t *dst_lofp)
824 {
825 	caddr_t	copy_va;
826 	size_t	copy_nb;
827 	size_t	nb_sec;
828 	off_t	dst_lof		= *dst_lofp;
829 	ulong_t	sav_dst_nb	= dst_nb;
830 	ulong_t	src_nb		= bp->b_bcount;
831 	off_t	src_lof		= dbtob(bp->b_blkno);
832 	off_t	src_elof	= src_lof + src_nb;
833 	caddr_t	src_va		= bp->b_un.b_addr;
834 
835 	/*
836 	 * copy from bp to dst_va
837 	 */
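	/*
	 * The copy proceeds a sector at a time because the last
	 * sizeof (sect_trailer_t) bytes of every DEV_BSIZE sector hold a
	 * trailer rather than delta data; dst_lof is advanced past each
	 * trailer as it is crossed.
	 */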
838 	while (dst_nb) {
839 		/*
840 		 * compute address within bp
841 		 */
842 		copy_va = src_va + (dst_lof - src_lof);
843 
844 		/*
845 		 * adjust copy size to amount of data in bp
846 		 */
847 		copy_nb = MIN(dst_nb, src_elof - dst_lof);
848 
849 		/*
850 		 * adjust copy size to amount of data in sector
851 		 */
852 		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
853 		copy_nb = MIN(copy_nb, nb_sec);
854 
855 		/*
856 		 * dst_va == NULL means don't do copy (see logseek())
857 		 */
858 		if (dst_va) {
859 			bcopy(copy_va, dst_va, copy_nb);
860 			dst_va += copy_nb;
861 		}
862 		dst_lof += copy_nb;
863 		dst_nb -= copy_nb;
864 		nb_sec -= copy_nb;
865 
866 		/*
867 		 * advance over sector trailer
868 		 */
869 		if (nb_sec == 0)
870 			dst_lof += sizeof (sect_trailer_t);
871 
872 		/*
873 		 * exhausted buffer
874 		 *	return current lof for next read
875 		 */
876 		if (dst_lof == src_elof) {
877 			sema_v(&bp->b_sem);
878 			if (dst_lof == ul->un_eol_lof)
879 				dst_lof = ul->un_bol_lof;
880 			*dst_lofp = dst_lof;
881 			return (sav_dst_nb - dst_nb);
882 		}
883 	}
884 
885 	/*
886 	 * copy complete - return current lof
887 	 */
888 	sema_v(&bp->b_sem);
889 	*dst_lofp = dst_lof;
890 	return (sav_dst_nb);
891 }
892 
893 void
894 ldl_round_commit(ml_unit_t *ul)
895 {
896 	int		wrapped;
897 	buf_t		*bp;
898 	sect_trailer_t	*st;
899 	size_t		bcount;
900 	cirbuf_t	*cb	= &ul->un_wrbuf;
901 
902 	/*
903 	 * if nothing to write, then do nothing
904 	 */
905 	if ((bp = cb->cb_dirty) == NULL)
906 		return;
907 	makebusy(ul, bp);
908 
909 	/*
910 	 * round up to sector boundary and set new tail
911 	 *	don't readjust st_ident if buf is already rounded
912 	 */
913 	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
914 	if (bcount == bp->b_bcount) {
915 		sema_v(&bp->b_sem);
916 		return;
917 	}
918 	bp->b_bcount = bcount;
919 	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
920 	wrapped = 0;
921 	if (ul->un_tail_lof == ul->un_eol_lof) {
922 		ul->un_tail_lof = ul->un_bol_lof;
923 		++wrapped;
924 	}
925 	ASSERT(ul->un_tail_lof != ul->un_head_lof);
926 
927 	/*
928 	 * fix up the sector trailer
929 	 */
930 	/* LINTED */
931 	st = (sect_trailer_t *)
932 		((bp->b_un.b_addr + bcount) - sizeof (*st));
933 	st->st_tid = ul->un_logmap->mtm_tid;
934 	st->st_ident = ul->un_tail_ident++;
935 
936 	/*
937 	 * if tail wrapped or we have exhausted this buffer
938 	 *	async write the buffer
939 	 */
940 	if (wrapped || bcount == bp->b_bufsize)
941 		push_dirty_bp(ul, bp);
942 	else
943 		sema_v(&bp->b_sem);
944 }
945 
946 void
947 ldl_push_commit(ml_unit_t *ul)
948 {
949 	buf_t		*bp;
950 	cirbuf_t	*cb	= &ul->un_wrbuf;
951 
952 	/*
953 	 * if nothing to write, then do nothing
954 	 */
955 	if ((bp = cb->cb_dirty) == NULL)
956 		return;
957 	makebusy(ul, bp);
958 	push_dirty_bp(ul, bp);
959 }
960 
961 int
962 ldl_need_commit(ml_unit_t *ul)
963 {
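	/* commit when more than 75% of the reservable log space is in use */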
964 	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
965 }
966 
967 int
968 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
969 {
970 	off_t	nfb;
971 	off_t	nb;
972 
973 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
974 
975 	/*
976 	 * Add up the size used by the deltas and round nb up to a sector
977 	 * length, plus an extra sector.
978 	 *	Without the extra sector we couldn't distinguish
979 	 *	a full log from an empty log (both would have head == tail)
980 	 */
981 	for (nb = DEV_BSIZE; me; me = me->me_hash) {
982 		nb += sizeof (struct delta);
983 		if (me->me_dt != DT_CANCEL)
984 			nb += me->me_nb;
985 	}
986 	nb = P2ROUNDUP(nb, DEV_BSIZE);
987 
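	/*
	 * Free space is everything outside the head..tail region.  When the
	 * tail has wrapped behind the head only head - tail is free;
	 * otherwise the free space is split between the start and the end
	 * of the log.
	 */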
988 	if (ul->un_head_lof <= ul->un_tail_lof)
989 		nfb = (ul->un_head_lof - ul->un_bol_lof) +
990 			(ul->un_eol_lof - ul->un_tail_lof);
991 	else
992 		nfb = ul->un_head_lof - ul->un_tail_lof;
993 
994 	return (nb < nfb);
995 }
996 
997 void
998 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
999 {
1000 	buf_t		*bp;
1001 	caddr_t		va;
1002 	size_t		nb;
1003 	size_t		actual;
1004 
1005 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1006 
1007 	/* Write the delta */
1008 
1009 	nb = sizeof (struct delta);
1010 	va = (caddr_t)&me->me_delta;
1011 	bp = get_write_bp(ul);
1012 
1013 	while (nb) {
1014 		if (ul->un_flags & LDL_ERROR) {
1015 			sema_v(&bp->b_sem);
1016 			return;
1017 		}
1018 		actual = storebuf(ul, bp, va, nb);
1019 		ASSERT(actual);
1020 		va += actual;
1021 		nb -= actual;
1022 		if (nb)
1023 			bp = get_write_bp(ul);
1024 	}
1025 
1026 	/* If a commit, cancel, or 0's; we're almost done */
1027 	switch (me->me_dt) {
1028 		case DT_COMMIT:
1029 		case DT_CANCEL:
1030 		case DT_ABZERO:
1031 			/* roll needs to know where the next delta will go */
1032 			me->me_lof = ul->un_tail_lof;
1033 			return;
1034 		default:
1035 			break;
1036 	}
1037 
1038 	/* Now write the data */
1039 
1040 	ASSERT(me->me_nb != 0);
1041 
1042 	nb = me->me_nb;
1043 	va = (me->me_mof - bufmof) + bufp;
1044 	bp = get_write_bp(ul);
1045 
1046 	/* Save where we will put the data */
1047 	me->me_lof = ul->un_tail_lof;
1048 
1049 	while (nb) {
1050 		if (ul->un_flags & LDL_ERROR) {
1051 			sema_v(&bp->b_sem);
1052 			return;
1053 		}
1054 		actual = storebuf(ul, bp, va, nb);
1055 		ASSERT(actual);
1056 		va += actual;
1057 		nb -= actual;
1058 		if (nb)
1059 			bp = get_write_bp(ul);
1060 	}
1061 }
1062 
1063 void
1064 ldl_waito(ml_unit_t *ul)
1065 {
1066 	buf_t		*bp;
1067 	cirbuf_t	*cb	= &ul->un_wrbuf;
1068 
1069 	rw_enter(&cb->cb_rwlock, RW_WRITER);
1070 	/*
1071 	 * wait on them: makebusy() blocks until each write has completed
1072 	 */
1073 	bp = cb->cb_bp;
1074 	do {
1075 		if ((bp->b_flags & B_DONE) == 0) {
1076 			makebusy(ul, bp);
1077 			sema_v(&bp->b_sem);
1078 		}
1079 		bp = bp->b_forw;
1080 	} while (bp != cb->cb_bp);
1081 	rw_exit(&cb->cb_rwlock);
1082 }
1083 
1084 /*
1085  * seek nb bytes from location lof
1086  */
1087 static int
1088 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1089 {
1090 	buf_t	*bp;
1091 	ulong_t	actual;
1092 
1093 	while (nb) {
1094 		bp = get_read_bp(ul, lof);
1095 		if (bp->b_flags & B_ERROR) {
1096 			sema_v(&bp->b_sem);
1097 			return (EIO);
1098 		}
1099 		actual = fetchbuf(ul, bp, NULL, nb, &lof);
1100 		ASSERT(actual);
1101 		nb -= actual;
1102 	}
1103 	*lofp = lof;
1104 	ASSERT(nb == 0);
1105 	return (0);
1106 }
1107 
1108 int
1109 ldl_read(
1110 	ml_unit_t *ul,		/* Log unit */
1111 	caddr_t va,		/* address of buffer to read into */
1112 	offset_t mof,		/* mof of buffer */
1113 	off_t nb,		/* length of buffer */
1114 	mapentry_t *me)		/* Map entry list */
1115 {
1116 	buf_t	*bp;
1117 	crb_t   *crb;
1118 	caddr_t	rva;			/* address to read into */
1119 	size_t	rnb;			/* # of bytes to read */
1120 	off_t	lof;			/* log device offset to read from */
1121 	off_t   skip;
1122 	ulong_t	actual;
1123 	int	error;
1124 	caddr_t	eva	= va + nb;	/* end of buffer */
1125 
1126 	for (; me; me = me->me_agenext) {
1127 		ASSERT(me->me_dt != DT_CANCEL);
1128 
1129 		/*
1130 		 * check for a cached roll buffer
1131 		 */
1132 		crb = me->me_crb;
1133 		if (crb) {
1134 			if (mof > crb->c_mof) {
1135 				/*
1136 				 * This mapentry overlaps with the beginning of
1137 				 * the supplied buffer
1138 				 */
1139 				skip = mof - crb->c_mof;
1140 				bcopy(crb->c_buf + skip, va,
1141 				    MIN(nb, crb->c_nb - skip));
1142 			} else {
1143 				/*
1144 				 * This mapentry starts at or after
1145 				 * the supplied buffer.
1146 				 */
1147 				skip = crb->c_mof - mof;
1148 				bcopy(crb->c_buf, va + skip,
1149 				    MIN(crb->c_nb, nb - skip));
1150 			}
1151 			logstats.ls_lreadsinmem.value.ui64++;
1152 			continue;
1153 		}
1154 
1155 		/*
1156 		 * check for a delta full of zeroes - there's no log data
1157 		 */
1158 		if (me->me_dt == DT_ABZERO) {
1159 			fetchzeroes(va, mof, nb, me);
1160 			continue;
1161 		}
1162 
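		/*
		 * Clip the delta to the caller's buffer: if the delta starts
		 * before the buffer, seek forward in the log to the part
		 * that overlaps, and never read past the end of the buffer.
		 */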
1163 		if (mof > me->me_mof) {
1164 			rnb = (size_t)(mof - me->me_mof);
1165 			error = logseek(ul, me->me_lof, rnb, &lof);
1166 			if (error)
1167 				return (EIO);
1168 			rva = va;
1169 			rnb = me->me_nb - rnb;
1170 			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1171 		} else {
1172 			lof = me->me_lof;
1173 			rva = (me->me_mof - mof) + va;
1174 			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1175 		}
1176 
1177 		while (rnb) {
1178 			bp = get_read_bp(ul, lof);
1179 			if (bp->b_flags & B_ERROR) {
1180 				sema_v(&bp->b_sem);
1181 				return (EIO);
1182 			}
1183 			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1184 				(bp != ul->un_wrbuf.cb_dirty));
1185 			actual = fetchbuf(ul, bp, rva, rnb, &lof);
1186 			ASSERT(actual);
1187 			rva += actual;
1188 			rnb -= actual;
1189 		}
1190 	}
1191 	return (0);
1192 }
1193 
1194 void
1195 ldl_savestate(ml_unit_t *ul)
1196 {
1197 	int		error;
1198 	buf_t		*bp	= ul->un_bp;
1199 	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
1200 	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);
1201 
1202 #if	DEBUG
1203 	/*
1204 	 * Scan test is running; don't update intermediate state
1205 	 */
1206 	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1207 		return;
1208 #endif	/* DEBUG */
1209 
1210 	mutex_enter(&ul->un_state_mutex);
1211 	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1212 	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1213 	bcopy(ud, ud2, sizeof (*ud));
1214 
1215 	/* If a snapshot is enabled, write through the snapshot driver. */
1216 	if (ul->un_ufsvfs->vfs_snapshot)
1217 		UFS_BWRITE2(ul->un_ufsvfs, bp);
1218 	else
1219 		BWRITE2(bp);
1220 	logstats.ls_ldlwrites.value.ui64++;
1221 	error = bp->b_flags & B_ERROR;
1222 	mutex_exit(&ul->un_state_mutex);
1223 	if (error)
1224 		ldl_seterror(ul, "Error writing ufs log state");
1225 }
1226 
1227 /*
1228  * The head will be set to (new_lof - header) since ldl_sethead is
1229  * called with the new_lof of the data portion of a delta.
1230  */
1231 void
1232 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1233 {
1234 	off_t		nb;
1235 	off_t		new_lof;
1236 	uint32_t	new_ident;
1237 	daddr_t		beg_blkno;
1238 	daddr_t		end_blkno;
1239 	struct timeval	tv;
1240 
1241 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1242 
1243 	if (data_lof == -1) {
1244 		/* log is empty */
1245 		uniqtime(&tv);
1246 		if (tv.tv_usec == ul->un_head_ident) {
1247 			tv.tv_usec++;
1248 		}
1249 		last_loghead_ident = tv.tv_usec;
1250 		new_ident = tv.tv_usec;
1251 		new_lof = ul->un_tail_lof;
1252 
1253 	} else {
1254 		/* compute header's lof */
1255 		new_ident = ul->un_head_ident;
1256 		new_lof = data_lof - sizeof (struct delta);
1257 
1258 		/* whoops, header spans sectors; subtract out sector trailer */
1259 		if (btodb(new_lof) != btodb(data_lof))
1260 			new_lof -= sizeof (sect_trailer_t);
1261 
1262 		/* whoops, header wrapped the log; go to last sector */
1263 		if (new_lof < ul->un_bol_lof) {
1264 			/* sector offset */
1265 			new_lof -= dbtob(btodb(new_lof));
1266 			/* add to last sector's lof */
1267 			new_lof += (ul->un_eol_lof - DEV_BSIZE);
1268 		}
1269 		ul->un_head_tid = tid;
1270 	}
1271 
1272 	/*
1273 	 * check for nop
1274 	 */
1275 	if (new_lof == ul->un_head_lof)
1276 		return;
1277 
1278 	/*
1279 	 * invalidate the affected bufs and calculate new ident
1280 	 */
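	/*
	 * Sector idents advance by one per sector of log space, so the new
	 * head ident is the old one plus the number of sectors the head
	 * moved (counted through end-of-log when the head wraps).
	 */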
1281 	if (new_lof > ul->un_head_lof) {
1282 		nb = new_lof - ul->un_head_lof;
1283 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1284 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1285 
1286 		end_blkno = btodb(new_lof);
1287 		beg_blkno = btodb(ul->un_head_lof);
1288 		new_ident += (end_blkno - beg_blkno);
1289 	} else {
1290 		nb = ul->un_eol_lof - ul->un_head_lof;
1291 		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1292 		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1293 
1294 		end_blkno = btodb(ul->un_eol_lof);
1295 		beg_blkno = btodb(ul->un_head_lof);
1296 		new_ident += (end_blkno - beg_blkno);
1297 
1298 		nb = new_lof - ul->un_bol_lof;
1299 		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1300 		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1301 
1302 		end_blkno = btodb(new_lof);
1303 		beg_blkno = btodb(ul->un_bol_lof);
1304 		new_ident += (end_blkno - beg_blkno);
1305 	}
1306 	/*
1307 	 * don't update the head if there has been an error
1308 	 */
1309 	if (ul->un_flags & LDL_ERROR)
1310 		return;
1311 
1312 	/* Fix up the head and ident */
1313 	ASSERT(new_lof >= ul->un_bol_lof);
1314 	ul->un_head_lof = new_lof;
1315 	ul->un_head_ident = new_ident;
1316 	if (data_lof == -1) {
1317 		ul->un_tail_ident = ul->un_head_ident;
1318 	}
1319 
1320 
1321 	/* Commit to the database */
1322 	ldl_savestate(ul);
1323 
1324 	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1325 		ldl_sethead_debug(ul));
1326 }
1327 
1328 /*
1329  * The tail will be set to the sector following lof+nb
1330  *	lof + nb == size of the last delta + commit record
1331  *	this function is called once after the log scan has completed.
1332  */
1333 void
1334 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1335 {
1336 	off_t		new_lof;
1337 	uint32_t	new_ident;
1338 	daddr_t		beg_blkno;
1339 	daddr_t		end_blkno;
1340 	struct timeval	tv;
1341 
1342 	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1343 
1344 	if (lof == -1) {
1345 		uniqtime(&tv);
1346 		if (tv.tv_usec == ul->un_head_ident) {
1347 			tv.tv_usec++;
1348 		}
1349 		last_loghead_ident = tv.tv_usec;
1350 		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1351 		ul->un_head_lof = ul->un_tail_lof;
1352 		ul->un_head_ident = tv.tv_usec;
1353 		ul->un_tail_ident = ul->un_head_ident;
1354 
1355 		/* Commit to the database */
1356 		ldl_savestate(ul);
1357 
1358 		return;
1359 	}
1360 
1361 	/*
1362 	 * new_lof is the offset of the sector following the last commit
1363 	 */
1364 	(void) logseek(ul, lof, nb, &new_lof);
1365 	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1366 
1367 	/*
1368 	 * calculate new ident: one per sector from the head to the new tail
1369 	 */
1370 	if (new_lof > ul->un_head_lof) {
1371 		end_blkno = btodb(new_lof);
1372 		beg_blkno = btodb(ul->un_head_lof);
1373 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1374 	} else {
1375 		end_blkno = btodb(ul->un_eol_lof);
1376 		beg_blkno = btodb(ul->un_head_lof);
1377 		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1378 
1379 		end_blkno = btodb(new_lof);
1380 		beg_blkno = btodb(ul->un_bol_lof);
1381 		new_ident += (end_blkno - beg_blkno);
1382 	}
1383 
1384 	/* Fix up the tail and ident */
1385 	ul->un_tail_lof = new_lof;
1386 	ul->un_tail_ident = new_ident;
1387 
1388 	/* Commit to the database */
1389 	ldl_savestate(ul);
1390 }
1391 
1392 /*
1393  * LOGSCAN STUFF
1394  */
1395 static int
1396 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1397 {
1398 	ulong_t		ident;
1399 	size_t		nblk, i;
1400 	sect_trailer_t	*st;
1401 
1402 	/*
1403 	 * compute ident for first sector in the buffer
1404 	 */
1405 	ident = ul->un_head_ident;
1406 	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1407 		ident += (bp->b_blkno - btodb(ul->un_head_lof));
1408 	} else {
1409 		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1410 		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1411 	}
1412 	/*
1413 	 * truncate the buffer down to the last valid sector
1414 	 */
1415 	nblk = btodb(bp->b_bcount);
1416 	bp->b_bcount = 0;
1417 	/* LINTED */
1418 	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1419 	for (i = 0; i < nblk; ++i) {
1420 		if (st->st_ident != ident)
1421 			break;
1422 
1423 		/* remember last valid tid for ldl_logscan_error() */
1424 		ul->un_tid = st->st_tid;
1425 
1426 		/* LINTED */
1427 		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1428 		++ident;
1429 		bp->b_bcount += DEV_BSIZE;
1430 	}
1431 	/*
1432 	 * make sure that lof is still within range
1433 	 */
1434 	return (within_range(lof, bp->b_blkno, bp->b_bcount));
1435 }
1436 
1437 ulong_t
1438 ldl_logscan_nbcommit(off_t lof)
1439 {
1440 	/*
1441 	 * lof is the offset following the commit header.  However,
1442 	 * if the commit header fell on the end-of-sector, then lof
1443 	 * has already been advanced to the beginning of the next
1444 	 * sector.  So do nothing.  Otherwise, return the remaining
1445 	 * bytes in the sector.
1446 	 */
1447 	if ((lof & (DEV_BSIZE - 1)) == 0)
1448 		return (0);
1449 	return (NB_LEFT_IN_SECTOR(lof));
1450 }
1451 
1452 int
1453 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1454 {
1455 	buf_t	*bp;
1456 	ulong_t	actual;
1457 
1458 	ASSERT(ul->un_head_lof != ul->un_tail_lof);
1459 
1460 	/*
1461 	 * Check the log data doesn't go out of bounds
1462 	 */
1463 	if (ul->un_head_lof < ul->un_tail_lof) {
1464 		if (!WITHIN(*lofp, nb, ul->un_head_lof,
1465 		    (ul->un_tail_lof - ul->un_head_lof))) {
1466 			return (EIO);
1467 		}
1468 	} else {
1469 		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1470 		    (ul->un_head_lof - ul->un_tail_lof))) {
1471 			return (EIO);
1472 		}
1473 	}
1474 
1475 	while (nb) {
1476 		bp = get_read_bp(ul, *lofp);
1477 		if (bp->b_flags & B_ERROR) {
1478 			sema_v(&bp->b_sem);
1479 			return (EIO);
1480 		}
1481 		/*
1482 		 * out-of-seq idents mean a partial transaction
1483 		 *	panic, non-corrupting powerfail, ...
1484 		 */
1485 		if (!ldl_logscan_ident(ul, bp, *lofp)) {
1486 			sema_v(&bp->b_sem);
1487 			return (EIO);
1488 		}
1489 		/*
1490 		 * copy the header into the caller's buf
1491 		 */
1492 		actual = fetchbuf(ul, bp, va, nb, lofp);
1493 		if (va)
1494 			va += actual;
1495 		nb -= actual;
1496 	}
1497 	return (0);
1498 }
1499 
1500 void
1501 ldl_logscan_begin(ml_unit_t *ul)
1502 {
1503 	size_t	bufsize;
1504 
1505 	ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1506 
1507 	/*
1508 	 * logscan has begun
1509 	 */
1510 	ul->un_flags |= LDL_SCAN;
1511 
1512 	/*
1513 	 * reset the circular bufs
1514 	 */
1515 	bufsize = ldl_bufsize(ul);
1516 	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1517 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1518 
1519 	/*
1520 	 * set the tail to reflect a full log
1521 	 */
1522 	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1523 
1524 	if (ul->un_tail_lof < ul->un_bol_lof)
1525 		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1526 	if (ul->un_tail_lof >= ul->un_eol_lof)
1527 		ul->un_tail_lof = ul->un_bol_lof;
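
	/*
	 * With the tail parked one sector behind the head, the read path
	 * treats the entire log as potentially valid data, which is what
	 * the scan needs.
	 */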
1528 
1529 	/*
1530 	 * un_tid is used during error processing; it is initialized to
1531 	 * the tid of the delta at un_head_lof;
1532 	 */
1533 	ul->un_tid = ul->un_head_tid;
1534 }
1535 
1536 void
1537 ldl_logscan_end(ml_unit_t *ul)
1538 {
1539 	size_t	bufsize;
1540 
1541 	/*
1542 	 * reset the circular bufs
1543 	 */
1544 	bufsize = ldl_bufsize(ul);
1545 	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1546 	alloc_wrbuf(&ul->un_wrbuf, bufsize);
1547 
1548 	/*
1549 	 * Done w/scan
1550 	 */
1551 	ul->un_flags &= ~LDL_SCAN;
1552 }
1553 
1554 int
1555 ldl_need_roll(ml_unit_t *ul)
1556 {
1557 	off_t	busybytes;
1558 	off_t	head;
1559 	off_t	tail;
1560 	off_t	bol;
1561 	off_t	eol;
1562 	off_t	nb;
1563 
1564 	/*
1565 	 * snapshot the log state
1566 	 */
1567 	head = ul->un_head_lof;
1568 	tail = ul->un_tail_lof;
1569 	bol = ul->un_bol_lof;
1570 	eol = ul->un_eol_lof;
1571 	nb = ul->un_logsize;
1572 
1573 	/*
1574 	 * compute number of busy (inuse) bytes
1575 	 */
1576 	if (head <= tail)
1577 		busybytes = tail - head;
1578 	else
1579 		busybytes = (eol - head) + (tail - bol);
1580 
1581 	/*
1582 	 * return TRUE if > 75% full
1583 	 */
1584 	return (busybytes > (nb - (nb >> 2)));
1585 }
1586 
1587 void
1588 ldl_seterror(ml_unit_t *ul, char *why)
1589 {
1590 	/*
1591 	 * already in error state; do nothing
1592 	 */
1593 	if (ul->un_flags & LDL_ERROR)
1594 		return;
1595 
1596 	ul->un_flags |= LDL_ERROR;	/* incore */
1597 	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */
1598 
1599 	/*
1600 	 * Commit to state sectors
1601 	 */
1602 	uniqtime(&ul->un_timestamp);
1603 	ldl_savestate(ul);
1604 
1605 	/* Pretty print */
1606 	cmn_err(CE_WARN, "%s", why);
1607 	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1608 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1609 	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1610 	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1611 
1612 	/*
1613 	 * If we aren't in the middle of scan (aka snarf), tell ufs
1614 	 * to hard lock itself.
1615 	 */
1616 	if ((ul->un_flags & LDL_SCAN) == 0)
1617 		ufs_trans_onerror();
1618 }
1619 
1620 size_t
1621 ldl_bufsize(ml_unit_t *ul)
1622 {
1623 	size_t		bufsize;
1624 	extern uint32_t	ldl_minbufsize;
1625 
1626 	/*
1627 	 * initial guess is the maxtransfer value for this log device
1628 	 * 	increase if too small
1629 	 * 	decrease if too large
1630 	 */
1631 	bufsize = dbtob(btod(ul->un_maxtransfer));
1632 	if (bufsize < ldl_minbufsize)
1633 		bufsize = ldl_minbufsize;
1634 	if (bufsize > maxphys)
1635 		bufsize = maxphys;
1636 	if (bufsize > ul->un_maxtransfer)
1637 		bufsize = ul->un_maxtransfer;
1638 	return (bufsize);
1639 }
1640