1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/vnode.h>
29 #include <sys/errno.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/kmem.h>
33 #include <sys/conf.h>
34 #include <sys/proc.h>
35 #include <sys/cmn_err.h>
36 #include <sys/fssnap_if.h>
37 #include <sys/fs/ufs_inode.h>
38 #include <sys/fs/ufs_filio.h>
39 #include <sys/fs/ufs_log.h>
40 #include <sys/fs/ufs_bio.h>
41 #include <sys/atomic.h>
42
43 extern int maxphys;
44 extern uint_t bypass_snapshot_throttle_key;
45
46 extern struct kmem_cache *lufs_sv;
47 extern struct kmem_cache *lufs_bp;
48
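/*
 * Wait for exclusive access to the buf.  If the previous I/O on the
 * buf failed, push the log into the error state with a message that
 * reflects the direction of that I/O.
 */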
49 static void
50 makebusy(ml_unit_t *ul, buf_t *bp)
51 {
52 sema_p(&bp->b_sem);
53 if ((bp->b_flags & B_ERROR) == 0)
54 return;
55 if (bp->b_flags & B_READ)
56 ldl_seterror(ul, "Error reading ufs log");
57 else
58 ldl_seterror(ul, "Error writing ufs log");
59 }
60
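/*
 * b_iodone routine for log bufs.  Mark the buf done and wake whoever
 * is waiting on it: writes simply release the buf's busy semaphore
 * (b_sem); reads wake the thread sleeping on b_io.
 */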
61 static int
62 logdone(buf_t *bp)
63 {
64 bp->b_flags |= B_DONE;
65
66 if (bp->b_flags & B_WRITE)
67 sema_v(&bp->b_sem);
68 else
69 /* wakeup the thread waiting on this buf */
70 sema_v(&bp->b_io);
71 return (0);
72 }
73
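/*
 * Completion routine for the cloned bufs issued by ldl_strategy().
 * Each clone subtracts its byte count from the save structure; the
 * clone that brings the count to zero propagates any error and
 * biodone()s the original buf.
 */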
74 static int
75 ldl_strategy_done(buf_t *cb)
76 {
77 lufs_save_t *sv;
78 lufs_buf_t *lbp;
79 buf_t *bp;
80
81 ASSERT(SEMA_HELD(&cb->b_sem));
82 ASSERT((cb->b_flags & B_DONE) == 0);
83
84 /*
85 * Compute address of the ``save'' struct
86 */
87 lbp = (lufs_buf_t *)cb;
88 sv = (lufs_save_t *)lbp->lb_ptr;
89
90 if (cb->b_flags & B_ERROR)
91 sv->sv_error = 1;
92
93 /*
94 * If this is the last request, release the resources and
95 * ``done'' the original buffer header.
96 */
97 if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
98 kmem_cache_free(lufs_bp, lbp);
99 return (1);
100 }
101 /* Propagate any errors back to the original buffer header */
102 bp = sv->sv_bp;
103 if (sv->sv_error)
104 bp->b_flags |= B_ERROR;
105 kmem_cache_free(lufs_bp, lbp);
106 kmem_cache_free(lufs_sv, sv);
107
108 biodone(bp);
109 return (0);
110 }
111
112 /*
113 * Map the log logical block number to a physical disk block number
114 */
115 static int
116 map_frag(
117 ml_unit_t *ul,
118 daddr_t lblkno,
119 size_t bcount,
120 daddr_t *pblkno,
121 size_t *pbcount)
122 {
123 ic_extent_t *ext = ul->un_ebp->ic_extents;
124 uint32_t e = ul->un_ebp->ic_nextents;
125 uint32_t s = 0;
126 uint32_t i = e >> 1;
127 uint32_t lasti = i;
128 uint32_t bno_off;
129
130 again:
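/*
 * Binary search the in-core extent array (sorted by logical block)
 * for the extent containing lblkno.  On a hit, return the physical
 * block and clamp *pbcount to the end of the extent; otherwise
 * return ENOENT.
 */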
131 if (ext[i].ic_lbno <= lblkno) {
132 if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
133 /* FOUND IT */
134 bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
135 *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
136 *pblkno = ext[i].ic_pbno + bno_off;
137 return (0);
138 } else
139 s = i;
140 } else
141 e = i;
142 i = s + ((e - s) >> 1);
143
144 if (i == lasti) {
145 *pbcount = bcount;
146 return (ENOENT);
147 }
148 lasti = i;
149
150 goto again;
151 }
152
153 /*
154 * The log is a set of extents (which typically will be only one, but
155 * may be more if the disk was close to full when the log was created)
156 * and hence the logical offsets into the log
157 * have to be translated into their real device locations before
158 * calling the device's strategy routine. The translation may result
159 * in several IO requests if this request spans extents.
160 */
161 void
162 ldl_strategy(ml_unit_t *ul, buf_t *pb)
163 {
164 lufs_save_t *sv;
165 lufs_buf_t *lbp;
166 buf_t *cb;
167 ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
168 daddr_t lblkno, pblkno;
169 size_t nb_left, pbcount;
170 off_t offset;
171 dev_t dev = ul->un_dev;
172 int error;
173 int read = pb->b_flags & B_READ;
174
175 /*
176 * Allocate and initialise the save structure.
177 */
178 sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
179 sv->sv_error = 0;
180 sv->sv_bp = pb;
181 nb_left = pb->b_bcount;
182 sv->sv_nb_left = nb_left;
183
184 lblkno = pb->b_blkno;
185 offset = 0;
186
187 do {
188 error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
189
190 lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
191 bioinit(&lbp->lb_buf);
192 lbp->lb_ptr = sv;
193
194 cb = bioclone(pb, offset, pbcount, dev,
195 pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
196
197 offset += pbcount;
198 lblkno += btodb(pbcount);
199 nb_left -= pbcount;
200
201 if (error) {
202 cb->b_flags |= B_ERROR;
203 cb->b_resid = cb->b_bcount;
204 biodone(cb);
205 } else {
206 if (read) {
207 logstats.ls_ldlreads.value.ui64++;
208 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
209 lwp_stat_update(LWP_STAT_INBLK, 1);
210 } else {
211 logstats.ls_ldlwrites.value.ui64++;
212 lwp_stat_update(LWP_STAT_OUBLK, 1);
213 }
214
215 /*
216 * write through the snapshot driver if necessary
217 * We do not want this write to be throttled because
218 * we are holding the un_log mutex here. If we
219 * are throttled in fssnap_translate, the fssnap_taskq
220 * thread which can wake us up can get blocked on
221 * the un_log mutex resulting in a deadlock.
222 */
223 if (ufsvfsp->vfs_snapshot) {
224 (void) tsd_set(bypass_snapshot_throttle_key,
225 (void *)1);
226 fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
227
228 (void) tsd_set(bypass_snapshot_throttle_key,
229 (void *)0);
230 } else {
231 (void) bdev_strategy(cb);
232 }
233 }
234
235 } while (nb_left);
236 }
237
238 static void
239 writelog(ml_unit_t *ul, buf_t *bp)
240 {
241 ASSERT(SEMA_HELD(&bp->b_sem));
242
243 /*
244 * This is really a B_ASYNC write but we want Presto to
245 * cache this write. The iodone routine, logdone, processes
246 * the buf correctly.
247 */
248 bp->b_flags = B_WRITE;
249 bp->b_edev = ul->un_dev;
250 bp->b_iodone = logdone;
251
252 /*
253 * return EIO for every IO if in hard error state
254 */
255 if (ul->un_flags & LDL_ERROR) {
256 bp->b_flags |= B_ERROR;
257 bp->b_error = EIO;
258 biodone(bp);
259 return;
260 }
261
262 ldl_strategy(ul, bp);
263 }
264
265 static void
266 readlog(ml_unit_t *ul, buf_t *bp)
267 {
268 ASSERT(SEMA_HELD(&bp->b_sem));
269 ASSERT(bp->b_bcount);
270
271 bp->b_flags = B_READ;
272 bp->b_edev = ul->un_dev;
273 bp->b_iodone = logdone;
274
275 /* all IO returns errors when in error state */
276 if (ul->un_flags & LDL_ERROR) {
277 bp->b_flags |= B_ERROR;
278 bp->b_error = EIO;
279 biodone(bp);
280 (void) trans_wait(bp);
281 return;
282 }
283
284 ldl_strategy(ul, bp);
285
286 if (trans_wait(bp))
287 ldl_seterror(ul, "Error reading ufs log");
288 }
289
290 /*
291 * NOTE: writers are single threaded thru the log layer.
292 * This means we can safely reference and change the cb and bp fields
293 * that ldl_read does not reference w/o holding the cb_rwlock or
294 * the bp makebusy lock.
295 */
296 static void
297 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
298 {
299 buf_t *newbp;
300 cirbuf_t *cb = &ul->un_wrbuf;
301
302 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
303 ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
304
305 /*
306 * async write the buf
307 */
308 writelog(ul, bp);
309
310 /*
311 * no longer filling any buf
312 */
313 cb->cb_dirty = NULL;
314
315 /*
316 * no extra buffer space; all done
317 */
318 if (bp->b_bcount == bp->b_bufsize)
319 return;
320
321 /*
322 * give extra buffer space to a new bp
323 * try to take buf off of free list
324 */
325 if ((newbp = cb->cb_free) != NULL) {
326 cb->cb_free = newbp->b_forw;
327 } else {
328 newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
329 sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
330 sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
331 }
332 newbp->b_flags = 0;
333 newbp->b_bcount = 0;
334 newbp->b_file = NULL;
335 newbp->b_offset = -1;
336 newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
337 newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
338 bp->b_bufsize = bp->b_bcount;
339
340 /*
341 * lock out readers and put new buf at LRU position
342 */
343 rw_enter(&cb->cb_rwlock, RW_WRITER);
344 newbp->b_forw = bp->b_forw;
345 newbp->b_back = bp;
346 bp->b_forw->b_back = newbp;
347 bp->b_forw = newbp;
348 rw_exit(&cb->cb_rwlock);
349 }
350
351 static void
352 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
353 {
354 buf_t *bp;
355 off_t elof = lof + nb;
356 off_t buflof;
357 off_t bufelof;
358
359 /*
360 * discard all bufs that overlap the range (lof, lof + nb)
361 */
362 rw_enter(&cb->cb_rwlock, RW_WRITER);
363 bp = cb->cb_bp;
364 do {
365 if (bp == cb->cb_dirty || bp->b_bcount == 0) {
366 bp = bp->b_forw;
367 continue;
368 }
369 buflof = dbtob(bp->b_blkno);
370 bufelof = buflof + bp->b_bcount;
371 if ((buflof < lof && bufelof <= lof) ||
372 (buflof >= elof && bufelof > elof)) {
373 bp = bp->b_forw;
374 continue;
375 }
376 makebusy(ul, bp);
377 bp->b_flags = 0;
378 bp->b_bcount = 0;
379 sema_v(&bp->b_sem);
380 bp = bp->b_forw;
381 } while (bp != cb->cb_bp);
382 rw_exit(&cb->cb_rwlock);
383 }
384
385 /*
386 * NOTE: writers are single threaded thru the log layer.
387 * This means we can safely reference and change the cb and bp fields
388 * that ldl_read does not reference w/o holding the cb_rwlock or
389 * the bp makebusy lock.
390 */
391 static buf_t *
392 get_write_bp(ml_unit_t *ul)
393 {
394 cirbuf_t *cb = &ul->un_wrbuf;
395 buf_t *bp;
396
397 /*
398 * cb_dirty is the buffer we are currently filling, if any
399 */
400 if ((bp = cb->cb_dirty) != NULL) {
401 makebusy(ul, bp);
402 return (bp);
403 }
404 /*
405 * discard any bp that overlaps the current tail since we are
406 * about to overwrite it.
407 */
408 inval_range(ul, cb, ul->un_tail_lof, 1);
409
410 /*
411 * steal LRU buf
412 */
413 rw_enter(&cb->cb_rwlock, RW_WRITER);
414 bp = cb->cb_bp->b_forw;
415 makebusy(ul, bp);
416
417 cb->cb_dirty = bp;
418 cb->cb_bp = bp;
419
420 bp->b_flags = 0;
421 bp->b_bcount = 0;
422 bp->b_blkno = btodb(ul->un_tail_lof);
423 ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
424 rw_exit(&cb->cb_rwlock);
425
426 /*
427 * NOTE:
428 * 1. un_tail_lof never addresses >= un_eol_lof
429 * 2. b_blkno + btodb(b_bufsize) may > un_eol_lof
430 * this case is handled in storebuf
431 */
432 return (bp);
433 }
434
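/*
 * Allocate the write circular buffer: one contiguous arena of
 * bufsize bytes, three spare buf headers on the free list, and a
 * first buf that initially claims the entire arena.
 */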
435 void
436 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
437 {
438 int i;
439 buf_t *bp;
440
441 /*
442 * Clear previous allocation
443 */
444 if (cb->cb_nb)
445 free_cirbuf(cb);
446
447 bzero(cb, sizeof (*cb));
448 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
449
450 rw_enter(&cb->cb_rwlock, RW_WRITER);
451
452 /*
453 * preallocate 3 bp's and put them on the free list.
454 */
455 for (i = 0; i < 3; ++i) {
456 bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
457 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
458 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
459 bp->b_offset = -1;
460 bp->b_forw = cb->cb_free;
461 cb->cb_free = bp;
462 }
463
464 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
465 cb->cb_nb = bufsize;
466
467 /*
468 * first bp claims entire write buffer
469 */
470 bp = cb->cb_free;
471 cb->cb_free = bp->b_forw;
472
473 bp->b_forw = bp;
474 bp->b_back = bp;
475 cb->cb_bp = bp;
476 bp->b_un.b_addr = cb->cb_va;
477 bp->b_bufsize = cb->cb_nb;
478
479 rw_exit(&cb->cb_rwlock);
480 }
481
482 void
483 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
484 {
485 caddr_t va;
486 size_t nb;
487 buf_t *bp;
488
489 /*
490 * Clear previous allocation
491 */
492 if (cb->cb_nb)
493 free_cirbuf(cb);
494
495 bzero(cb, sizeof (*cb));
496 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
497
498 rw_enter(&cb->cb_rwlock, RW_WRITER);
499
500 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
501 cb->cb_nb = bufsize;
502
503 /*
504 * preallocate N bufs that are hard-sized to blksize;
505 * in other words, the read buffer pool is a linked list
506 * of statically sized bufs.
507 */
508 va = cb->cb_va;
509 while ((nb = bufsize) != 0) {
510 if (nb > blksize)
511 nb = blksize;
512 bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
513 bzero(bp, sizeof (buf_t));
514 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
515 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
516 bp->b_un.b_addr = va;
517 bp->b_bufsize = nb;
518 if (cb->cb_bp) {
519 bp->b_forw = cb->cb_bp->b_forw;
520 bp->b_back = cb->cb_bp;
521 cb->cb_bp->b_forw->b_back = bp;
522 cb->cb_bp->b_forw = bp;
523 } else
524 bp->b_forw = bp->b_back = bp;
525 cb->cb_bp = bp;
526 bufsize -= nb;
527 va += nb;
528 }
529
530 rw_exit(&cb->cb_rwlock);
531 }
532
533 void
534 free_cirbuf(cirbuf_t *cb)
535 {
536 buf_t *bp;
537
538 if (cb->cb_nb == 0)
539 return;
540
541 rw_enter(&cb->cb_rwlock, RW_WRITER);
542 ASSERT(cb->cb_dirty == NULL);
543
544 /*
545 * free the active bufs
546 */
547 while ((bp = cb->cb_bp) != NULL) {
548 if (bp == bp->b_forw)
549 cb->cb_bp = NULL;
550 else
551 cb->cb_bp = bp->b_forw;
552 bp->b_back->b_forw = bp->b_forw;
553 bp->b_forw->b_back = bp->b_back;
554 sema_destroy(&bp->b_sem);
555 sema_destroy(&bp->b_io);
556 kmem_free(bp, sizeof (buf_t));
557 }
558
559 /*
560 * free the free bufs
561 */
562 while ((bp = cb->cb_free) != NULL) {
563 cb->cb_free = bp->b_forw;
564 sema_destroy(&bp->b_sem);
565 sema_destroy(&bp->b_io);
566 kmem_free(bp, sizeof (buf_t));
567 }
568 kmem_free(cb->cb_va, cb->cb_nb);
569 cb->cb_va = NULL;
570 cb->cb_nb = 0;
571 rw_exit(&cb->cb_rwlock);
572 rw_destroy(&cb->cb_rwlock);
573 }
574
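/*
 * Return true if byte offset lof falls within the bcount bytes of
 * log data that start at disk block blkno.
 */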
575 static int
576 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
577 {
578 off_t blof = dbtob(blkno);
579
580 return ((lof >= blof) && (lof < (blof + bcount)));
581 }
582
583 static buf_t *
584 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
585 {
586 buf_t *bp;
587
588 /*
589 * find a buf that contains the offset lof
590 */
591 rw_enter(&cb->cb_rwlock, RW_READER);
592 bp = cb->cb_bp;
593 do {
594 if (bp->b_bcount &&
595 within_range(lof, bp->b_blkno, bp->b_bcount)) {
596 makebusy(ul, bp);
597 rw_exit(&cb->cb_rwlock);
598 return (bp);
599 }
600 bp = bp->b_forw;
601 } while (bp != cb->cb_bp);
602 rw_exit(&cb->cb_rwlock);
603
604 return (NULL);
605 }
606
607 static off_t
608 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
609 {
610 buf_t *bp, *bpend;
611 off_t rlof;
612
613 /*
614 * we mustn't:
615 * o read past eol
616 * o read past the tail
617 * o read data that may be being written.
618 */
619 rw_enter(&cb->cb_rwlock, RW_READER);
620 bpend = bp = cb->cb_bp->b_forw;
621 rlof = ul->un_tail_lof;
622 do {
623 if (bp->b_bcount) {
624 rlof = dbtob(bp->b_blkno);
625 break;
626 }
627 bp = bp->b_forw;
628 } while (bp != bpend);
629 rw_exit(&cb->cb_rwlock);
630
631 if (lof <= rlof)
632 /* lof is prior to the range represented by the write buf */
633 return (rlof);
634 else
635 /* lof follows the range represented by the write buf */
636 return ((off_t)ul->un_eol_lof);
637 }
638
639 static buf_t *
640 get_read_bp(ml_unit_t *ul, off_t lof)
641 {
642 cirbuf_t *cb;
643 buf_t *bp;
644 off_t rlof;
645
646 /*
647 * retrieve as much data as possible from the incore buffers
648 */
649 if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
650 logstats.ls_lreadsinmem.value.ui64++;
651 return (bp);
652 }
653 if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
654 logstats.ls_lreadsinmem.value.ui64++;
655 return (bp);
656 }
657
658 /*
659 * steal the LRU buf
660 */
661 cb = &ul->un_rdbuf;
662 rw_enter(&cb->cb_rwlock, RW_WRITER);
663 bp = cb->cb_bp->b_forw;
664 makebusy(ul, bp);
665 bp->b_flags = 0;
666 bp->b_bcount = 0;
667 cb->cb_bp = bp;
668 rw_exit(&cb->cb_rwlock);
669
670 /*
671 * don't read past the tail or the end-of-log
672 */
673 bp->b_blkno = btodb(lof);
674 lof = dbtob(bp->b_blkno);
675 rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
676 bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
677 readlog(ul, bp);
678 return (bp);
679 }
680
681 /*
682 * NOTE: writers are single threaded thru the log layer.
683 * This means we can safely reference and change the cb and bp fields
684 * that ldl_read does not reference w/o holding the cb_rwlock or
685 * the bp makebusy lock.
686 */
687 static int
688 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
689 {
690 buf_t *bpforw = bp->b_forw;
691
692 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
693
694 /*
695 * there is no `next' bp; do nothing
696 */
697 if (bpforw == bp)
698 return (0);
699
700 /*
701 * buffer space is not adjacent; do nothing
702 */
703 if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
704 return (0);
705
706 /*
707 * locking protocol requires giving up any bp locks before
708 * acquiring cb_rwlock. This is okay because we hold
709 * un_log_mutex.
710 */
711 sema_v(&bp->b_sem);
712
713 /*
714 * lock out ldl_read
715 */
716 rw_enter(&cb->cb_rwlock, RW_WRITER);
717
718 /*
719 * wait for current IO to finish w/ next bp, if necessary
720 */
721 makebusy(ul, bpforw);
722
723 /*
724 * free the next bp and steal its space
725 */
726 bp->b_forw = bpforw->b_forw;
727 bpforw->b_forw->b_back = bp;
728 bp->b_bufsize += bpforw->b_bufsize;
729 sema_v(&bpforw->b_sem);
730 bpforw->b_forw = cb->cb_free;
731 cb->cb_free = bpforw;
732 makebusy(ul, bp);
733 rw_exit(&cb->cb_rwlock);
734
735 return (1);
736 }
737
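/*
 * Copy caller data into the dirty write buf one sector at a time.
 * Each DEV_BSIZE log sector holds LDL_USABLE_BSIZE bytes of delta
 * data followed by a sect_trailer_t:
 *
 *	+-------------------------+----------------+
 *	|  delta headers and data | sect_trailer_t |
 *	+-------------------------+----------------+
 *	0              LDL_USABLE_BSIZE       DEV_BSIZE
 *
 * The tail (un_tail_lof) advances past each trailer as it is filled
 * in.  When the tail wraps to the beginning of the log, or the buf
 * is full and cannot be extended, the buf is pushed with an async
 * write.  Returns the number of caller bytes consumed.
 */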
738 static size_t
739 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
740 {
741 size_t copy_nb;
742 size_t nb_in_sec;
743 sect_trailer_t *st;
744 size_t nb_left = nb;
745 cirbuf_t *cb = &ul->un_wrbuf;
746
747 again:
748 nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
749 copy_nb = MIN(nb_left, nb_in_sec);
750
751 ASSERT(copy_nb);
752
753 bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
754 bp->b_bcount += copy_nb;
755 va += copy_nb;
756 nb_left -= copy_nb;
757 ul->un_tail_lof += copy_nb;
758
759 if ((nb_in_sec -= copy_nb) == 0) {
760 st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
761
762 st->st_tid = ul->un_logmap->mtm_tid;
763 st->st_ident = ul->un_tail_ident++;
764 bp->b_bcount += sizeof (sect_trailer_t);
765 ul->un_tail_lof += sizeof (sect_trailer_t);
766 /*
767 * log wrapped; async write this bp
768 */
769 if (ul->un_tail_lof == ul->un_eol_lof) {
770 ul->un_tail_lof = ul->un_bol_lof;
771 push_dirty_bp(ul, bp);
772 return (nb - nb_left);
773 }
774 /*
775 * out of bp space; get more or async write buf
776 */
777 if (bp->b_bcount == bp->b_bufsize) {
778 if (!extend_write_bp(ul, cb, bp)) {
779 push_dirty_bp(ul, bp);
780 return (nb - nb_left);
781 }
782 }
783 }
784 if (nb_left)
785 goto again;
786
787 sema_v(&bp->b_sem);
788 return (nb);
789 }
790
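/*
 * The delta described by me is all zeroes (DT_ABZERO); zero-fill the
 * portion of the caller's buffer that overlaps it instead of reading
 * from the log.
 */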
791 static void
792 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
793 {
794 offset_t src_mof = me->me_mof;
795 size_t src_nb = me->me_nb;
796
797 if (src_mof > dst_mof) {
798 ASSERT(src_mof < (dst_mof + dst_nb));
799 dst_va += (src_mof - dst_mof);
800 dst_nb -= (src_mof - dst_mof);
801 } else {
802 ASSERT(dst_mof < (src_mof + src_nb));
803 src_nb -= (dst_mof - src_mof);
804 }
805
806 src_nb = MIN(src_nb, dst_nb);
807 ASSERT(src_nb);
808 bzero(dst_va, src_nb);
809 }
810
811 /*
812 * dst_va == NULL means seek only; don't copy anything (see logseek())
813 */
814 static ulong_t
815 fetchbuf(
816 ml_unit_t *ul,
817 buf_t *bp,
818 caddr_t dst_va,
819 size_t dst_nb,
820 off_t *dst_lofp)
821 {
822 caddr_t copy_va;
823 size_t copy_nb;
824 size_t nb_sec;
825 off_t dst_lof = *dst_lofp;
826 ulong_t sav_dst_nb = dst_nb;
827 ulong_t src_nb = bp->b_bcount;
828 off_t src_lof = dbtob(bp->b_blkno);
829 off_t src_elof = src_lof + src_nb;
830 caddr_t src_va = bp->b_un.b_addr;
831
832 /*
833 * copy from bp to dst_va
834 */
835 while (dst_nb) {
836 /*
837 * compute address within bp
838 */
839 copy_va = src_va + (dst_lof - src_lof);
840
841 /*
842 * adjust copy size to amount of data in bp
843 */
844 copy_nb = MIN(dst_nb, src_elof - dst_lof);
845
846 /*
847 * adjust copy size to amount of data in sector
848 */
849 nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
850 copy_nb = MIN(copy_nb, nb_sec);
851
852 /*
853 * dst_va == NULL means don't do copy (see logseek())
854 */
855 if (dst_va) {
856 bcopy(copy_va, dst_va, copy_nb);
857 dst_va += copy_nb;
858 }
859 dst_lof += copy_nb;
860 dst_nb -= copy_nb;
861 nb_sec -= copy_nb;
862
863 /*
864 * advance over sector trailer
865 */
866 if (nb_sec == 0)
867 dst_lof += sizeof (sect_trailer_t);
868
869 /*
870 * exhausted buffer
871 * return current lof for next read
872 */
873 if (dst_lof == src_elof) {
874 sema_v(&bp->b_sem);
875 if (dst_lof == ul->un_eol_lof)
876 dst_lof = ul->un_bol_lof;
877 *dst_lofp = dst_lof;
878 return (sav_dst_nb - dst_nb);
879 }
880 }
881
882 /*
883 * copy complete - return current lof
884 */
885 sema_v(&bp->b_sem);
886 *dst_lofp = dst_lof;
887 return (sav_dst_nb);
888 }
889
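/*
 * Round the dirty write buf up to the next sector boundary, fill in
 * that sector's trailer, and advance the tail (wrapping to the
 * beginning of the log if necessary).  The buf is pushed if the tail
 * wrapped or the buf is now full; otherwise it is left for reuse.
 */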
890 void
891 ldl_round_commit(ml_unit_t *ul)
892 {
893 int wrapped;
894 buf_t *bp;
895 sect_trailer_t *st;
896 size_t bcount;
897 cirbuf_t *cb = &ul->un_wrbuf;
898
899 /*
900 * if nothing to write; then do nothing
901 */
902 if ((bp = cb->cb_dirty) == NULL)
903 return;
904 makebusy(ul, bp);
905
906 /*
907 * round up to sector boundary and set new tail
908 * don't readjust st_ident if buf is already rounded
909 */
910 bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
911 if (bcount == bp->b_bcount) {
912 sema_v(&bp->b_sem);
913 return;
914 }
915 bp->b_bcount = bcount;
916 ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
917 wrapped = 0;
918 if (ul->un_tail_lof == ul->un_eol_lof) {
919 ul->un_tail_lof = ul->un_bol_lof;
920 ++wrapped;
921 }
922 ASSERT(ul->un_tail_lof != ul->un_head_lof);
923
924 /*
925 * fix up the sector trailer
926 */
927 /* LINTED */
928 st = (sect_trailer_t *)
929 ((bp->b_un.b_addr + bcount) - sizeof (*st));
930 st->st_tid = ul->un_logmap->mtm_tid;
931 st->st_ident = ul->un_tail_ident++;
932
933 /*
934 * if tail wrapped or we have exhausted this buffer
935 * async write the buffer
936 */
937 if (wrapped || bcount == bp->b_bufsize)
938 push_dirty_bp(ul, bp);
939 else
940 sema_v(&bp->b_sem);
941 }
942
943 void
944 ldl_push_commit(ml_unit_t *ul)
945 {
946 buf_t *bp;
947 cirbuf_t *cb = &ul->un_wrbuf;
948
949 /*
950 * if nothing to write; then do nothing
951 */
952 if ((bp = cb->cb_dirty) == NULL)
953 return;
954 makebusy(ul, bp);
955 push_dirty_bp(ul, bp);
956 }
957
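/*
 * Return true when outstanding reservations exceed 75% of the
 * maximum log reservation, i.e., it is time to commit.
 */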
958 int
959 ldl_need_commit(ml_unit_t *ul)
960 {
961 return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
962 }
963
964 int
965 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
966 {
967 off_t nfb;
968 off_t nb;
969
970 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
971
972 /*
973 * Add up the size used by the deltas
974 * round nb up to a sector length plus an extra sector
975 * w/o the extra sector we couldn't distinguish
976 * a full log (head == tail) from an empty log (head == tail)
977 */
978 for (nb = DEV_BSIZE; me; me = me->me_hash) {
979 nb += sizeof (struct delta);
980 if (me->me_dt != DT_CANCEL)
981 nb += me->me_nb;
982 }
983 nb = P2ROUNDUP(nb, DEV_BSIZE);
984
985 if (ul->un_head_lof <= ul->un_tail_lof)
986 nfb = (ul->un_head_lof - ul->un_bol_lof) +
987 (ul->un_eol_lof - ul->un_tail_lof);
988 else
989 nfb = ul->un_head_lof - ul->un_tail_lof;
990
991 return (nb < nfb);
992 }
993
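/*
 * Append a delta to the log: first the struct delta header, then, for
 * deltas other than commit, cancel, and abzero, the delta's data.
 * me->me_lof records where the data (or, for the dataless delta
 * types, the next delta) will be found in the log.
 */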
994 void
995 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
996 {
997 buf_t *bp;
998 caddr_t va;
999 size_t nb;
1000 size_t actual;
1001
1002 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1003
1004 /* Write the delta */
1005
1006 nb = sizeof (struct delta);
1007 va = (caddr_t)&me->me_delta;
1008 bp = get_write_bp(ul);
1009
1010 while (nb) {
1011 if (ul->un_flags & LDL_ERROR) {
1012 sema_v(&bp->b_sem);
1013 return;
1014 }
1015 actual = storebuf(ul, bp, va, nb);
1016 ASSERT(actual);
1017 va += actual;
1018 nb -= actual;
1019 if (nb)
1020 bp = get_write_bp(ul);
1021 }
1022
1023 /* If a commit, cancel, or 0's; we're almost done */
1024 switch (me->me_dt) {
1025 case DT_COMMIT:
1026 case DT_CANCEL:
1027 case DT_ABZERO:
1028 /* roll needs to know where the next delta will go */
1029 me->me_lof = ul->un_tail_lof;
1030 return;
1031 default:
1032 break;
1033 }
1034
1035 /* Now write the data */
1036
1037 ASSERT(me->me_nb != 0);
1038
1039 nb = me->me_nb;
1040 va = (me->me_mof - bufmof) + bufp;
1041 bp = get_write_bp(ul);
1042
1043 /* Save where we will put the data */
1044 me->me_lof = ul->un_tail_lof;
1045
1046 while (nb) {
1047 if (ul->un_flags & LDL_ERROR) {
1048 sema_v(&bp->b_sem);
1049 return;
1050 }
1051 actual = storebuf(ul, bp, va, nb);
1052 ASSERT(actual);
1053 va += actual;
1054 nb -= actual;
1055 if (nb)
1056 bp = get_write_bp(ul);
1057 }
1058 }
1059
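/*
 * Wait for all outstanding log writes to complete by making each
 * not-yet-done buf in the write ring busy (which blocks until its
 * I/O finishes) and then releasing it.
 */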
1060 void
1061 ldl_waito(ml_unit_t *ul)
1062 {
1063 buf_t *bp;
1064 cirbuf_t *cb = &ul->un_wrbuf;
1065
1066 rw_enter(&cb->cb_rwlock, RW_WRITER);
1067 /*
1068 * wait on them
1069 */
1070 bp = cb->cb_bp;
1071 do {
1072 if ((bp->b_flags & B_DONE) == 0) {
1073 makebusy(ul, bp);
1074 sema_v(&bp->b_sem);
1075 }
1076 bp = bp->b_forw;
1077 } while (bp != cb->cb_bp);
1078 rw_exit(&cb->cb_rwlock);
1079 }
1080
1081 /*
1082 * seek nb bytes from location lof
1083 */
1084 static int
1085 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1086 {
1087 buf_t *bp;
1088 ulong_t actual;
1089
1090 while (nb) {
1091 bp = get_read_bp(ul, lof);
1092 if (bp->b_flags & B_ERROR) {
1093 sema_v(&bp->b_sem);
1094 return (EIO);
1095 }
1096 actual = fetchbuf(ul, bp, NULL, nb, &lof);
1097 ASSERT(actual);
1098 nb -= actual;
1099 }
1100 *lofp = lof;
1101 ASSERT(nb == 0);
1102 return (0);
1103 }
1104
1105 int
1106 ldl_read(
1107 ml_unit_t *ul, /* Log unit */
1108 caddr_t va, /* address of buffer to read into */
1109 offset_t mof, /* mof of buffer */
1110 off_t nb, /* length of buffer */
1111 mapentry_t *me) /* Map entry list */
1112 {
1113 buf_t *bp;
1114 crb_t *crb;
1115 caddr_t rva; /* address to read into */
1116 size_t rnb; /* # of bytes to read */
1117 off_t lof; /* log device offset to read from */
1118 off_t skip;
1119 ulong_t actual;
1120 int error;
1121 caddr_t eva = va + nb; /* end of buffer */
1122
1123 for (; me; me = me->me_agenext) {
1124 ASSERT(me->me_dt != DT_CANCEL);
1125
1126 /*
1127 * check for a cached roll buffer
1128 */
1129 crb = me->me_crb;
1130 if (crb) {
1131 if (mof > crb->c_mof) {
1132 /*
1133 * This mapentry overlaps with the beginning of
1134 * the supplied buffer
1135 */
1136 skip = mof - crb->c_mof;
1137 bcopy(crb->c_buf + skip, va,
1138 MIN(nb, crb->c_nb - skip));
1139 } else {
1140 /*
1141 * This mapentry starts at or after
1142 * the supplied buffer.
1143 */
1144 skip = crb->c_mof - mof;
1145 bcopy(crb->c_buf, va + skip,
1146 MIN(crb->c_nb, nb - skip));
1147 }
1148 logstats.ls_lreadsinmem.value.ui64++;
1149 continue;
1150 }
1151
1152 /*
1153 * check for a delta full of zeroes - there's no log data
1154 */
1155 if (me->me_dt == DT_ABZERO) {
1156 fetchzeroes(va, mof, nb, me);
1157 continue;
1158 }
1159
1160 if (mof > me->me_mof) {
1161 rnb = (size_t)(mof - me->me_mof);
1162 error = logseek(ul, me->me_lof, rnb, &lof);
1163 if (error)
1164 return (EIO);
1165 rva = va;
1166 rnb = me->me_nb - rnb;
1167 rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1168 } else {
1169 lof = me->me_lof;
1170 rva = (me->me_mof - mof) + va;
1171 rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1172 }
1173
1174 while (rnb) {
1175 bp = get_read_bp(ul, lof);
1176 if (bp->b_flags & B_ERROR) {
1177 sema_v(&bp->b_sem);
1178 return (EIO);
1179 }
1180 ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1181 (bp != ul->un_wrbuf.cb_dirty));
1182 actual = fetchbuf(ul, bp, rva, rnb, &lof);
1183 ASSERT(actual);
1184 rva += actual;
1185 rnb -= actual;
1186 }
1187 }
1188 return (0);
1189 }
1190
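/*
 * Write the in-core log state (ml_odunit) to the two on-disk state
 * sectors, with od_chksum set to head ident + tail ident.  The write
 * goes through the snapshot driver when a snapshot is active.
 */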
1191 void
1192 ldl_savestate(ml_unit_t *ul)
1193 {
1194 int error;
1195 buf_t *bp = ul->un_bp;
1196 ml_odunit_t *ud = (void *)bp->b_un.b_addr;
1197 ml_odunit_t *ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
1198
1199 #if DEBUG
1200 /*
1201 * Scan test is running; don't update intermediate state
1202 */
1203 if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1204 return;
1205 #endif /* DEBUG */
1206
1207 mutex_enter(&ul->un_state_mutex);
1208 bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1209 ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1210 bcopy(ud, ud2, sizeof (*ud));
1211
1212 /* If a snapshot is enabled, write through the snapshot driver. */
1213 if (ul->un_ufsvfs->vfs_snapshot)
1214 UFS_BWRITE2(ul->un_ufsvfs, bp);
1215 else
1216 BWRITE2(bp);
1217 logstats.ls_ldlwrites.value.ui64++;
1218 error = bp->b_flags & B_ERROR;
1219 mutex_exit(&ul->un_state_mutex);
1220 if (error)
1221 ldl_seterror(ul, "Error writing ufs log state");
1222 }
1223
1224 /*
1225 * The head will be set to (new_lof - header) since ldl_sethead is
1226 * called with the new_lof of the data portion of a delta.
1227 */
1228 void
1229 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1230 {
1231 off_t nb;
1232 off_t new_lof;
1233 uint32_t new_ident;
1234 daddr_t beg_blkno;
1235 daddr_t end_blkno;
1236
1237 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1238
1239 if (data_lof == -1) {
1240 /* log is empty */
1241 new_ident = lufs_hd_genid(ul);
1242 new_lof = ul->un_tail_lof;
1243
1244 } else {
1245 /* compute header's lof */
1246 new_ident = ul->un_head_ident;
1247 new_lof = data_lof - sizeof (struct delta);
1248
1249 /* whoops, header spans sectors; subtract out sector trailer */
1250 if (btodb(new_lof) != btodb(data_lof))
1251 new_lof -= sizeof (sect_trailer_t);
1252
1253 /* whoops, header wrapped the log; go to last sector */
1254 if (new_lof < ul->un_bol_lof) {
1255 /* sector offset */
1256 new_lof -= dbtob(btodb(new_lof));
1257 /* add to last sector's lof */
1258 new_lof += (ul->un_eol_lof - DEV_BSIZE);
1259 }
1260 ul->un_head_tid = tid;
1261 }
1262
1263 /*
1264 * check for nop
1265 */
1266 if (new_lof == ul->un_head_lof)
1267 return;
1268
1269 /*
1270 * invalidate the affected bufs and calculate new ident
1271 */
1272 if (new_lof > ul->un_head_lof) {
1273 nb = new_lof - ul->un_head_lof;
1274 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1275 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1276
1277 end_blkno = btodb(new_lof);
1278 beg_blkno = btodb(ul->un_head_lof);
1279 new_ident += (end_blkno - beg_blkno);
1280 } else {
1281 nb = ul->un_eol_lof - ul->un_head_lof;
1282 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1283 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1284
1285 end_blkno = btodb(ul->un_eol_lof);
1286 beg_blkno = btodb(ul->un_head_lof);
1287 new_ident += (end_blkno - beg_blkno);
1288
1289 nb = new_lof - ul->un_bol_lof;
1290 inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1291 inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1292
1293 end_blkno = btodb(new_lof);
1294 beg_blkno = btodb(ul->un_bol_lof);
1295 new_ident += (end_blkno - beg_blkno);
1296 }
1297 /*
1298 * don't update the head if there has been an error
1299 */
1300 if (ul->un_flags & LDL_ERROR)
1301 return;
1302
1303 /* Fix up the head and ident */
1304 ASSERT(new_lof >= ul->un_bol_lof);
1305 ul->un_head_lof = new_lof;
1306 ul->un_head_ident = new_ident;
1307 if (data_lof == -1) {
1308 ul->un_tail_ident = ul->un_head_ident;
1309 }
1310
1311
1312 /* Commit to the database */
1313 ldl_savestate(ul);
1314
1315 ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1316 ldl_sethead_debug(ul));
1317 }
1318
1319 /*
1320 * The tail will be set to the sector following lof+nb
1321 * lof + nb == size of the last delta + commit record
1322 * this function is called once after the log scan has completed.
1323 */
1324 void
1325 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1326 {
1327 off_t new_lof;
1328 uint32_t new_ident;
1329 daddr_t beg_blkno;
1330 daddr_t end_blkno;
1331
1332 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1333
1334 if (lof == -1) {
1335 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1336 ul->un_head_lof = ul->un_tail_lof;
1337 ul->un_head_ident = lufs_hd_genid(ul);
1338 ul->un_tail_ident = ul->un_head_ident;
1339
1340 /* Commit to the database */
1341 ldl_savestate(ul);
1342
1343 return;
1344 }
1345
1346 /*
1347 * new_lof is the offset of the sector following the last commit
1348 */
1349 (void) logseek(ul, lof, nb, &new_lof);
1350 ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1351
1352 /*
1353 * calculate new ident
1354 */
1355 if (new_lof > ul->un_head_lof) {
1356 end_blkno = btodb(new_lof);
1357 beg_blkno = btodb(ul->un_head_lof);
1358 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1359 } else {
1360 end_blkno = btodb(ul->un_eol_lof);
1361 beg_blkno = btodb(ul->un_head_lof);
1362 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1363
1364 end_blkno = btodb(new_lof);
1365 beg_blkno = btodb(ul->un_bol_lof);
1366 new_ident += (end_blkno - beg_blkno);
1367 }
1368
1369 /* Fix up the tail and ident */
1370 ul->un_tail_lof = new_lof;
1371 ul->un_tail_ident = new_ident;
1372
1373 /* Commit to the database */
1374 ldl_savestate(ul);
1375 }
1376
1377 /*
1378 * LOGSCAN STUFF
1379 */
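/*
 * Compute the ident expected for the first sector of bp (from the
 * head's ident and the sector distance from the head), then shrink
 * b_bcount down to the last sector whose trailer ident matches.
 * Returns true if lof still lies within the valid part of the buf.
 */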
1380 static int
1381 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1382 {
1383 ulong_t ident;
1384 size_t nblk, i;
1385 sect_trailer_t *st;
1386
1387 /*
1388 * compute ident for first sector in the buffer
1389 */
1390 ident = ul->un_head_ident;
1391 if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1392 ident += (bp->b_blkno - btodb(ul->un_head_lof));
1393 } else {
1394 ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1395 ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1396 }
1397 /*
1398 * truncate the buffer down to the last valid sector
1399 */
1400 nblk = btodb(bp->b_bcount);
1401 bp->b_bcount = 0;
1402 /* LINTED */
1403 st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1404 for (i = 0; i < nblk; ++i) {
1405 if (st->st_ident != ident)
1406 break;
1407
1408 /* remember last valid tid for ldl_logscan_error() */
1409 ul->un_tid = st->st_tid;
1410
1411 /* LINTED */
1412 st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1413 ++ident;
1414 bp->b_bcount += DEV_BSIZE;
1415 }
1416 /*
1417 * make sure that lof is still within range
1418 */
1419 return (within_range(lof, bp->b_blkno, bp->b_bcount));
1420 }
1421
1422 ulong_t
1423 ldl_logscan_nbcommit(off_t lof)
1424 {
1425 /*
1426 * lof is the offset following the commit header. However,
1427 * if the commit header fell on the end-of-sector, then lof
1428 * has already been advanced to the beginning of the next
1429 * sector. So do nothing. Otherwise, return the remaining
1430 * bytes in the sector.
1431 */
1432 if ((lof & (DEV_BSIZE - 1)) == 0)
1433 return (0);
1434 return (NB_LEFT_IN_SECTOR(lof));
1435 }
1436
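/*
 * Read log data during log scan.  The range must lie between the
 * head and the tail, and every sector's trailer ident must be in
 * sequence; otherwise EIO is returned.  *lofp is advanced past the
 * data read.
 */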
1437 int
1438 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1439 {
1440 buf_t *bp;
1441 ulong_t actual;
1442
1443 ASSERT(ul->un_head_lof != ul->un_tail_lof);
1444
1445 /*
1446 * Check the log data doesn't go out of bounds
1447 */
1448 if (ul->un_head_lof < ul->un_tail_lof) {
1449 if (!WITHIN(*lofp, nb, ul->un_head_lof,
1450 (ul->un_tail_lof - ul->un_head_lof))) {
1451 return (EIO);
1452 }
1453 } else {
1454 if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1455 (ul->un_head_lof - ul->un_tail_lof))) {
1456 return (EIO);
1457 }
1458 }
1459
1460 while (nb) {
1461 bp = get_read_bp(ul, *lofp);
1462 if (bp->b_flags & B_ERROR) {
1463 sema_v(&bp->b_sem);
1464 return (EIO);
1465 }
1466 /*
1467 * out-of-seq idents means partial transaction
1468 * panic, non-corrupting powerfail, ...
1469 */
1470 if (!ldl_logscan_ident(ul, bp, *lofp)) {
1471 sema_v(&bp->b_sem);
1472 return (EIO);
1473 }
1474 /*
1475 * copy the header into the caller's buf
1476 */
1477 actual = fetchbuf(ul, bp, va, nb, lofp);
1478 if (va)
1479 va += actual;
1480 nb -= actual;
1481 }
1482 return (0);
1483 }
1484
1485 void
1486 ldl_logscan_begin(ml_unit_t *ul)
1487 {
1488 size_t bufsize;
1489
1490 ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1491
1492 /*
1493 * logscan has begun
1494 */
1495 ul->un_flags |= LDL_SCAN;
1496
1497 /*
1498 * reset the circular bufs
1499 */
1500 bufsize = ldl_bufsize(ul);
1501 alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1502 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1503
1504 /*
1505 * set the tail to reflect a full log
1506 */
1507 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1508
1509 if (ul->un_tail_lof < ul->un_bol_lof)
1510 ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1511 if (ul->un_tail_lof >= ul->un_eol_lof)
1512 ul->un_tail_lof = ul->un_bol_lof;
1513
1514 /*
1515 * un_tid is used during error processing; it is initialized to
1516 * the tid of the delta at un_head_lof;
1517 */
1518 ul->un_tid = ul->un_head_tid;
1519 }
1520
1521 void
1522 ldl_logscan_end(ml_unit_t *ul)
1523 {
1524 size_t bufsize;
1525
1526 /*
1527 * reset the circular bufs
1528 */
1529 bufsize = ldl_bufsize(ul);
1530 alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1531 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1532
1533 /*
1534 * Done w/scan
1535 */
1536 ul->un_flags &= ~LDL_SCAN;
1537 }
1538
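/*
 * Return true when more than 75% of the log is busy (in use), so the
 * caller knows it is time to start rolling deltas back to the file
 * system.
 */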
1539 int
1540 ldl_need_roll(ml_unit_t *ul)
1541 {
1542 off_t busybytes;
1543 off_t head;
1544 off_t tail;
1545 off_t bol;
1546 off_t eol;
1547 off_t nb;
1548
1549 /*
1550 * snapshot the log state
1551 */
1552 head = ul->un_head_lof;
1553 tail = ul->un_tail_lof;
1554 bol = ul->un_bol_lof;
1555 eol = ul->un_eol_lof;
1556 nb = ul->un_logsize;
1557
1558 /*
1559 * compute number of busy (inuse) bytes
1560 */
1561 if (head <= tail)
1562 busybytes = tail - head;
1563 else
1564 busybytes = (eol - head) + (tail - bol);
1565
1566 /*
1567 * return TRUE if > 75% full
1568 */
1569 return (busybytes > (nb - (nb >> 2)));
1570 }
1571
1572 void
1573 ldl_seterror(ml_unit_t *ul, char *why)
1574 {
1575 /*
1576 * already in error state; do nothing
1577 */
1578 if (ul->un_flags & LDL_ERROR)
1579 return;
1580
1581 ul->un_flags |= LDL_ERROR; /* incore */
1582 ul->un_badlog = 1; /* ondisk (cleared by fsck) */
1583
1584 /*
1585 * Commit to state sectors
1586 */
1587 uniqtime(&ul->un_timestamp);
1588 ldl_savestate(ul);
1589
1590 /* Pretty print */
1591 cmn_err(CE_WARN, "%s", why);
1592 cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1593 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1594 cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1595 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1596
1597 /*
1598 * If we aren't in the middle of a scan (aka snarf), tell ufs
1599 * to hard lock itself.
1600 */
1601 if ((ul->un_flags & LDL_SCAN) == 0)
1602 ufs_trans_onerror();
1603 }
1604
1605 size_t
1606 ldl_bufsize(ml_unit_t *ul)
1607 {
1608 size_t bufsize;
1609 extern uint32_t ldl_minbufsize;
1610
1611 /*
1612 * initial guess is the maxtransfer value for this log device
1613 * increase if too small
1614 * decrease if too large
1615 */
1616 bufsize = dbtob(btod(ul->un_maxtransfer));
1617 if (bufsize < ldl_minbufsize)
1618 bufsize = ldl_minbufsize;
1619 if (bufsize > maxphys)
1620 bufsize = maxphys;
1621 if (bufsize > ul->un_maxtransfer)
1622 bufsize = ul->un_maxtransfer;
1623 return (bufsize);
1624 }
1625