xref: /illumos-gate/usr/src/stand/lib/fs/ufs/lufsboot.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/vnode.h>
30 #include <sys/fs/ufs_fsdir.h>
31 #include <sys/fs/ufs_fs.h>
32 #include <sys/fs/ufs_inode.h>
33 #include <sys/fs/ufs_log.h>
34 #include <sys/sysmacros.h>
35 #include <sys/promif.h>
36 #include <sys/machparam.h>
37 
38 #include <sys/stat.h>
39 #include <sys/bootdebug.h>
40 #include <sys/salib.h>
41 #include <sys/saio.h>
42 #include <sys/filep.h>
43 
44 
45 /*
46  * Big theory statement on how ufsboot makes use of the log
47  * in case the filesystem wasn't shut down cleanly.
48  *
49  * The structure of the ufs on-disk log looks like this:
50  *
51  * +-----------------+
52  * | SUPERBLOCK      |
53  * | ...             |
54  * | fs_logbno       +--> +-----------------------+
55  * | ...             |    | EXTENT BLOCK          |
56  * +-----------------+    |   ...                 |
57  *                        |   nextents            |
58  * +----------------------+   extents[0].pbno     |
59  * |                      | { extents[1].pbno }   +------------+
60  * |                      |   ...                 +--> ...     |
61  * |                      +-----------------------+            |
62  * v                                                           |
63  * +-----------------------------+      \                      |
64  * | ON-DISK LOG HEADER          |      |                      |
65  * | ...                         |      |                      |
66  * | od_head_lof                 +--+   |                      |
67  * | ...                         |  |   |                      |
68  * +-----------------------------+ <|---|- od_bol_lof          |
69  * | sector (may contain deltas) |  |   |  (logical offset)    |
70  * |   +-------------------------+  |   |                      |
71  * |   | trailer (some ident#)   |  |    > extents[0].nbno     |
72  * +---+-------------------------+  |   |  blocks ("sectors")  |
73  * .                             .  |   |                      |
74  * .                             .  |   |                      |
75  * +-----------------------------+<-+   |                      |
76  * | delta1 delta2       delta3  |      |                      |
77  * | d +-------------------------+      |                      |
78  * | e | ident#: od_head_ident   |      |                      |
79  * +---+-------------------------+      /                      |
80  *                                                             |
81  * +-----------------------------+ <---------------------------+
82  * | lta4    delta5 delta6    de |
83  * | l +-------------------------+
84  * | t | ident#: od_head_ident+1 |
85  * +---+-------------------------+
86  * .                             .
87  * +-----------------------------+
88  * | sector (may contain deltas) |
89  * |          +------------------+
90  * |          | trailer (ident#) |
91  * +----------+------------------+ <-- od_eol_lof (logical offset)
92  *
93  * The ufs on-disk log has the following properties:
94  *
95  * 1. The log is made up from at least one extent. "fs_logbno" in
96  *    the superblock points to where this is found.
97  * 2. Extents describe the logical layout.
98  *      - Logical offset 0 is the on-disk log header. It's also
99  *        at the beginning of the first physical block.
100  *      - If there's more than one extent, the equation holds:
101  *             extent[i+1].lbno == extent[i].lbno + extent[i].nbno
102  *        i.e. logical offsets form a contiguous sequence. Yet on disk,
103  *        two logically-adjacent offsets may be located in two
104  *        physically disjoint extents, so logical offsets need to be
105  *        translated into physical disk block addresses for access.
106  *      - Various fields in the on-disk log header structure refer
107  *        to such logical log offsets.
 * 3. The actual logical logspace begins after the log header, at
 *    the logical offset indicated by "od_bol_lof". The logspace is
 *    divided into 512-byte units ("sectors" in terms of ufs logging);
 *    each sector ends in a sector trailer which contains a sequence
 *    number, the sector ident.
112  * 4. Deltas are packed tight in the remaining space, i.e. a delta
113  *    may be part of more than one sector. Reads from the logspace
114  *    must be split at sector boundaries, since the trailer is never
115  *    part of a delta. Delta sizes vary.
116  * 5. The field "od_head_lof" points to the start of the dirty part
117  *    of the log, i.e. to the first delta header. Likewise, "od_head_ident"
118  *    is the sequence number where the valid part of the log starts; if
119  *    the sector pointed to by "od_head_lof" has a sector ident different
120  *    from "od_head_ident", the log is empty.
 * 6. The valid part of the log extends for as many sectors as their ident
 *    numbers form a contiguous sequence. When reaching the logical end of
 *    the log, "od_eol_lof", logical offsets wrap around to "od_bol_lof",
 *    i.e. the log forms a circular buffer.
125  *
126  * For the strategy how to handle accessing the log, item 4. is the
127  * most important one - its consequence is that the log can only be
128  * read in one direction - forward, starting at the head.
129  *
130  * The task of identifying whether a given metadata block is
131  * actually in the log therefore requires reading the entire
132  * log. Doing so is memory-efficient but kills speed if re-done
133  * at every metadata read (64MB log size vs. 512 byte metadata
 * block size: 128k times as much I/O, possibly only to find out
135  * that this block was not in the log ...).
136  *
137  * First thought to speed this up is to let ufsboot roll the log.
138  * But this is not possible because:
139  * - ufsboot currently does not implement any write functionality,
140  *   the boot-time ufs implementation is read-only.
141  * - firmware write interfaces may or may not be available, in any
142  *   case, they're rarely used and untested for such a purpose.
143  * - that would duplicate a lot of code, since at the moment only
144  *   kernel ufs logging implements log rolling.
145  * - the boot environment cannot be considered high-performance;
146  *   rolling the log there would be slow.
147  * - boot device and root device could well be different, creating
148  *   inconsistencies e.g. with a mirrored root if the log is rolled.
149  *
150  * Therefore, caching the log structural information (boot-relevant
151  * deltas and their logical log offset) is required for fast access
152  * to the data in the log. This code builds a logmap for that purpose.
153  *
154  * As a simple optimization, if we find the log is empty, we will not
155  * use it - log reader support for ufsboot has no noticeable overhead
156  * for clean logs, or for root filesystems that aren't logging.
157  */
158 
/*
 * Logmap hash geometry. A delta's master offset (mof) is hashed by
 * discarding its low LB_HASHSHIFT bits and masking what remains down
 * to one of LB_HASHSIZE buckets.
 */
#define	LB_HASHSHIFT		13
#define	LB_HASHSIZE		(1 << LB_HASHSHIFT)
#define	LB_HASHFUNC(mof)	(((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))

/* Upper and lower bound, in bytes, for the buffer holding the logmap. */
#define	LOGBUF_MAXSIZE	(8 * 1024 * 1024)
#define	LOGBUF_MINSIZE	(256 * 1024)

/* Outcome of examining the log, as evaluated by lufs_boot_init(). */
#define	LOG_IS_EMPTY	0
#define	LOG_IS_OK	1
#define	LOG_IS_ERRORED	2
169 
/*
 * We build a hashed logmap of those while scanning the log.
 * sizeof(lb_me_t) is 40 on 64bit, 32 on 32bit; the max sized
 * resalloc'ed buffer can accommodate around 500k of those;
 * this is approximately the maximum amount of deltas we'll
 * see if a 64MB ufs log is completely filled. We'll make no
 * attempt to free and reallocate the resalloc'ed buffer if
 * we overflow, as conservative sizing should make that an
 * impossibility. A future enhancement may allocate memory
 * here as needed - once the boot time memory allocator
 * supports that.
 */
182 typedef struct lb_mapentry {
183 	struct lb_mapentry	*l_next;	/* hash chaining */
184 	struct lb_mapentry	*l_prev;	/* hash chaining */
185 	int64_t		l_mof;		/* disk addr this delta is against */
186 	int16_t		l_nb;		/* size of delta */
187 	int16_t		l_flags;
188 	int32_t		l_lof;		/* log offset for delta header */
189 	int32_t		l_tid;		/* transaction this delta is part of */
190 	delta_t		l_typ;		/* see <sys/fs/ufs_trans.h> */
191 } lb_me_t;
192 
193 #define	LB_ISCANCELLED	1
194 
/*
 * inslist(lh, l): link entry "l" into the circular, doubly-linked chain
 * whose head pointer is "*lh".
 *
 * If the chain is empty, "l" becomes its only member and the new head.
 * Otherwise "l" is linked in directly before the current head (i.e. at
 * the tail of the ring); the head pointer itself is left unchanged.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, also as the unbraced body of an if/else.
 * Both arguments are evaluated more than once - pass expressions
 * without side effects.
 */
#define	inslist(lh, l)	do {					\
	if ((*(lh)) != NULL) {					\
		(*(lh))->l_prev->l_next = (l);			\
		(l)->l_next = (*(lh));				\
		(l)->l_prev = (*(lh))->l_prev;			\
		(*(lh))->l_prev = (l);				\
	} else {						\
		(l)->l_next = (l);				\
		(l)->l_prev = (l);				\
		(*(lh)) = (l);					\
	}							\
} while (/* CONSTCOND */ 0)
205 
/*
 * remlist(lh, l): unlink entry "l" from the circular, doubly-linked
 * chain whose head pointer is "*lh".
 *
 * If "l" is the only member, the chain becomes empty (*lh == NULL); a
 * debug message is printed if the head pointer or l_prev disagree with
 * that assumption. Otherwise the neighbours of "l" are joined, and if
 * "l" was the head, the head moves on to l->l_next. The link fields of
 * "l" itself are not modified.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, also as the unbraced body of an if/else.
 * Both arguments are evaluated more than once - pass expressions
 * without side effects.
 */
#define	remlist(lh, l)	do {					\
	if ((l)->l_next == (l)) {				\
		if (*(lh) != (l) || (l)->l_prev != (l))		\
			dprintf("Logmap hash inconsistency.\n");	\
		*(lh) = NULL;					\
	} else {						\
		if (*(lh) == (l))				\
			*(lh) = (l)->l_next;			\
		(l)->l_prev->l_next = (l)->l_next;		\
		(l)->l_next->l_prev = (l)->l_prev;		\
	}							\
} while (/* CONSTCOND */ 0)
217 
/*
 * Allocate one logmap entry from the log buffer; evaluates to NULL if
 * lufs_alloc_from_logbuf() cannot satisfy the request. The expansion is
 * fully parenthesized so that it can be used inside larger expressions.
 */
#define	lufs_alloc_me()	\
	((lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)))
220 
221 extern int		boothowto;
222 static int		ufs_is_lufs = 0;
223 static fileid_t		*logfp = (fileid_t *)NULL;
224 static extent_block_t	*eb = (extent_block_t *)NULL;
225 static ml_odunit_t	odi;
226 
227 static char		logbuffer_min[LOGBUF_MINSIZE];
228 static caddr_t		logbuffer = (caddr_t)NULL;
229 static caddr_t		elogbuffer = (caddr_t)NULL;
230 static caddr_t		logbuf_curptr;
231 static lb_me_t		**loghash = (lb_me_t **)NULL;
232 static lb_me_t		*lfreelist;
233 
234 static uint32_t		curtid;
235 
236 
237 int	lufs_support = 1;
238 
239 void	lufs_boot_init(fileid_t *);
240 void	lufs_closeall(void);
241 void	lufs_merge_deltas(fileid_t *);
242 
243 static	int	lufs_logscan(void);
244 
245 extern	int	diskread(fileid_t *filep);
246 extern	caddr_t	resalloc(enum RESOURCES, size_t, caddr_t, int);
247 
248 #if defined(__sparcv9)
249 #define	LOGBUF_BASEADDR	((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
250 #endif
251 
252 static int
253 lufs_alloc_logbuf(void)
254 {
255 	/*
256 	 * Allocate memory for caching the log. Since the logbuffer can
257 	 * potentially exceed the boot scratch memory limit, we use resalloc
258 	 * directly, passing the allocation to the low-level boot-time
259 	 * backend allocator. The chosen VA range is the top end of
260 	 * the kernel's segmap segment, so we're not interfering
261 	 * with the kernel because segmap is created at a time when
262 	 * the 2nd-stage boot has already been unloaded and this VA
263 	 * range was given back.
264 	 *
265 	 * On sparc platforms, the kernel cannot recover the memory
266 	 * obtained from resalloc because the page structs are allocated
267 	 * before the call to BOP_QUIESCE. To avoid leaking this
268 	 * memory, the logbuffer is allocated from a small bss array
269 	 * that should hold the logmap except in the most extreme cases.
270 	 * If the bss array is too small, the logbuffer is extended
271 	 * from resalloc 1 page at a time.
272 	 */
273 
274 	logbuffer = logbuffer_min;
275 	elogbuffer = logbuffer+LOGBUF_MINSIZE;
276 	logbuf_curptr = logbuffer;
277 	lfreelist = (lb_me_t *)NULL;
278 
279 	if (logbuffer == (caddr_t)NULL)
280 		return (0);
281 
282 	dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
283 	    logbuffer, elogbuffer-logbuffer);
284 
285 	return (1);
286 }
287 
288 static void
289 lufs_free_logbuf()
290 {
291 	/*
292 	 * Solaris/x86 has no prom_free() routine at this time.
293 	 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
294 	 * is done by the kernel startup itself, in hat_unload_prom()
295 	 * after the bootloader has been quiesced.
296 	 *
297 	 * Solaris on sparc has a prom_free() routine that will update
298 	 *   the memlist properties to reflect the freeing of the
299 	 *   logbuffer. However, the sparc kernel cannot recover
300 	 *   the memory freed after the call to BOP_QUIESCE as the
301 	 *   page struct have already been allocated. We call
302 	 *   prom_free anyway so that the kernel can reclaim this
303 	 *   memory in the future.
304 	 */
305 	if (logbuffer == LOGBUF_BASEADDR)
306 		prom_free(logbuffer, elogbuffer-logbuffer);
307 	logbuffer = (caddr_t)NULL;
308 }
309 
310 static caddr_t
311 lufs_alloc_from_logbuf(size_t sz)
312 {
313 	caddr_t tmpaddr;
314 	lb_me_t	*l;
315 
316 	/*
317 	 * Satisfy lb_me_t allocations from the freelist
318 	 * first if possible.
319 	 */
320 	if ((sz == sizeof (lb_me_t)) && lfreelist) {
321 		l = lfreelist;
322 		lfreelist = lfreelist->l_next;
323 		return ((caddr_t)l);
324 	}
325 	if (elogbuffer < logbuf_curptr + sz) {
326 		caddr_t np;
327 		size_t nsz;
328 
329 		/*
330 		 * Out of space in current chunk - try to add another.
331 		 */
332 		if (logbuffer == logbuffer_min) {
333 			np = LOGBUF_BASEADDR;
334 		} else {
335 			np = elogbuffer;
336 		}
337 		nsz = roundup(sz, PAGESIZE);
338 		if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
339 			return ((caddr_t)NULL);
340 		}
341 
342 		np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
343 		if (np == (caddr_t)NULL) {
344 			return ((caddr_t)NULL);
345 		}
346 		if (logbuffer == logbuffer_min)
347 			logbuffer = LOGBUF_BASEADDR;
348 		logbuf_curptr = np;
349 		elogbuffer = logbuf_curptr + nsz;
350 	}
351 
352 	tmpaddr = logbuf_curptr;
353 	logbuf_curptr += sz;
354 	bzero(tmpaddr, sz);
355 	return (tmpaddr);
356 }
357 
358 static int32_t
359 lufs_read_log(int32_t addr, caddr_t va, int nb)
360 {
361 	int		i, fastpath = 0;
362 	daddr_t		pblk, lblk;
363 	sect_trailer_t	*st;
364 	uint32_t	ident;
365 
366 	/*
367 	 * Fast path for skipping the read if no target buffer
368 	 * is specified. Don't do this for the initial scan.
369 	 */
370 	if (ufs_is_lufs && (va == (caddr_t)NULL))
371 		fastpath = 1;
372 
373 	while (nb) {
374 		/* log wraparound check */
375 		if (addr == odi.od_eol_lof)
376 			addr = odi.od_bol_lof;
377 		if (fastpath)
378 			goto read_done;
379 
380 		/*
381 		 * Translate logically-contiguous log offsets into physical
382 		 * block numbers. For a log consisting of a single extent:
383 		 *	pbno = btodb(addr) - extents[0].lbno;
384 		 * Otherwise, search for the extent which contains addr.
385 		 */
386 		pblk = 0;
387 		lblk = btodb(addr);
388 		for (i = 0; i < eb->nextents; i++) {
389 			if (lblk >= eb->extents[i].lbno &&
390 			    lblk < eb->extents[i].lbno +
391 			    eb->extents[i].nbno) {
392 				pblk = lblk - eb->extents[i].lbno +
393 				    eb->extents[i].pbno;
394 				break;
395 			}
396 		}
397 
398 		if (pblk == 0) {
399 			/*
400 			 * block #0 can never be in a log extent since this
401 			 * block always contains the primary superblock copy.
402 			 */
403 			dprintf("No log extent found for log offset 0x%llx.\n",
404 			    addr);
405 			return (0);
406 		}
407 
408 		/*
409 		 * Check whether the block we want is cached from the last
410 		 * read. If not, read it in now.
411 		 */
412 		if (logfp->fi_blocknum != pblk) {
413 			logfp->fi_blocknum = pblk;
414 			logfp->fi_memp = logfp->fi_buf;
415 			logfp->fi_count = DEV_BSIZE;
416 			logfp->fi_offset = 0;
417 			if (diskread(logfp)) {
418 				dprintf("I/O error reading the ufs log" \
419 				    " at block 0x%x.\n",
420 				    logfp->fi_blocknum);
421 				return (0);
422 			}
423 			/*
424 			 * Log structure verification. The block which we just
425 			 * read has an ident number that must match its offset
426 			 * in blocks from the head of the log. Since the log
427 			 * can wrap around, we have to check for that to get the
428 			 * ident right. Out-of-sequence idents can happen after
429 			 * power failures, panics during a partial transaction,
430 			 * media errors, ... - in any case, they mark the end of
431 			 * the valid part of the log.
432 			 */
433 			st = (sect_trailer_t *)(logfp->fi_memp +
434 			    LDL_USABLE_BSIZE);
435 			/* od_head_ident is where the sequence starts */
436 			ident = odi.od_head_ident;
437 			if (lblk >= lbtodb(odi.od_head_lof)) {
438 				/* no wraparound */
439 				ident += (lblk - lbtodb(odi.od_head_lof));
440 			} else {
441 				/* log wrapped around the end */
442 				ident += (lbtodb(odi.od_eol_lof) -
443 				    lbtodb(odi.od_head_lof));
444 				ident += (lblk - lbtodb(odi.od_bol_lof));
445 			}
446 
447 			if (ident != st->st_ident)
448 				return (0);
449 		}
450 read_done:
451 		/*
452 		 * Copy the delta contents to the destination buffer if
453 		 * one was specified. Otherwise, just skip the contents.
454 		 */
455 		i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
456 		if (va != NULL) {
457 			bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
458 			    va, i);
459 			va += i;
460 		}
461 		nb -= i;
462 		addr += i;
463 		/*
464 		 * Skip sector trailer if necessary.
465 		 */
466 		if (NB_LEFT_IN_SECTOR(addr) == 0)
467 			addr += sizeof (sect_trailer_t);
468 	}
469 	return (addr);
470 }
471 
472 void
473 lufs_boot_init(fileid_t *filep)
474 {
475 	struct fs *sb = (struct fs *)filep->fi_memp;
476 	int err = 0;
477 
478 	/*
479 	 * boot_ufs_mountroot() should have called us with a
480 	 * filep pointing to the superblock. Verify that this
481 	 * is so first.
482 	 * Then check whether this filesystem has a dirty log.
483 	 * Also return if lufs support was disabled on request.
484 	 */
485 	if (!lufs_support ||
486 	    sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
487 	    sb->fs_clean != FSLOG || sb->fs_logbno == NULL) {
488 		return;
489 	}
490 
491 	if (boothowto & RB_VERBOSE)
492 		printf("The boot filesystem is logging.\n");
493 
494 	/*
495 	 * The filesystem is logging, there is a log area
496 	 * allocated for it. Check the log state and determine
497 	 * whether it'll be possible to use this log.
498 	 */
499 
500 	/*
501 	 * Allocate a private fileid_t for use when reading
502 	 * from the log.
503 	 */
504 	eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
505 	logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
506 	logfp->fi_memp = logfp->fi_buf;
507 	logfp->fi_devp = filep->fi_devp;
508 
509 	/*
510 	 * Read the extent block and verify that what we
511 	 * find there are actually lufs extents.
512 	 * Make it simple: the extent block including all
513 	 * extents cannot be larger than a filesystem block.
514 	 * So read a whole filesystem block, to make sure
515 	 * we have read all extents in the same operation.
516 	 */
517 	logfp->fi_blocknum = sb->fs_logbno;
518 	logfp->fi_count = sb->fs_bsize;
519 	logfp->fi_memp = (caddr_t)eb;
520 	logfp->fi_offset = 0;
521 	if (diskread(logfp) || eb->type != LUFS_EXTENTS) {
522 		dprintf("Failed to read log extent block.\n");
523 		err = LOG_IS_ERRORED;
524 		goto out;
525 	}
526 
527 	/*
528 	 * Read the on disk log header. If that fails,
529 	 * try the backup copy on the adjacent block.
530 	 */
531 	logfp->fi_blocknum = eb->extents[0].pbno;
532 	logfp->fi_count = sizeof (ml_odunit_t);
533 	logfp->fi_memp = (caddr_t)&odi;
534 	logfp->fi_offset = 0;
535 	if (diskread(logfp)) {
536 		logfp->fi_blocknum = eb->extents[0].pbno + 1;
537 		logfp->fi_count = sizeof (ml_odunit_t);
538 		logfp->fi_memp = (caddr_t)&odi;
539 		logfp->fi_offset = 0;
540 		if (diskread(logfp)) {
541 			dprintf("Failed to read on-disk log header.\n");
542 			err = LOG_IS_ERRORED;
543 			goto out;
544 		}
545 	}
546 
547 	/*
548 	 * Verify that we understand this log, and
549 	 * that the log isn't bad or empty.
550 	 */
551 	if (odi.od_version != LUFS_VERSION_LATEST) {
552 		dprintf("On-disk log format v%d != supported format v%d.\n",
553 		    odi.od_version, LUFS_VERSION_LATEST);
554 		err = LOG_IS_ERRORED;
555 	} else if (odi.od_badlog) {
556 		dprintf("On-disk log is marked bad.\n");
557 		err = LOG_IS_ERRORED;
558 	} else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
559 		dprintf("On-disk log checksum %d != ident sum %d.\n",
560 		    odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
561 		err = LOG_IS_ERRORED;
562 	} else {
563 		/*
564 		 * All consistency checks ok. Scan the log, build the
565 		 * log hash. If this succeeds we'll be using the log
566 		 * when reading from this filesystem.
567 		 */
568 		err = lufs_logscan();
569 	}
570 out:
571 	ufs_is_lufs = 1;
572 	switch (err) {
573 	case LOG_IS_EMPTY:
574 		if (boothowto & RB_VERBOSE)
575 			printf("The ufs log is empty and will not be used.\n");
576 		lufs_closeall();
577 		break;
578 	case LOG_IS_OK:
579 		if (boothowto & RB_VERBOSE)
580 			printf("Using the ufs log.\n");
581 		break;
582 	case LOG_IS_ERRORED:
583 		if (boothowto & RB_VERBOSE)
584 			printf("Couldn't build log hash. Can't use ufs log.\n");
585 		lufs_closeall();
586 		break;
587 	default:
588 		dprintf("Invalid error %d while scanning the ufs log.\n", err);
589 		break;
590 	}
591 }
592 
593 static int
594 lufs_logscan_read(int32_t *addr, struct delta *d)
595 {
596 	*addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
597 
598 	if (*addr == 0 ||
599 	    d->d_typ < DT_NONE || d->d_typ > DT_MAX ||
600 	    d->d_nb >= odi.od_logsize)
601 		return (0);
602 
603 	return (1);
604 }
605 
606 static int
607 lufs_logscan_skip(int32_t *addr, struct delta *d)
608 {
609 	switch (d->d_typ) {
610 	case DT_COMMIT:
611 		/*
612 		 * A DT_COMMIT delta has no size as such, but will
613 		 * always "fill up" the sector that contains it.
614 		 * The next delta header is found at the beginning
615 		 * of the next 512-Bytes sector, adjust "addr" to
616 		 * reflect that.
617 		 */
618 		*addr += ((*addr & (DEV_BSIZE - 1))) ?
619 		    NB_LEFT_IN_SECTOR(*addr) +
620 		    sizeof (sect_trailer_t) : 0;
621 		return (1);
622 	case DT_CANCEL:
623 	case DT_ABZERO:
624 		/*
625 		 * These types of deltas occupy no space in the log
626 		 */
627 		return (1);
628 	default:
629 		/*
630 		 * Skip over the delta contents.
631 		 */
632 		*addr = lufs_read_log(*addr, NULL, d->d_nb);
633 	}
634 
635 	return (*addr != NULL);
636 }
637 
638 static void
639 lufs_logscan_freecancel(void)
640 {
641 	lb_me_t		**lh, *l, *lnext;
642 	int		i;
643 
644 	/*
645 	 * Walk the entire log hash and put cancelled entries
646 	 * onto the freelist. Corner cases:
647 	 * a) empty hash chain (*lh == NULL)
648 	 * b) only one entry in chain, and that is cancelled.
649 	 *    If for every cancelled delta another one would've
650 	 *    been added, this situation couldn't occur, but a
651 	 *    DT_CANCEL delta can lead to this as it is never
652 	 *    added.
653 	 */
654 	for (i = 0; i < LB_HASHSIZE; i++) {
655 		lh = &loghash[i];
656 		l = *lh;
657 		do {
658 			if (*lh == (lb_me_t *)NULL)
659 				break;
660 			lnext = l->l_next;
661 			if (l->l_flags & LB_ISCANCELLED) {
662 				remlist(lh, l);
663 				bzero((caddr_t)l, sizeof (lb_me_t));
664 				l->l_next = lfreelist;
665 				lfreelist = l;
666 				/*
667 				 * Just removed the hash head. In order not
668 				 * to terminate the while loop, respin chain
669 				 * walk for this hash chain.
670 				 */
671 				if (lnext == *lh) {
672 					i--;
673 					break;
674 				}
675 			}
676 			l = lnext;
677 		} while (l != *lh);
678 	}
679 }
680 
681 static int
682 lufs_logscan_addmap(int32_t *addr, struct delta *d)
683 {
684 	lb_me_t		**lh, *l;
685 
686 	switch (d->d_typ) {
687 	case DT_COMMIT:
688 		/*
689 		 * Handling DT_COMMIT deltas is special. We need to:
690 		 * 1. increase the transaction ID
691 		 * 2. remove cancelled entries.
692 		 */
693 		lufs_logscan_freecancel();
694 		curtid++;
695 		break;
696 	case DT_INODE:
697 		/*
698 		 * Deltas against parts of on-disk inodes are
699 		 * assumed to be timestamps. Ignore those.
700 		 */
701 		if (d->d_nb != sizeof (struct dinode))
702 			break;
703 		/* FALLTHROUGH */
704 	case DT_CANCEL:
705 	case DT_ABZERO:
706 	case DT_AB:
707 	case DT_DIR:
708 	case DT_FBI:
709 		/*
710 		 * These types of deltas contain and/or modify structural
711 		 * information that is needed for booting the system:
712 		 * - where to find a file (DT_DIR, DT_FBI)
713 		 * - the file itself (DT_INODE)
714 		 * - data blocks associated with a file (DT_AB, DT_ABZERO)
715 		 *
716 		 * Building the hash chains becomes complicated because there
717 		 * may exist an older (== previously added) entry that overlaps
718 		 * with the one we want to add.
719 		 * Four cases must be distinguished:
720 		 * 1. The new delta is an exact match for an existing one,
721 		 *    or is a superset of an existing one, and both
722 		 *    belong to the same transaction.
723 		 *    The new delta completely supersedes the old one, so
724 		 *    remove that and reuse the structure for the new.
725 		 *    Then add the new delta to the head of the hashchain.
726 		 * 2. The new delta is an exact match for an existing one,
727 		 *    or is a superset of an existing one, but the two
728 		 *    belong to different transactions (i.e. the old one is
729 		 *    committed).
730 		 *    The existing one is marked to be cancelled when the
731 		 *    next DT_COMMIT record is found, and the hash chain
732 		 *    walk is continued as there may be more existing entries
733 		 *    found which overlap the new delta (happens if that is
734 		 *    a superset of those in the log).
735 		 *    Once no more overlaps are found, goto 4.
736 		 * 3. An existing entry completely covers the new one.
737 		 *    The new delta is then added directly before this
738 		 *    existing one.
739 		 * 4. No (more) overlaps with existing entries are found.
740 		 *    Unless this is a DT_CANCEL delta, whose only purpose
741 		 *    is already handled by marking overlapping entries for
742 		 *    cancellation, add the new delta at the hash chain head.
743 		 *
744 		 * This strategy makes sure that the hash chains are properly
745 		 * ordered. lufs_merge_deltas() walks the hash chain backward,
746 		 * which then ensures that delta merging is done in the same
747 		 * order as those deltas occur in the log - remember, the
748 		 * log can only be read in one direction.
749 		 *
750 		 */
751 		lh = &loghash[LB_HASHFUNC(d->d_mof)];
752 		l = *lh;
753 		do {
754 			if (l == (lb_me_t *)NULL)
755 				break;
756 			/*
757 			 * This covers the first two cases above.
758 			 * If this is a perfect match from the same transaction,
759 			 * and it isn't already cancelled, we simply replace it
760 			 * with its newer incarnation.
761 			 * Otherwise, mark it for cancellation. Handling of
762 			 * DT_COMMIT is going to remove it, then.
763 			 */
764 			if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
765 				if (!(l->l_flags & LB_ISCANCELLED)) {
766 					if (l->l_tid == curtid &&
767 					    d->d_typ != DT_CANCEL) {
768 						remlist(lh, l);
769 						l->l_mof = d->d_mof;
770 						l->l_lof = *addr;
771 						l->l_nb = d->d_nb;
772 						l->l_typ = d->d_typ;
773 						l->l_flags = 0;
774 						l->l_tid = curtid;
775 						inslist(lh, l);
776 						return (1);
777 					} else {
778 						/*
779 						 * 2nd case - cancel only.
780 						 */
781 						l->l_flags |= LB_ISCANCELLED;
782 					}
783 				}
784 			} else if (WITHIN(d->d_mof, d->d_nb,
785 			    l->l_mof, l->l_nb)) {
786 				/*
787 				 * This is the third case above.
788 				 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
789 				 * this may happen - an existing previous delta
790 				 * is larger than the current one we're planning
791 				 * to add - DT_ABZERO deltas are supersets of
792 				 * DT_AB deltas, and likewise DT_FBI/DT_DIR.
793 				 * In order to do merging correctly, such deltas
794 				 * put up a barrier for new ones that overlap,
795 				 * and we have to add the new delta immediately
796 				 * before (!) the existing one.
797 				 */
798 				lb_me_t *newl;
799 				newl = lufs_alloc_me();
800 				if (newl == (lb_me_t *)NULL) {
801 					/*
802 					 * No memory. Throw away everything
803 					 * and try booting without logging
804 					 * support.
805 					 */
806 					curtid = 0;
807 					return (0);
808 				}
809 				newl->l_mof = d->d_mof;
810 				newl->l_lof = *addr;	/* "payload" address */
811 				newl->l_nb = d->d_nb;
812 				newl->l_typ = d->d_typ;
813 				newl->l_tid = curtid;
814 				newl->l_prev = l->l_prev;
815 				newl->l_next = l;
816 				l->l_prev->l_next = newl;
817 				l->l_prev = newl;
818 				if (*lh == l)
819 					*lh = newl;
820 				return (1);
821 			}
822 			l = l->l_next;
823 		} while (l != *lh);
824 
825 		/*
826 		 * This is case 4., add a new delta at the head of the chain.
827 		 *
828 		 * If the new delta is a DT_CANCEL entry, we handled it by
829 		 * marking everything it covered for cancellation. We can
830 		 * get by without actually adding the delta itself to the
831 		 * hash, as it'd need to be removed by the commit code anyway.
832 		 */
833 		if (d->d_typ == DT_CANCEL)
834 			break;
835 
836 		l = lufs_alloc_me();
837 		if (l == (lb_me_t *)NULL) {
838 			/*
839 			 * No memory. Throw away everything
840 			 * and try booting without logging
841 			 * support.
842 			 */
843 			curtid = 0;
844 			return (0);
845 		}
846 		l->l_mof = d->d_mof;
847 		l->l_lof = *addr;	/* this is the "payload" address */
848 		l->l_nb = d->d_nb;
849 		l->l_typ = d->d_typ;
850 		l->l_tid = curtid;
851 		inslist(lh, l);
852 		break;
853 	default:
854 		break;
855 	}
856 	return (1);
857 }
858 
859 static int
860 lufs_logscan_prescan(void)
861 {
862 	/*
863 	 * Simulate a full log by setting the tail to be one sector
864 	 * behind the head. This will make the logscan read all
865 	 * of the log until an out-of-sequence sector ident is
866 	 * found.
867 	 */
868 	odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
869 	if (odi.od_tail_lof < odi.od_bol_lof)
870 		odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
871 	if (odi.od_tail_lof >= odi.od_eol_lof)
872 		odi.od_tail_lof = odi.od_bol_lof;
873 
874 	/*
875 	 * While sector trailers maintain TID values, od_head_tid
876 	 * is not being updated by the kernel ufs logging support
877 	 * at this time. We therefore count transactions ourselves
878 	 * starting at zero - as does the kernel ufs logscan code.
879 	 */
880 	curtid = 0;
881 
882 	if (!lufs_alloc_logbuf()) {
883 		dprintf("Failed to allocate log buffer.\n");
884 		return (0);
885 	}
886 
887 	loghash = (lb_me_t **)lufs_alloc_from_logbuf(
888 	    LB_HASHSIZE * sizeof (lb_me_t *));
889 	if (loghash == (lb_me_t **)NULL) {
890 		dprintf("Can't allocate loghash[] array.");
891 		return (0);
892 	}
893 	return (1);
894 }
895 
896 /*
897  * This function must remove all uncommitted entries (l->l_tid == curtid)
898  * from the log hash. Doing this, we implicitly delete pending cancellations
899  * as well.
900  * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
901  * the check for entries that need to be removed is different.
902  */
903 static void
904 lufs_logscan_postscan(void)
905 {
906 	lb_me_t	**lh, *l, *lnext;
907 	int	i;
908 
909 	for (i = 0; i < LB_HASHSIZE; i++) {
910 		lh = &loghash[i];
911 		l = *lh;
912 		do {
913 			if (l == (lb_me_t *)NULL)
914 				break;
915 			lnext = l->l_next;
916 			if (l->l_tid == curtid) {
917 				remlist(lh, l);
918 				bzero((caddr_t)l, sizeof (lb_me_t));
919 				l->l_next = lfreelist;
920 				lfreelist = l;
921 				if (*lh == (lb_me_t *)NULL)
922 					break;
923 				/*
924 				 * Just removed the hash head. In order not
925 				 * to terminate the while loop, respin chain
926 				 * walk for this hash chain.
927 				 */
928 				if (lnext == *lh) {
929 					i--;
930 					break;
931 				}
932 			} else {
933 				l->l_flags &= ~(LB_ISCANCELLED);
934 			}
935 			l = lnext;
936 		} while (l != *lh);
937 	}
938 }
939 
940 /*
941  * This function builds the log hash. It performs the same sequence
942  * of actions at logscan as the kernel ufs logging support:
943  * - Prepare the log for scanning by simulating a full log.
944  * - As long as sectors read from the log have contiguous idents, do:
945  *	read the delta header
946  *	add the delta to the logmap
947  *	skip over the contents to the start of the next delta header
948  * - After terminating the scan, remove uncommitted entries.
949  *
950  * This function cannot fail except if mapping the logbuffer area
951  * during lufs_logscan_prescan() fails. If there is a structural
952  * integrity problem and the on-disk log cannot be read, we'll
953  * treat this as the same situation as an uncommitted transaction
954  * at the end of the log (or, corner case of that, an empty log
955  * with no committed transactions in it at all).
956  *
957  */
958 static int
959 lufs_logscan(void)
960 {
961 	int32_t		addr;
962 	struct delta	d;
963 
964 	if (!lufs_logscan_prescan())
965 		return (LOG_IS_ERRORED);
966 
967 	addr = odi.od_head_lof;
968 
969 	/*
970 	 * Note that addr == od_tail_lof means a completely filled
971 	 * log. This almost never happens, so the common exit path
972 	 * from this loop is via one of the 'break's.
973 	 */
974 	while (addr != odi.od_tail_lof) {
975 		if (!lufs_logscan_read(&addr, &d))
976 			break;
977 		if (!lufs_logscan_addmap(&addr, &d))
978 			return (LOG_IS_ERRORED);
979 		if (!lufs_logscan_skip(&addr, &d))
980 			break;
981 	}
982 
983 	lufs_logscan_postscan();
984 	/*
985 	 * Check whether the log contains data, and if so whether
986 	 * it contains committed data.
987 	 */
988 	if (addr == odi.od_head_lof || curtid == 0) {
989 		return (LOG_IS_EMPTY);
990 	}
991 	return (LOG_IS_OK);
992 }
993 
994 /*
995  * A metadata block was read from disk. Check whether the logmap
996  * has a delta against this byte range, and if so read it in, since
997  * the data in the log is more recent than what was read from other
998  * places on the disk.
999  */
1000 void
1001 lufs_merge_deltas(fileid_t *fp)
1002 {
1003 	int		nb;
1004 	int64_t		bof;
1005 	lb_me_t		**lh, *l;
1006 	int32_t		skip;
1007 
1008 	/*
1009 	 * No logmap: Empty log. Nothing to do here.
1010 	 */
1011 	if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
1012 		return;
1013 
1014 	bof = ldbtob(fp->fi_blocknum);
1015 	nb = fp->fi_count;
1016 
1017 	/*
1018 	 * Search the log hash.
1019 	 * Merge deltas if an overlap is found.
1020 	 */
1021 
1022 	lh = &loghash[LB_HASHFUNC(bof)];
1023 
1024 	if (*lh == (lb_me_t *)NULL)
1025 		return;
1026 
1027 	l = *lh;
1028 
1029 	do {
1030 		l = l->l_prev;
1031 		if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
1032 			/*
1033 			 * Found a delta in the log hash which overlaps
1034 			 * with the current metadata block. Read the
1035 			 * actual delta payload from the on-disk log
1036 			 * directly into the file buffer.
1037 			 */
1038 			if (l->l_typ != DT_ABZERO) {
1039 				/*
1040 				 * We have to actually read this part of the
1041 				 * log as it could contain a sector trailer, or
1042 				 * wrap around the end of the log.
1043 				 * If it did, the second offset generation would
1044 				 * be incorrect if we'd started at l->l_lof.
1045 				 */
1046 				if (!(skip = lufs_read_log(l->l_lof, NULL,
1047 				    MAX(bof - l->l_mof, 0))))
1048 					dprintf("scan/merge error, pre-skip\n");
1049 				if (!(skip = lufs_read_log(skip,
1050 				    fp->fi_memp + MAX(l->l_mof - bof, 0),
1051 				    MIN(l->l_mof + l->l_nb, bof + nb) -
1052 				    MAX(l->l_mof, bof))))
1053 					dprintf("scan/merge error, merge\n");
1054 			} else {
1055 				/*
1056 				 * DT_ABZERO requires no disk access, just
1057 				 * clear the byte range which overlaps with
1058 				 * the delta.
1059 				 */
1060 				bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
1061 				    MIN(l->l_mof + l->l_nb, bof + nb) -
1062 				    MAX(l->l_mof, bof));
1063 			}
1064 		}
1065 	} while (l->l_prev != (*lh)->l_prev);
1066 
1067 	printf("*\b");
1068 }
1069 
1070 void
1071 lufs_closeall(void)
1072 {
1073 	if (ufs_is_lufs) {
1074 		bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
1075 		bkmem_free((char *)logfp, sizeof (fileid_t));
1076 		eb = (extent_block_t *)NULL;
1077 		bzero((caddr_t)&odi, sizeof (ml_odunit_t));
1078 		logfp = (fileid_t *)NULL;
1079 		lufs_free_logbuf();
1080 		ufs_is_lufs = 0;
1081 	}
1082 }
1083