xref: /titanic_50/usr/src/stand/lib/fs/ufs/lufsboot.c (revision 9dd0f810214fdc8e1af881a9a5c4b6927629ff9e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/vnode.h>
31 #include <sys/fs/ufs_fsdir.h>
32 #include <sys/fs/ufs_fs.h>
33 #include <sys/fs/ufs_inode.h>
34 #include <sys/fs/ufs_log.h>
35 #include <sys/sysmacros.h>
36 #include <sys/promif.h>
37 #include <sys/machparam.h>
38 
39 #include <sys/stat.h>
40 #include <sys/bootdebug.h>
41 #include <sys/salib.h>
42 #include <sys/saio.h>
43 #include <sys/filep.h>
44 
45 
46 /*
47  * Big theory statement on how ufsboot makes use of the log
48  * in case the filesystem wasn't shut down cleanly.
49  *
50  * The structure of the ufs on-disk log looks like this:
51  *
52  * +-----------------+
53  * | SUPERBLOCK      |
54  * | ...             |
55  * | fs_logbno       +--> +-----------------------+
56  * | ...             |    | EXTENT BLOCK          |
57  * +-----------------+    |   ...                 |
58  *                        |   nextents            |
59  * +----------------------+   extents[0].pbno     |
60  * |                      | { extents[1].pbno }   +------------+
61  * |                      |   ...                 +--> ...     |
62  * |                      +-----------------------+            |
63  * v                                                           |
64  * +-----------------------------+      \                      |
65  * | ON-DISK LOG HEADER          |      |                      |
66  * | ...                         |      |                      |
67  * | od_head_lof                 +--+   |                      |
68  * | ...                         |  |   |                      |
69  * +-----------------------------+ <|---|- od_bol_lof          |
70  * | sector (may contain deltas) |  |   |  (logical offset)    |
71  * |   +-------------------------+  |   |                      |
72  * |   | trailer (some ident#)   |  |    > extents[0].nbno     |
73  * +---+-------------------------+  |   |  blocks ("sectors")  |
74  * .                             .  |   |                      |
75  * .                             .  |   |                      |
76  * +-----------------------------+<-+   |                      |
77  * | delta1 delta2       delta3  |      |                      |
78  * | d +-------------------------+      |                      |
79  * | e | ident#: od_head_ident   |      |                      |
80  * +---+-------------------------+      /                      |
81  *                                                             |
82  * +-----------------------------+ <---------------------------+
83  * | lta4    delta5 delta6    de |
84  * | l +-------------------------+
85  * | t | ident#: od_head_ident+1 |
86  * +---+-------------------------+
87  * .                             .
88  * +-----------------------------+
89  * | sector (may contain deltas) |
90  * |          +------------------+
91  * |          | trailer (ident#) |
92  * +----------+------------------+ <-- od_eol_lof (logical offset)
93  *
94  * The ufs on-disk log has the following properties:
95  *
96  * 1. The log is made up from at least one extent. "fs_logbno" in
97  *    the superblock points to where this is found.
98  * 2. Extents describe the logical layout.
99  *      - Logical offset 0 is the on-disk log header. It's also
100  *        at the beginning of the first physical block.
101  *      - If there's more than one extent, the equation holds:
102  *             extent[i+1].lbno == extent[i].lbno + extent[i].nbno
103  *        i.e. logical offsets form a contiguous sequence. Yet on disk,
104  *        two logically-adjacent offsets may be located in two
105  *        physically disjoint extents, so logical offsets need to be
106  *        translated into physical disk block addresses for access.
107  *      - Various fields in the on-disk log header structure refer
108  *        to such logical log offsets.
109  * 3. The actual logical logspace begins after the log header, at
110  *    the logical offset indicated by "od_bol_lof". Every 512 Bytes
111  *    (a "sector" in terms of ufs logging) is a sector trailer which
112  *    contains a sequence number, the sector ident.
113  * 4. Deltas are packed tight in the remaining space, i.e. a delta
114  *    may be part of more than one sector. Reads from the logspace
115  *    must be split at sector boundaries, since the trailer is never
116  *    part of a delta. Delta sizes vary.
117  * 5. The field "od_head_lof" points to the start of the dirty part
118  *    of the log, i.e. to the first delta header. Likewise, "od_head_ident"
119  *    is the sequence number where the valid part of the log starts; if
120  *    the sector pointed to by "od_head_lof" has a sector ident different
121  *    from "od_head_ident", the log is empty.
122  * 6. The valid part of the log extends for as many sectors as their ident
123  *    numbers form a contiguous sequence. When reaching the logical end of
124  *    the log, "od_bol_lof", logical offsets wrap around to "od_bol_lof",
125  *    i.e. the log forms a circular buffer.
126  *
127  * For the strategy how to handle accessing the log, item 4. is the
128  * most important one - its consequence is that the log can only be
129  * read in one direction - forward, starting at the head.
130  *
131  * The task of identifying whether a given metadata block is
132  * actually in the log therefore requires reading the entire
133  * log. Doing so is memory-efficient but kills speed if re-done
134  * at every metadata read (64MB log size vs. 512 byte metadata
135  * block size: 128 times as much I/O, possibly only to find out
136  * that this block was not in the log ...).
137  *
138  * First thought to speed this up is to let ufsboot roll the log.
139  * But this is not possible because:
140  * - ufsboot currently does not implement any write functionality,
141  *   the boot-time ufs implementation is read-only.
142  * - firmware write interfaces may or may not be available, in any
143  *   case, they're rarely used and untested for such a purpose.
144  * - that would duplicate a lot of code, since at the moment only
145  *   kernel ufs logging implements log rolling.
146  * - the boot environment cannot be considered high-performance;
147  *   rolling the log there would be slow.
148  * - boot device and root device could well be different, creating
149  *   inconsistencies e.g. with a mirrored root if the log is rolled.
150  *
151  * Therefore, caching the log structural information (boot-relevant
152  * deltas and their logical log offset) is required for fast access
153  * to the data in the log. This code builds a logmap for that purpose.
154  *
155  * As a simple optimization, if we find the log is empty, we will not
156  * use it - log reader support for ufsboot has no noticeable overhead
157  * for clean logs, or for root filesystems that aren't logging.
158  */
159 
160 #define	LB_HASHSHIFT		13
161 #define	LB_HASHSIZE		(1 << LB_HASHSHIFT)
162 #define	LB_HASHFUNC(mof)	(((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))
163 
164 #define	LOGBUF_MAXSIZE	(8*1024*1024)
165 #define	LOGBUF_MINSIZE	(256*1024)
166 
167 #define	LOG_IS_EMPTY	0
168 #define	LOG_IS_OK	1
169 #define	LOG_IS_ERRORED	2
170 
171 /*
172  * We build a hashed logmap of those while scanning the log.
173  * sizeof(lb_map_t) is 40 on 64bit, 32 on 32bit; the max sized
174  * resalloc'ed buffer can accomodate around ~500k of those;
175  * this is approximately the maximum amount of deltas we'll
176  * see if a 64MB ufs log is completely filled. We'll make no
177  * attempt to free and reallocate the resalloc'ed buffer if
178  * we overflow, as conservative sizing should make that an
179  * impossibility. A future enhancement may allocate memory
180  * here as needed - once the boot time memory allocator
181  * supports that.
182  */
183 typedef struct lb_mapentry {
184 	struct lb_mapentry	*l_next;	/* hash chaining */
185 	struct lb_mapentry	*l_prev;	/* hash chaining */
186 	int64_t		l_mof;		/* disk addr this delta is against */
187 	int16_t		l_nb;		/* size of delta */
188 	int16_t		l_flags;
189 	int32_t		l_lof;		/* log offset for delta header */
190 	int32_t		l_tid;		/* transaction this delta is part of */
191 	delta_t		l_typ;		/* see <sys/fs/ufs_trans.h> */
192 } lb_me_t;
193 
194 #define	LB_ISCANCELLED	1
195 
196 #define	inslist(lh, l)	if ((*(lh))) {				\
197 				(*(lh))->l_prev->l_next = (l);	\
198 				(l)->l_next = (*(lh));		\
199 				(l)->l_prev = (*(lh))->l_prev;	\
200 				(*(lh))->l_prev = (l);		\
201 			} else {				\
202 				(l)->l_next = (l);		\
203 				(l)->l_prev = (l);		\
204 				(*(lh)) = l;			\
205 			}
206 
207 #define	remlist(lh, l)	\
208 	if ((l)->l_next == (l)) {			\
209 		if (*(lh) != (l) || (l)->l_prev != (l))	\
210 			dprintf("Logmap hash inconsistency.\n");	\
211 		*(lh) = (lb_me_t *)NULL;		\
212 	} else {					\
213 		if (*(lh) == (l))			\
214 			*(lh) = (l)->l_next;		\
215 		(l)->l_prev->l_next = (l)->l_next;	\
216 		(l)->l_next->l_prev = (l)->l_prev;	\
217 	}
218 
219 #define	lufs_alloc_me()	\
220 	(lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t))
221 
222 extern int		boothowto;
223 static int		ufs_is_lufs = 0;
224 static fileid_t		*logfp = (fileid_t *)NULL;
225 static extent_block_t	*eb = (extent_block_t *)NULL;
226 static ml_odunit_t	odi;
227 
228 #ifndef i386
229 static char		logbuffer_min[LOGBUF_MINSIZE];
230 #endif
231 static caddr_t		logbuffer = (caddr_t)NULL;
232 static caddr_t		elogbuffer = (caddr_t)NULL;
233 static caddr_t		logbuf_curptr;
234 static lb_me_t		**loghash = (lb_me_t **)NULL;
235 static lb_me_t		*lfreelist;
236 
237 static uint32_t		curtid;
238 
239 
240 int	lufs_support = 1;
241 
242 void	lufs_boot_init(fileid_t *);
243 void	lufs_closeall(void);
244 void	lufs_merge_deltas(fileid_t *);
245 
246 static	int	lufs_logscan(void);
247 
248 extern	int	diskread(fileid_t *filep);
249 extern	caddr_t	resalloc(enum RESOURCES, size_t, caddr_t, int);
250 
251 #if defined(i386)
252 #define	LOGBUF_BASEADDR	((caddr_t)(KERNEL_TEXT - LOGBUF_MAXSIZE))
253 #elif defined(__sparcv9)
254 #define	LOGBUF_BASEADDR	((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
255 #endif
256 
257 static int
258 lufs_alloc_logbuf(void)
259 {
260 	/*
261 	 * Allocate memory for caching the log. Since the logbuffer can
262 	 * potentially exceed the boot scratch memory limit, we use resalloc
263 	 * directly, passing the allocation to the low-level boot-time
264 	 * backend allocator. The chosen VA range is the top end of
265 	 * the kernel's segmap segment, so we're not interfering
266 	 * with the kernel because segmap is created at a time when
267 	 * the 2nd-stage boot has already been unloaded and this VA
268 	 * range was given back.
269 	 *
270 	 * On sparc platforms, the kernel cannot recover the memory
271 	 * obtained from resalloc because the page structs are allocated
272 	 * before the call to BOP_QUIESCE. To avoid leaking this
273 	 * memory, the logbuffer is allocated from a small bss array
274 	 * that should hold the logmap except in the most extreme cases.
275 	 * If the bss array is too small, the logbuffer is extended
276 	 * from resalloc 1 page at a time.
277 	 */
278 
279 #ifdef i386
280 	logbuffer = resalloc(RES_CHILDVIRT, LOGBUF_MAXSIZE,
281 			LOGBUF_BASEADDR, 0UL);
282 	elogbuffer = logbuffer+LOGBUF_MAXSIZE;
283 #else
284 	logbuffer = logbuffer_min;
285 	elogbuffer = logbuffer+LOGBUF_MINSIZE;
286 #endif
287 	logbuf_curptr = logbuffer;
288 	lfreelist = (lb_me_t *)NULL;
289 
290 	if (logbuffer == (caddr_t)NULL)
291 		return (0);
292 
293 	dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
294 		logbuffer, elogbuffer-logbuffer);
295 
296 	return (1);
297 }
298 
299 static void
300 lufs_free_logbuf()
301 {
302 	/*
303 	 * Solaris/x86 has no prom_free() routine at this time.
304 	 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
305 	 * is done by the kernel startup itself, in hat_unload_prom()
306 	 * after the bootloader has been quiesced.
307 	 *
308 	 * Solaris on sparc has a prom_free() routine that will update
309 	 *   the memlist properties to reflect the freeing of the
310 	 *   logbuffer. However, the sparc kernel cannot recover
311 	 *   the memory freed after the call to BOP_QUIESCE as the
312 	 *   page struct have already been allocated. We call
313 	 *   prom_free anyway so that the kernel can reclaim this
314 	 *   memory in the future.
315 	 */
316 #ifndef i386
317 	if (logbuffer == LOGBUF_BASEADDR)
318 		prom_free(logbuffer, elogbuffer-logbuffer);
319 #endif
320 	logbuffer = (caddr_t)NULL;
321 }
322 
323 static caddr_t
324 lufs_alloc_from_logbuf(size_t sz)
325 {
326 	caddr_t tmpaddr;
327 	lb_me_t	*l;
328 
329 	/*
330 	 * Satisfy lb_me_t allocations from the freelist
331 	 * first if possible.
332 	 */
333 	if ((sz == sizeof (lb_me_t)) && lfreelist) {
334 		l = lfreelist;
335 		lfreelist = lfreelist->l_next;
336 		return ((caddr_t)l);
337 	}
338 	if (elogbuffer < logbuf_curptr + sz) {
339 #ifdef i386
340 		return ((caddr_t)NULL);
341 #else
342 		caddr_t np;
343 		size_t nsz;
344 
345 		/*
346 		 * Out of space in current chunk - try to add another.
347 		 */
348 		if (logbuffer == logbuffer_min) {
349 			np = LOGBUF_BASEADDR;
350 		} else {
351 			np = elogbuffer;
352 		}
353 		nsz = roundup(sz, PAGESIZE);
354 		if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
355 			return ((caddr_t)NULL);
356 		}
357 
358 		np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
359 		if (np == (caddr_t)NULL) {
360 			return ((caddr_t)NULL);
361 		}
362 		if (logbuffer == logbuffer_min)
363 			logbuffer = LOGBUF_BASEADDR;
364 		logbuf_curptr = np;
365 		elogbuffer = logbuf_curptr + nsz;
366 #endif
367 	}
368 
369 	tmpaddr = logbuf_curptr;
370 	logbuf_curptr += sz;
371 	bzero(tmpaddr, sz);
372 	return (tmpaddr);
373 }
374 
375 static int32_t
376 lufs_read_log(int32_t addr, caddr_t va, int nb)
377 {
378 	int		i, fastpath = 0;
379 	daddr_t		pblk, lblk;
380 	sect_trailer_t	*st;
381 	uint32_t	ident;
382 
383 	/*
384 	 * Fast path for skipping the read if no target buffer
385 	 * is specified. Don't do this for the initial scan.
386 	 */
387 	if (ufs_is_lufs && (va == (caddr_t)NULL))
388 		fastpath = 1;
389 
390 	while (nb) {
391 		/* log wraparound check */
392 		if (addr == odi.od_eol_lof)
393 			addr = odi.od_bol_lof;
394 		if (fastpath)
395 			goto read_done;
396 
397 		/*
398 		 * Translate logically-contiguous log offsets into physical
399 		 * block numbers. For a log consisting of a single extent:
400 		 *	pbno = btodb(addr) - extents[0].lbno;
401 		 * Otherwise, search for the extent which contains addr.
402 		 */
403 		pblk = 0;
404 		lblk = btodb(addr);
405 		for (i = 0; i < eb->nextents; i++) {
406 			if (lblk >= eb->extents[i].lbno &&
407 				lblk < eb->extents[i].lbno +
408 						eb->extents[i].nbno) {
409 				pblk = lblk - eb->extents[i].lbno +
410 					eb->extents[i].pbno;
411 				break;
412 			}
413 		}
414 
415 		if (pblk == 0) {
416 			/*
417 			 * block #0 can never be in a log extent since this
418 			 * block always contains the primary superblock copy.
419 			 */
420 			dprintf("No log extent found for log offset 0x%llx.\n",
421 				addr);
422 			return (0);
423 		}
424 
425 		/*
426 		 * Check whether the block we want is cached from the last
427 		 * read. If not, read it in now.
428 		 */
429 		if (logfp->fi_blocknum != pblk) {
430 			logfp->fi_blocknum = pblk;
431 			logfp->fi_memp = logfp->fi_buf;
432 			logfp->fi_count = DEV_BSIZE;
433 			logfp->fi_offset = 0;
434 			if (diskread(logfp)) {
435 				dprintf("I/O error reading the ufs log" \
436 					" at block 0x%x.\n",
437 					logfp->fi_blocknum);
438 				return (0);
439 			}
440 			/*
441 			 * Log structure verification. The block which we just
442 			 * read has an ident number that must match its offset
443 			 * in blocks from the head of the log. Since the log
444 			 * can wrap around, we have to check for that to get the
445 			 * ident right. Out-of-sequence idents can happen after
446 			 * power failures, panics during a partial transaction,
447 			 * media errors, ... - in any case, they mark the end of
448 			 * the valid part of the log.
449 			 */
450 			st = (sect_trailer_t *)(logfp->fi_memp +
451 						LDL_USABLE_BSIZE);
452 			/* od_head_ident is where the sequence starts */
453 			ident = odi.od_head_ident;
454 			if (lblk >= lbtodb(odi.od_head_lof)) {
455 				/* no wraparound */
456 				ident += (lblk - lbtodb(odi.od_head_lof));
457 			} else {
458 				/* log wrapped around the end */
459 				ident += (lbtodb(odi.od_eol_lof) -
460 					    lbtodb(odi.od_head_lof));
461 				ident += (lblk - lbtodb(odi.od_bol_lof));
462 			}
463 
464 			if (ident != st->st_ident)
465 				return (0);
466 		}
467 read_done:
468 		/*
469 		 * Copy the delta contents to the destination buffer if
470 		 * one was specified. Otherwise, just skip the contents.
471 		 */
472 		i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
473 		if (va != NULL) {
474 			bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
475 				va, i);
476 			va += i;
477 		}
478 		nb -= i;
479 		addr += i;
480 		/*
481 		 * Skip sector trailer if necessary.
482 		 */
483 		if (NB_LEFT_IN_SECTOR(addr) == 0)
484 			addr += sizeof (sect_trailer_t);
485 	}
486 	return (addr);
487 }
488 
489 void
490 lufs_boot_init(fileid_t *filep)
491 {
492 	struct fs *sb = (struct fs *)filep->fi_memp;
493 	int err = 0;
494 
495 	/*
496 	 * boot_ufs_mountroot() should have called us with a
497 	 * filep pointing to the superblock. Verify that this
498 	 * is so first.
499 	 * Then check whether this filesystem has a dirty log.
500 	 * Also return if lufs support was disabled on request.
501 	 */
502 	if (!lufs_support ||
503 		sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
504 		sb->fs_clean != FSLOG || sb->fs_logbno == NULL) {
505 		return;
506 	}
507 
508 	if (boothowto & RB_VERBOSE)
509 		printf("The boot filesystem is logging.\n");
510 
511 	/*
512 	 * The filesystem is logging, there is a log area
513 	 * allocated for it. Check the log state and determine
514 	 * whether it'll be possible to use this log.
515 	 */
516 
517 	/*
518 	 * Allocate a private fileid_t for use when reading
519 	 * from the log.
520 	 */
521 	eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
522 	logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
523 	logfp->fi_memp = logfp->fi_buf;
524 	logfp->fi_devp = filep->fi_devp;
525 
526 	/*
527 	 * Read the extent block and verify that what we
528 	 * find there are actually lufs extents.
529 	 * Make it simple: the extent block including all
530 	 * extents cannot be larger than a filesystem block.
531 	 * So read a whole filesystem block, to make sure
532 	 * we have read all extents in the same operation.
533 	 */
534 	logfp->fi_blocknum = sb->fs_logbno;
535 	logfp->fi_count = sb->fs_bsize;
536 	logfp->fi_memp = (caddr_t)eb;
537 	logfp->fi_offset = 0;
538 	if (diskread(logfp) || eb->type != LUFS_EXTENTS) {
539 		dprintf("Failed to read log extent block.\n");
540 		err = LOG_IS_ERRORED;
541 		goto out;
542 	}
543 
544 	/*
545 	 * Read the on disk log header. If that fails,
546 	 * try the backup copy on the adjacent block.
547 	 */
548 	logfp->fi_blocknum = eb->extents[0].pbno;
549 	logfp->fi_count = sizeof (ml_odunit_t);
550 	logfp->fi_memp = (caddr_t)&odi;
551 	logfp->fi_offset = 0;
552 	if (diskread(logfp)) {
553 		logfp->fi_blocknum = eb->extents[0].pbno + 1;
554 		logfp->fi_count = sizeof (ml_odunit_t);
555 		logfp->fi_memp = (caddr_t)&odi;
556 		logfp->fi_offset = 0;
557 		if (diskread(logfp)) {
558 			dprintf("Failed to read on-disk log header.\n");
559 			err = LOG_IS_ERRORED;
560 			goto out;
561 		}
562 	}
563 
564 	/*
565 	 * Verify that we understand this log, and
566 	 * that the log isn't bad or empty.
567 	 */
568 	if (odi.od_version != LUFS_VERSION_LATEST) {
569 		dprintf("On-disk log format v%d != supported format v%d.\n",
570 			odi.od_version, LUFS_VERSION_LATEST);
571 		err = LOG_IS_ERRORED;
572 	} else if (odi.od_badlog) {
573 		dprintf("On-disk log is marked bad.\n");
574 		err = LOG_IS_ERRORED;
575 	} else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
576 		dprintf("On-disk log checksum %d != ident sum %d.\n",
577 			odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
578 		err = LOG_IS_ERRORED;
579 	} else {
580 		/*
581 		 * All consistency checks ok. Scan the log, build the
582 		 * log hash. If this succeeds we'll be using the log
583 		 * when reading from this filesystem.
584 		 */
585 		err = lufs_logscan();
586 	}
587 out:
588 	ufs_is_lufs = 1;
589 	switch (err) {
590 	case LOG_IS_EMPTY:
591 		if (boothowto & RB_VERBOSE)
592 			printf("The ufs log is empty and will not be used.\n");
593 		lufs_closeall();
594 		break;
595 	case LOG_IS_OK:
596 		if (boothowto & RB_VERBOSE)
597 			printf("Using the ufs log.\n");
598 		break;
599 	case LOG_IS_ERRORED:
600 		if (boothowto & RB_VERBOSE)
601 			printf("Couldn't build log hash. Can't use ufs log.\n");
602 		lufs_closeall();
603 		break;
604 	default:
605 		dprintf("Invalid error %d while scanning the ufs log.\n", err);
606 		break;
607 	}
608 }
609 
610 static int
611 lufs_logscan_read(int32_t *addr, struct delta *d)
612 {
613 	*addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
614 
615 	if (*addr == 0 ||
616 	    d->d_typ < DT_NONE || d->d_typ > DT_MAX ||
617 	    d->d_nb >= odi.od_logsize)
618 		return (0);
619 
620 	return (1);
621 }
622 
623 static int
624 lufs_logscan_skip(int32_t *addr, struct delta *d)
625 {
626 	switch (d->d_typ) {
627 	case DT_COMMIT:
628 		/*
629 		 * A DT_COMMIT delta has no size as such, but will
630 		 * always "fill up" the sector that contains it.
631 		 * The next delta header is found at the beginning
632 		 * of the next 512-Bytes sector, adjust "addr" to
633 		 * reflect that.
634 		 */
635 		*addr += ((*addr & (DEV_BSIZE - 1))) ?
636 			NB_LEFT_IN_SECTOR(*addr) +
637 			sizeof (sect_trailer_t) : 0;
638 		return (1);
639 	case DT_CANCEL:
640 	case DT_ABZERO:
641 		/*
642 		 * These types of deltas occupy no space in the log
643 		 */
644 		return (1);
645 	default:
646 		/*
647 		 * Skip over the delta contents.
648 		 */
649 		*addr = lufs_read_log(*addr, NULL, d->d_nb);
650 	}
651 
652 	return (*addr != NULL);
653 }
654 
655 static void
656 lufs_logscan_freecancel(void)
657 {
658 	lb_me_t		**lh, *l, *lnext;
659 	int		i;
660 
661 	/*
662 	 * Walk the entire log hash and put cancelled entries
663 	 * onto the freelist. Corner cases:
664 	 * a) empty hash chain (*lh == NULL)
665 	 * b) only one entry in chain, and that is cancelled.
666 	 *    If for every cancelled delta another one would've
667 	 *    been added, this situation couldn't occur, but a
668 	 *    DT_CANCEL delta can lead to this as it is never
669 	 *    added.
670 	 */
671 	for (i = 0; i < LB_HASHSIZE; i++) {
672 		lh = &loghash[i];
673 		l = *lh;
674 		do {
675 			if (*lh == (lb_me_t *)NULL)
676 				break;
677 			lnext = l->l_next;
678 			if (l->l_flags & LB_ISCANCELLED) {
679 				remlist(lh, l);
680 				bzero((caddr_t)l, sizeof (lb_me_t));
681 				l->l_next = lfreelist;
682 				lfreelist = l;
683 				/*
684 				 * Just removed the hash head. In order not
685 				 * to terminate the while loop, respin chain
686 				 * walk for this hash chain.
687 				 */
688 				if (lnext == *lh) {
689 					i--;
690 					break;
691 				}
692 			}
693 			l = lnext;
694 		} while (l != *lh);
695 	}
696 }
697 
698 static int
699 lufs_logscan_addmap(int32_t *addr, struct delta *d)
700 {
701 	lb_me_t		**lh, *l;
702 
703 	switch (d->d_typ) {
704 	case DT_COMMIT:
705 		/*
706 		 * Handling DT_COMMIT deltas is special. We need to:
707 		 * 1. increase the transaction ID
708 		 * 2. remove cancelled entries.
709 		 */
710 		lufs_logscan_freecancel();
711 		curtid++;
712 		break;
713 	case DT_INODE:
714 		/*
715 		 * Deltas against parts of on-disk inodes are
716 		 * assumed to be timestamps. Ignore those.
717 		 */
718 		if (d->d_nb != sizeof (struct dinode))
719 			break;
720 		/* FALLTHROUGH */
721 	case DT_CANCEL:
722 	case DT_ABZERO:
723 	case DT_AB:
724 	case DT_DIR:
725 	case DT_FBI:
726 		/*
727 		 * These types of deltas contain and/or modify structural
728 		 * information that is needed for booting the system:
729 		 * - where to find a file (DT_DIR, DT_FBI)
730 		 * - the file itself (DT_INODE)
731 		 * - data blocks associated with a file (DT_AB, DT_ABZERO)
732 		 *
733 		 * Building the hash chains becomes complicated because there
734 		 * may exist an older (== previously added) entry that overlaps
735 		 * with the one we want to add.
736 		 * Four cases must be distinguished:
737 		 * 1. The new delta is an exact match for an existing one,
738 		 *    or is a superset of an existing one, and both
739 		 *    belong to the same transaction.
740 		 *    The new delta completely supersedes the old one, so
741 		 *    remove that and reuse the structure for the new.
742 		 *    Then add the new delta to the head of the hashchain.
743 		 * 2. The new delta is an exact match for an existing one,
744 		 *    or is a superset of an existing one, but the two
745 		 *    belong to different transactions (i.e. the old one is
746 		 *    committed).
747 		 *    The existing one is marked to be cancelled when the
748 		 *    next DT_COMMIT record is found, and the hash chain
749 		 *    walk is continued as there may be more existing entries
750 		 *    found which overlap the new delta (happens if that is
751 		 *    a superset of those in the log).
752 		 *    Once no more overlaps are found, goto 4.
753 		 * 3. An existing entry completely covers the new one.
754 		 *    The new delta is then added directly before this
755 		 *    existing one.
756 		 * 4. No (more) overlaps with existing entries are found.
757 		 *    Unless this is a DT_CANCEL delta, whose only purpose
758 		 *    is already handled by marking overlapping entries for
759 		 *    cancellation, add the new delta at the hash chain head.
760 		 *
761 		 * This strategy makes sure that the hash chains are properly
762 		 * ordered. lufs_merge_deltas() walks the hash chain backward,
763 		 * which then ensures that delta merging is done in the same
764 		 * order as those deltas occur in the log - remember, the
765 		 * log can only be read in one direction.
766 		 *
767 		 */
768 		lh = &loghash[LB_HASHFUNC(d->d_mof)];
769 		l = *lh;
770 		do {
771 			if (l == (lb_me_t *)NULL)
772 				break;
773 			/*
774 			 * This covers the first two cases above.
775 			 * If this is a perfect match from the same transaction,
776 			 * and it isn't already cancelled, we simply replace it
777 			 * with its newer incarnation.
778 			 * Otherwise, mark it for cancellation. Handling of
779 			 * DT_COMMIT is going to remove it, then.
780 			 */
781 			if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
782 				if (!(l->l_flags & LB_ISCANCELLED)) {
783 					if (l->l_tid == curtid &&
784 					    d->d_typ != DT_CANCEL) {
785 						remlist(lh, l);
786 						l->l_mof = d->d_mof;
787 						l->l_lof = *addr;
788 						l->l_nb = d->d_nb;
789 						l->l_typ = d->d_typ;
790 						l->l_flags = 0;
791 						l->l_tid = curtid;
792 						inslist(lh, l);
793 						return (1);
794 					} else {
795 						/*
796 						 * 2nd case - cancel only.
797 						 */
798 						l->l_flags |= LB_ISCANCELLED;
799 					}
800 				}
801 			} else if (WITHIN(d->d_mof, d->d_nb,
802 					l->l_mof, l->l_nb)) {
803 				/*
804 				 * This is the third case above.
805 				 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
806 				 * this may happen - an existing previous delta
807 				 * is larger than the current one we're planning
808 				 * to add - DT_ABZERO deltas are supersets of
809 				 * DT_AB deltas, and likewise DT_FBI/DT_DIR.
810 				 * In order to do merging correctly, such deltas
811 				 * put up a barrier for new ones that overlap,
812 				 * and we have to add the new delta immediately
813 				 * before (!) the existing one.
814 				 */
815 				lb_me_t *newl;
816 				newl = lufs_alloc_me();
817 				if (newl == (lb_me_t *)NULL) {
818 					/*
819 					 * No memory. Throw away everything
820 					 * and try booting without logging
821 					 * support.
822 					 */
823 					curtid = 0;
824 					return (0);
825 				}
826 				newl->l_mof = d->d_mof;
827 				newl->l_lof = *addr;	/* "payload" address */
828 				newl->l_nb = d->d_nb;
829 				newl->l_typ = d->d_typ;
830 				newl->l_tid = curtid;
831 				newl->l_prev = l->l_prev;
832 				newl->l_next = l;
833 				l->l_prev->l_next = newl;
834 				l->l_prev = newl;
835 				if (*lh == l)
836 					*lh = newl;
837 				return (1);
838 			}
839 			l = l->l_next;
840 		} while (l != *lh);
841 
842 		/*
843 		 * This is case 4., add a new delta at the head of the chain.
844 		 *
845 		 * If the new delta is a DT_CANCEL entry, we handled it by
846 		 * marking everything it covered for cancellation. We can
847 		 * get by without actually adding the delta itself to the
848 		 * hash, as it'd need to be removed by the commit code anyway.
849 		 */
850 		if (d->d_typ == DT_CANCEL)
851 			break;
852 
853 		l = lufs_alloc_me();
854 		if (l == (lb_me_t *)NULL) {
855 			/*
856 			 * No memory. Throw away everything
857 			 * and try booting without logging
858 			 * support.
859 			 */
860 			curtid = 0;
861 			return (0);
862 		}
863 		l->l_mof = d->d_mof;
864 		l->l_lof = *addr;	/* this is the "payload" address */
865 		l->l_nb = d->d_nb;
866 		l->l_typ = d->d_typ;
867 		l->l_tid = curtid;
868 		inslist(lh, l);
869 		break;
870 	default:
871 		break;
872 	}
873 	return (1);
874 }
875 
876 static int
877 lufs_logscan_prescan(void)
878 {
879 	/*
880 	 * Simulate a full log by setting the tail to be one sector
881 	 * behind the head. This will make the logscan read all
882 	 * of the log until an out-of-sequence sector ident is
883 	 * found.
884 	 */
885 	odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
886 	if (odi.od_tail_lof < odi.od_bol_lof)
887 		odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
888 	if (odi.od_tail_lof >= odi.od_eol_lof)
889 		odi.od_tail_lof = odi.od_bol_lof;
890 
891 	/*
892 	 * While sector trailers maintain TID values, od_head_tid
893 	 * is not being updated by the kernel ufs logging support
894 	 * at this time. We therefore count transactions ourselves
895 	 * starting at zero - as does the kernel ufs logscan code.
896 	 */
897 	curtid = 0;
898 
899 	if (!lufs_alloc_logbuf()) {
900 		dprintf("Failed to allocate log buffer.\n");
901 		return (0);
902 	}
903 
904 	loghash = (lb_me_t **)lufs_alloc_from_logbuf(
905 				LB_HASHSIZE * sizeof (lb_me_t *));
906 	if (loghash == (lb_me_t **)NULL) {
907 		dprintf("Can't allocate loghash[] array.");
908 		return (0);
909 	}
910 	return (1);
911 }
912 
913 /*
914  * This function must remove all uncommitted entries (l->l_tid == curtid)
915  * from the log hash. Doing this, we implicitly delete pending cancellations
916  * as well.
917  * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
918  * the check for entries that need to be removed is different.
919  */
920 static void
921 lufs_logscan_postscan(void)
922 {
923 	lb_me_t	**lh, *l, *lnext;
924 	int	i;
925 
926 	for (i = 0; i < LB_HASHSIZE; i++) {
927 		lh = &loghash[i];
928 		l = *lh;
929 		do {
930 			if (l == (lb_me_t *)NULL)
931 				break;
932 			lnext = l->l_next;
933 			if (l->l_tid == curtid) {
934 				remlist(lh, l);
935 				bzero((caddr_t)l, sizeof (lb_me_t));
936 				l->l_next = lfreelist;
937 				lfreelist = l;
938 				if (*lh == (lb_me_t *)NULL)
939 					break;
940 				/*
941 				 * Just removed the hash head. In order not
942 				 * to terminate the while loop, respin chain
943 				 * walk for this hash chain.
944 				 */
945 				if (lnext == *lh) {
946 					i--;
947 					break;
948 				}
949 			} else {
950 				l->l_flags &= ~(LB_ISCANCELLED);
951 			}
952 			l = lnext;
953 		} while (l != *lh);
954 	}
955 }
956 
957 /*
958  * This function builds the log hash. It performs the same sequence
959  * of actions at logscan as the kernel ufs logging support:
960  * - Prepare the log for scanning by simulating a full log.
961  * - As long as sectors read from the log have contiguous idents, do:
962  *	read the delta header
963  *	add the delta to the logmap
964  *	skip over the contents to the start of the next delta header
965  * - After terminating the scan, remove uncommitted entries.
966  *
967  * This function cannot fail except if mapping the logbuffer area
968  * during lufs_logscan_prescan() fails. If there is a structural
969  * integrity problem and the on-disk log cannot be read, we'll
970  * treat this as the same situation as an uncommitted transaction
971  * at the end of the log (or, corner case of that, an empty log
972  * with no committed transactions in it at all).
973  *
974  */
975 static int
976 lufs_logscan(void)
977 {
978 	int32_t		addr;
979 	struct delta	d;
980 
981 	if (!lufs_logscan_prescan())
982 		return (LOG_IS_ERRORED);
983 
984 	addr = odi.od_head_lof;
985 
986 	/*
987 	 * Note that addr == od_tail_lof means a completely filled
988 	 * log. This almost never happens, so the common exit path
989 	 * from this loop is via one of the 'break's.
990 	 */
991 	while (addr != odi.od_tail_lof) {
992 		if (!lufs_logscan_read(&addr, &d))
993 			break;
994 		if (!lufs_logscan_addmap(&addr, &d))
995 			return (LOG_IS_ERRORED);
996 		if (!lufs_logscan_skip(&addr, &d))
997 			break;
998 	}
999 
1000 	lufs_logscan_postscan();
1001 	/*
1002 	 * Check whether the log contains data, and if so whether
1003 	 * it contains committed data.
1004 	 */
1005 	if (addr == odi.od_head_lof || curtid == 0) {
1006 		return (LOG_IS_EMPTY);
1007 	}
1008 	return (LOG_IS_OK);
1009 }
1010 
1011 /*
1012  * A metadata block was read from disk. Check whether the logmap
1013  * has a delta against this byte range, and if so read it in, since
1014  * the data in the log is more recent than what was read from other
1015  * places on the disk.
1016  */
1017 void
1018 lufs_merge_deltas(fileid_t *fp)
1019 {
1020 	int		nb;
1021 	int64_t		bof;
1022 	lb_me_t		**lh, *l;
1023 	int32_t		skip;
1024 
1025 	/*
1026 	 * No logmap: Empty log. Nothing to do here.
1027 	 */
1028 	if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
1029 		return;
1030 
1031 	bof = ldbtob(fp->fi_blocknum);
1032 	nb = fp->fi_count;
1033 
1034 	/*
1035 	 * Search the log hash.
1036 	 * Merge deltas if an overlap is found.
1037 	 */
1038 
1039 	lh = &loghash[LB_HASHFUNC(bof)];
1040 
1041 	if (*lh == (lb_me_t *)NULL)
1042 		return;
1043 
1044 	l = *lh;
1045 
1046 	do {
1047 		l = l->l_prev;
1048 		if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
1049 			/*
1050 			 * Found a delta in the log hash which overlaps
1051 			 * with the current metadata block. Read the
1052 			 * actual delta payload from the on-disk log
1053 			 * directly into the file buffer.
1054 			 */
1055 			if (l->l_typ != DT_ABZERO) {
1056 				/*
1057 				 * We have to actually read this part of the
1058 				 * log as it could contain a sector trailer, or
1059 				 * wrap around the end of the log.
1060 				 * If it did, the second offset generation would
1061 				 * be incorrect if we'd started at l->l_lof.
1062 				 */
1063 				if (!(skip = lufs_read_log(l->l_lof, NULL,
1064 					MAX(bof - l->l_mof, 0))))
1065 					dprintf("scan/merge error, pre-skip\n");
1066 				if (!(skip = lufs_read_log(skip,
1067 					fp->fi_memp + MAX(l->l_mof - bof, 0),
1068 					MIN(l->l_mof + l->l_nb, bof + nb) -
1069 					MAX(l->l_mof, bof))))
1070 					dprintf("scan/merge error, merge\n");
1071 			} else {
1072 				/*
1073 				 * DT_ABZERO requires no disk access, just
1074 				 * clear the byte range which overlaps with
1075 				 * the delta.
1076 				 */
1077 				bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
1078 					MIN(l->l_mof + l->l_nb, bof + nb) -
1079 					MAX(l->l_mof, bof));
1080 			}
1081 		}
1082 	} while (l->l_prev != (*lh)->l_prev);
1083 
1084 	printf("*\b");
1085 }
1086 
1087 void
1088 lufs_closeall(void)
1089 {
1090 	if (ufs_is_lufs) {
1091 		bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
1092 		bkmem_free((char *)logfp, sizeof (fileid_t));
1093 		eb = (extent_block_t *)NULL;
1094 		bzero((caddr_t)&odi, sizeof (ml_odunit_t));
1095 		logfp = (fileid_t *)NULL;
1096 		lufs_free_logbuf();
1097 		ufs_is_lufs = 0;
1098 	}
1099 }
1100