1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/param.h>
27 #include <sys/vnode.h>
28 #include <sys/fs/ufs_fsdir.h>
29 #include <sys/fs/ufs_fs.h>
30 #include <sys/fs/ufs_inode.h>
31 #include <sys/fs/ufs_log.h>
32 #include <sys/sysmacros.h>
33 #include <sys/promif.h>
34 #include <sys/machparam.h>
35
36 #include <sys/stat.h>
37 #include <sys/bootdebug.h>
38 #include <sys/salib.h>
39 #include <sys/saio.h>
40 #include <sys/filep.h>
41
42
43 /*
44 * Big theory statement on how ufsboot makes use of the log
45 * in case the filesystem wasn't shut down cleanly.
46 *
47 * The structure of the ufs on-disk log looks like this:
48 *
49 * +-----------------+
50 * | SUPERBLOCK |
51 * | ... |
52 * | fs_logbno +--> +-----------------------+
53 * | ... | | EXTENT BLOCK |
54 * +-----------------+ | ... |
55 * | nextents |
56 * +----------------------+ extents[0].pbno |
57 * | | { extents[1].pbno } +------------+
58 * | | ... +--> ... |
59 * | +-----------------------+ |
60 * v |
61 * +-----------------------------+ \ |
62 * | ON-DISK LOG HEADER | | |
63 * | ... | | |
64 * | od_head_lof +--+ | |
65 * | ... | | | |
66 * +-----------------------------+ <|---|- od_bol_lof |
67 * | sector (may contain deltas) | | | (logical offset) |
68 * | +-------------------------+ | | |
69 * | | trailer (some ident#) | | > extents[0].nbno |
70 * +---+-------------------------+ | | blocks ("sectors") |
71 * . . | | |
72 * . . | | |
73 * +-----------------------------+<-+ | |
74 * | delta1 delta2 delta3 | | |
75 * | d +-------------------------+ | |
76 * | e | ident#: od_head_ident | | |
77 * +---+-------------------------+ / |
78 * |
79 * +-----------------------------+ <---------------------------+
80 * | lta4 delta5 delta6 de |
81 * | l +-------------------------+
82 * | t | ident#: od_head_ident+1 |
83 * +---+-------------------------+
84 * . .
85 * +-----------------------------+
86 * | sector (may contain deltas) |
87 * | +------------------+
88 * | | trailer (ident#) |
89 * +----------+------------------+ <-- od_eol_lof (logical offset)
90 *
91 * The ufs on-disk log has the following properties:
92 *
93 * 1. The log is made up from at least one extent. "fs_logbno" in
94 * the superblock points to where this is found.
95 * 2. Extents describe the logical layout.
96 * - Logical offset 0 is the on-disk log header. It's also
97 * at the beginning of the first physical block.
98 * - If there's more than one extent, the equation holds:
99 * extent[i+1].lbno == extent[i].lbno + extent[i].nbno
100 * i.e. logical offsets form a contiguous sequence. Yet on disk,
101 * two logically-adjacent offsets may be located in two
102 * physically disjoint extents, so logical offsets need to be
103 * translated into physical disk block addresses for access.
104 * - Various fields in the on-disk log header structure refer
105 * to such logical log offsets.
 * 3. The actual logical logspace begins after the log header, at
 * the logical offset indicated by "od_bol_lof". The logspace is
 * divided into 512-byte units ("sectors" in terms of ufs logging),
 * and the end of every sector is occupied by a sector trailer which
 * contains a sequence number, the sector ident.
110 * 4. Deltas are packed tight in the remaining space, i.e. a delta
111 * may be part of more than one sector. Reads from the logspace
112 * must be split at sector boundaries, since the trailer is never
113 * part of a delta. Delta sizes vary.
114 * 5. The field "od_head_lof" points to the start of the dirty part
115 * of the log, i.e. to the first delta header. Likewise, "od_head_ident"
116 * is the sequence number where the valid part of the log starts; if
117 * the sector pointed to by "od_head_lof" has a sector ident different
118 * from "od_head_ident", the log is empty.
 * 6. The valid part of the log extends for as many sectors as their ident
 * numbers form a contiguous sequence. When reaching the logical end of
 * the log, "od_eol_lof", logical offsets wrap around to "od_bol_lof",
 * i.e. the log forms a circular buffer.
123 *
124 * For the strategy how to handle accessing the log, item 4. is the
125 * most important one - its consequence is that the log can only be
126 * read in one direction - forward, starting at the head.
127 *
 * The task of identifying whether a given metadata block is
 * actually in the log therefore requires reading the entire
 * log. Doing so is memory-efficient but kills speed if re-done
 * at every metadata read (64MB log size vs. 512 byte metadata
 * block size: up to 128K times as much I/O, possibly only to
 * find out that this block was not in the log ...).
134 *
135 * First thought to speed this up is to let ufsboot roll the log.
136 * But this is not possible because:
137 * - ufsboot currently does not implement any write functionality,
138 * the boot-time ufs implementation is read-only.
139 * - firmware write interfaces may or may not be available, in any
140 * case, they're rarely used and untested for such a purpose.
141 * - that would duplicate a lot of code, since at the moment only
142 * kernel ufs logging implements log rolling.
143 * - the boot environment cannot be considered high-performance;
144 * rolling the log there would be slow.
145 * - boot device and root device could well be different, creating
146 * inconsistencies e.g. with a mirrored root if the log is rolled.
147 *
148 * Therefore, caching the log structural information (boot-relevant
149 * deltas and their logical log offset) is required for fast access
150 * to the data in the log. This code builds a logmap for that purpose.
151 *
152 * As a simple optimization, if we find the log is empty, we will not
153 * use it - log reader support for ufsboot has no noticeable overhead
154 * for clean logs, or for root filesystems that aren't logging.
155 */
156
/*
 * Logmap hash geometry. LB_HASHFUNC() drops the low LB_HASHSHIFT bits of
 * a disk byte address ("mof") and masks the result to the table size, so
 * all addresses inside the same 8K-aligned range (1 << 13 bytes) select
 * the same one of the LB_HASHSIZE hash chains.
 */
#define LB_HASHSHIFT 13
#define LB_HASHSIZE (1 << LB_HASHSHIFT)
#define LB_HASHFUNC(mof) (((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))

/*
 * LOGBUF_MINSIZE is the size of the static array the logmap is carved
 * from initially; LOGBUF_MAXSIZE bounds the resalloc'ed region that
 * lufs_alloc_from_logbuf() may grow into once that array is used up.
 */
#define LOGBUF_MAXSIZE (8*1024*1024)
#define LOGBUF_MINSIZE (256*1024)

/*
 * Result codes of lufs_logscan(), acted upon by lufs_boot_init().
 */
#define LOG_IS_EMPTY 0
#define LOG_IS_OK 1
#define LOG_IS_ERRORED 2
167
168 /*
169 * We build a hashed logmap of those while scanning the log.
 * sizeof(lb_me_t) is 40 on 64bit, 32 on 32bit; the max sized
 * resalloc'ed buffer can accommodate around 500k of those;
172 * this is approximately the maximum amount of deltas we'll
173 * see if a 64MB ufs log is completely filled. We'll make no
174 * attempt to free and reallocate the resalloc'ed buffer if
175 * we overflow, as conservative sizing should make that an
176 * impossibility. A future enhancement may allocate memory
177 * here as needed - once the boot time memory allocator
178 * supports that.
179 */
/*
 * One logmap entry per cached delta. Entries that hash to the same
 * bucket are linked into a circular, doubly-linked chain through
 * l_next/l_prev; entries on the free list are singly linked via l_next.
 */
typedef struct lb_mapentry {
	struct lb_mapentry *l_next; /* hash chaining */
	struct lb_mapentry *l_prev; /* hash chaining */
	int64_t l_mof; /* disk addr this delta is against */
	/*
	 * NOTE(review): l_nb is assigned from the delta header's d_nb
	 * without a range check; confirm no delta exceeds what an
	 * int16_t can hold.
	 */
	int16_t l_nb; /* size of delta */
	int16_t l_flags; /* LB_ISCANCELLED or 0 */
	int32_t l_lof; /* log offset for delta header */
	int32_t l_tid; /* transaction this delta is part of */
	delta_t l_typ; /* see <sys/fs/ufs_trans.h> */
} lb_me_t;
190
/*
 * l_flags bit: the entry has been superseded by a newer delta and is
 * released by lufs_logscan_freecancel() when the next DT_COMMIT is seen.
 */
#define LB_ISCANCELLED 1

/*
 * inslist(lh, l): link entry "l" into the circular, doubly-linked chain
 * whose head pointer is "*lh".
 *
 * For a non-empty chain the entry is linked in directly in front of the
 * current head entry (i.e. between the old last entry and the head) and
 * "*lh" is left untouched; for an empty chain "l" becomes the only
 * entry and "*lh" is set to it.
 *
 * Both arguments are evaluated more than once, so they must be free of
 * side effects. The body is wrapped in do { } while (0) so that the
 * macro behaves like a single statement, also as the unbraced body of
 * an if/else.
 */
#define inslist(lh, l) do { \
	if ((*(lh))) { \
		(*(lh))->l_prev->l_next = (l); \
		(l)->l_next = (*(lh)); \
		(l)->l_prev = (*(lh))->l_prev; \
		(*(lh))->l_prev = (l); \
	} else { \
		(l)->l_next = (l); \
		(l)->l_prev = (l); \
		(*(lh)) = (l); \
	} \
} while (0)

/*
 * remlist(lh, l): unlink entry "l" from the chain headed by "*lh".
 *
 * If "l" is the only entry the chain becomes empty ("*lh" is set to
 * NULL); a debug message is printed when the head pointer or l_prev
 * disagree with that. Otherwise "l" is unlinked and, if it was the head,
 * "*lh" is advanced to its successor. The l_next/l_prev members of "l"
 * itself are not modified.
 *
 * Both arguments are evaluated more than once.
 */
#define remlist(lh, l) do { \
	if ((l)->l_next == (l)) { \
		if (*(lh) != (l) || (l)->l_prev != (l)) \
			dprintf("Logmap hash inconsistency.\n"); \
		*(lh) = (lb_me_t *)NULL; \
	} else { \
		if (*(lh) == (l)) \
			*(lh) = (l)->l_next; \
		(l)->l_prev->l_next = (l)->l_next; \
		(l)->l_next->l_prev = (l)->l_prev; \
	} \
} while (0)

/*
 * Allocate one logmap entry; yields NULL when the log buffer cannot
 * satisfy the request.
 */
#define lufs_alloc_me() \
	(lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t))
218
extern int boothowto;

/* Set by lufs_boot_init(), cleared again by lufs_closeall(). */
static int ufs_is_lufs = 0;
/* Private fileid_t used for all reads from the on-disk log. */
static fileid_t *logfp = (fileid_t *)NULL;
/* In-memory copy of the log's extent block. */
static extent_block_t *eb = (extent_block_t *)NULL;
/* In-memory copy of the on-disk log header. */
static ml_odunit_t odi;

/* Initial backing store for the logmap, see lufs_alloc_logbuf(). */
static char logbuffer_min[LOGBUF_MINSIZE];
/* Start and end of the current log buffer; NULL once it was released. */
static caddr_t logbuffer = (caddr_t)NULL;
static caddr_t elogbuffer = (caddr_t)NULL;
/* Next unused byte inside the log buffer. */
static caddr_t logbuf_curptr;
/* Hash table of LB_HASHSIZE chain heads, allocated from the log buffer. */
static lb_me_t **loghash = (lb_me_t **)NULL;
/* Released logmap entries, singly linked through l_next. */
static lb_me_t *lfreelist;

/* Transaction counter; incremented for every DT_COMMIT seen in the scan. */
static uint32_t curtid;


/* When zero, lufs_boot_init() returns without looking at the log. */
int lufs_support = 1;

void lufs_boot_init(fileid_t *);
void lufs_closeall(void);
void lufs_merge_deltas(fileid_t *);

static int lufs_logscan(void);

extern int diskread(fileid_t *filep);
extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int);

/*
 * Base of the virtual address range the log buffer is extended into.
 * NOTE(review): only defined for __sparcv9 here, yet used unconditionally
 * by lufs_free_logbuf() and lufs_alloc_from_logbuf() - confirm that this
 * file is only built for that platform.
 */
#if defined(__sparcv9)
#define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
#endif
249
250 static int
lufs_alloc_logbuf(void)251 lufs_alloc_logbuf(void)
252 {
253 /*
254 * Allocate memory for caching the log. Since the logbuffer can
255 * potentially exceed the boot scratch memory limit, we use resalloc
256 * directly, passing the allocation to the low-level boot-time
257 * backend allocator. The chosen VA range is the top end of
258 * the kernel's segmap segment, so we're not interfering
259 * with the kernel because segmap is created at a time when
260 * the 2nd-stage boot has already been unloaded and this VA
261 * range was given back.
262 *
263 * On sparc platforms, the kernel cannot recover the memory
264 * obtained from resalloc because the page structs are allocated
265 * before the call to BOP_QUIESCE. To avoid leaking this
266 * memory, the logbuffer is allocated from a small bss array
267 * that should hold the logmap except in the most extreme cases.
268 * If the bss array is too small, the logbuffer is extended
269 * from resalloc 1 page at a time.
270 */
271
272 logbuffer = logbuffer_min;
273 elogbuffer = logbuffer+LOGBUF_MINSIZE;
274 logbuf_curptr = logbuffer;
275 lfreelist = (lb_me_t *)NULL;
276
277 if (logbuffer == (caddr_t)NULL)
278 return (0);
279
280 dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
281 logbuffer, elogbuffer-logbuffer);
282
283 return (1);
284 }
285
286 static void
lufs_free_logbuf()287 lufs_free_logbuf()
288 {
289 /*
290 * Solaris/x86 has no prom_free() routine at this time.
291 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
292 * is done by the kernel startup itself, in hat_unload_prom()
293 * after the bootloader has been quiesced.
294 *
295 * Solaris on sparc has a prom_free() routine that will update
296 * the memlist properties to reflect the freeing of the
297 * logbuffer. However, the sparc kernel cannot recover
298 * the memory freed after the call to BOP_QUIESCE as the
299 * page struct have already been allocated. We call
300 * prom_free anyway so that the kernel can reclaim this
301 * memory in the future.
302 */
303 if (logbuffer == LOGBUF_BASEADDR)
304 prom_free(logbuffer, elogbuffer-logbuffer);
305 logbuffer = (caddr_t)NULL;
306 }
307
308 static caddr_t
lufs_alloc_from_logbuf(size_t sz)309 lufs_alloc_from_logbuf(size_t sz)
310 {
311 caddr_t tmpaddr;
312 lb_me_t *l;
313
314 /*
315 * Satisfy lb_me_t allocations from the freelist
316 * first if possible.
317 */
318 if ((sz == sizeof (lb_me_t)) && lfreelist) {
319 l = lfreelist;
320 lfreelist = lfreelist->l_next;
321 return ((caddr_t)l);
322 }
323 if (elogbuffer < logbuf_curptr + sz) {
324 caddr_t np;
325 size_t nsz;
326
327 /*
328 * Out of space in current chunk - try to add another.
329 */
330 if (logbuffer == logbuffer_min) {
331 np = LOGBUF_BASEADDR;
332 } else {
333 np = elogbuffer;
334 }
335 nsz = roundup(sz, PAGESIZE);
336 if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
337 return ((caddr_t)NULL);
338 }
339
340 np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
341 if (np == (caddr_t)NULL) {
342 return ((caddr_t)NULL);
343 }
344 if (logbuffer == logbuffer_min)
345 logbuffer = LOGBUF_BASEADDR;
346 logbuf_curptr = np;
347 elogbuffer = logbuf_curptr + nsz;
348 }
349
350 tmpaddr = logbuf_curptr;
351 logbuf_curptr += sz;
352 bzero(tmpaddr, sz);
353 return (tmpaddr);
354 }
355
356 static int32_t
lufs_read_log(int32_t addr,caddr_t va,int nb)357 lufs_read_log(int32_t addr, caddr_t va, int nb)
358 {
359 int i, fastpath = 0;
360 daddr_t pblk, lblk;
361 sect_trailer_t *st;
362 uint32_t ident;
363
364 /*
365 * Fast path for skipping the read if no target buffer
366 * is specified. Don't do this for the initial scan.
367 */
368 if (ufs_is_lufs && (va == (caddr_t)NULL))
369 fastpath = 1;
370
371 while (nb) {
372 /* log wraparound check */
373 if (addr == odi.od_eol_lof)
374 addr = odi.od_bol_lof;
375 if (fastpath)
376 goto read_done;
377
378 /*
379 * Translate logically-contiguous log offsets into physical
380 * block numbers. For a log consisting of a single extent:
381 * pbno = btodb(addr) - extents[0].lbno;
382 * Otherwise, search for the extent which contains addr.
383 */
384 pblk = 0;
385 lblk = btodb(addr);
386 for (i = 0; i < eb->nextents; i++) {
387 if (lblk >= eb->extents[i].lbno &&
388 lblk < eb->extents[i].lbno +
389 eb->extents[i].nbno) {
390 pblk = lblk - eb->extents[i].lbno +
391 eb->extents[i].pbno;
392 break;
393 }
394 }
395
396 if (pblk == 0) {
397 /*
398 * block #0 can never be in a log extent since this
399 * block always contains the primary superblock copy.
400 */
401 dprintf("No log extent found for log offset 0x%llx.\n",
402 addr);
403 return (0);
404 }
405
406 /*
407 * Check whether the block we want is cached from the last
408 * read. If not, read it in now.
409 */
410 if (logfp->fi_blocknum != pblk) {
411 logfp->fi_blocknum = pblk;
412 logfp->fi_memp = logfp->fi_buf;
413 logfp->fi_count = DEV_BSIZE;
414 logfp->fi_offset = 0;
415 if (diskread(logfp)) {
416 dprintf("I/O error reading the ufs log" \
417 " at block 0x%x.\n",
418 logfp->fi_blocknum);
419 return (0);
420 }
421 /*
422 * Log structure verification. The block which we just
423 * read has an ident number that must match its offset
424 * in blocks from the head of the log. Since the log
425 * can wrap around, we have to check for that to get the
426 * ident right. Out-of-sequence idents can happen after
427 * power failures, panics during a partial transaction,
428 * media errors, ... - in any case, they mark the end of
429 * the valid part of the log.
430 */
431 st = (sect_trailer_t *)(logfp->fi_memp +
432 LDL_USABLE_BSIZE);
433 /* od_head_ident is where the sequence starts */
434 ident = odi.od_head_ident;
435 if (lblk >= lbtodb(odi.od_head_lof)) {
436 /* no wraparound */
437 ident += (lblk - lbtodb(odi.od_head_lof));
438 } else {
439 /* log wrapped around the end */
440 ident += (lbtodb(odi.od_eol_lof) -
441 lbtodb(odi.od_head_lof));
442 ident += (lblk - lbtodb(odi.od_bol_lof));
443 }
444
445 if (ident != st->st_ident)
446 return (0);
447 }
448 read_done:
449 /*
450 * Copy the delta contents to the destination buffer if
451 * one was specified. Otherwise, just skip the contents.
452 */
453 i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
454 if (va != NULL) {
455 bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
456 va, i);
457 va += i;
458 }
459 nb -= i;
460 addr += i;
461 /*
462 * Skip sector trailer if necessary.
463 */
464 if (NB_LEFT_IN_SECTOR(addr) == 0)
465 addr += sizeof (sect_trailer_t);
466 }
467 return (addr);
468 }
469
470 void
lufs_boot_init(fileid_t * filep)471 lufs_boot_init(fileid_t *filep)
472 {
473 struct fs *sb = (struct fs *)filep->fi_memp;
474 int err = 0;
475
476 /*
477 * boot_ufs_mountroot() should have called us with a
478 * filep pointing to the superblock. Verify that this
479 * is so first.
480 * Then check whether this filesystem has a dirty log.
481 * Also return if lufs support was disabled on request.
482 */
483 if (!lufs_support ||
484 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
485 sb->fs_clean != FSLOG || sb->fs_logbno == 0) {
486 return;
487 }
488
489 if (boothowto & RB_VERBOSE)
490 printf("The boot filesystem is logging.\n");
491
492 /*
493 * The filesystem is logging, there is a log area
494 * allocated for it. Check the log state and determine
495 * whether it'll be possible to use this log.
496 */
497
498 /*
499 * Allocate a private fileid_t for use when reading
500 * from the log.
501 */
502 eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
503 logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
504 logfp->fi_memp = logfp->fi_buf;
505 logfp->fi_devp = filep->fi_devp;
506
507 /*
508 * Read the extent block and verify that what we
509 * find there are actually lufs extents.
510 * Make it simple: the extent block including all
511 * extents cannot be larger than a filesystem block.
512 * So read a whole filesystem block, to make sure
513 * we have read all extents in the same operation.
514 */
515 logfp->fi_blocknum = sb->fs_logbno;
516 logfp->fi_count = sb->fs_bsize;
517 logfp->fi_memp = (caddr_t)eb;
518 logfp->fi_offset = 0;
519 if (diskread(logfp) || eb->type != LUFS_EXTENTS) {
520 dprintf("Failed to read log extent block.\n");
521 err = LOG_IS_ERRORED;
522 goto out;
523 }
524
525 /*
526 * Read the on disk log header. If that fails,
527 * try the backup copy on the adjacent block.
528 */
529 logfp->fi_blocknum = eb->extents[0].pbno;
530 logfp->fi_count = sizeof (ml_odunit_t);
531 logfp->fi_memp = (caddr_t)&odi;
532 logfp->fi_offset = 0;
533 if (diskread(logfp)) {
534 logfp->fi_blocknum = eb->extents[0].pbno + 1;
535 logfp->fi_count = sizeof (ml_odunit_t);
536 logfp->fi_memp = (caddr_t)&odi;
537 logfp->fi_offset = 0;
538 if (diskread(logfp)) {
539 dprintf("Failed to read on-disk log header.\n");
540 err = LOG_IS_ERRORED;
541 goto out;
542 }
543 }
544
545 /*
546 * Verify that we understand this log, and
547 * that the log isn't bad or empty.
548 */
549 if (odi.od_version != LUFS_VERSION_LATEST) {
550 dprintf("On-disk log format v%d != supported format v%d.\n",
551 odi.od_version, LUFS_VERSION_LATEST);
552 err = LOG_IS_ERRORED;
553 } else if (odi.od_badlog) {
554 dprintf("On-disk log is marked bad.\n");
555 err = LOG_IS_ERRORED;
556 } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
557 dprintf("On-disk log checksum %d != ident sum %d.\n",
558 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
559 err = LOG_IS_ERRORED;
560 } else {
561 /*
562 * All consistency checks ok. Scan the log, build the
563 * log hash. If this succeeds we'll be using the log
564 * when reading from this filesystem.
565 */
566 err = lufs_logscan();
567 }
568 out:
569 ufs_is_lufs = 1;
570 switch (err) {
571 case LOG_IS_EMPTY:
572 if (boothowto & RB_VERBOSE)
573 printf("The ufs log is empty and will not be used.\n");
574 lufs_closeall();
575 break;
576 case LOG_IS_OK:
577 if (boothowto & RB_VERBOSE)
578 printf("Using the ufs log.\n");
579 break;
580 case LOG_IS_ERRORED:
581 if (boothowto & RB_VERBOSE)
582 printf("Couldn't build log hash. Can't use ufs log.\n");
583 lufs_closeall();
584 break;
585 default:
586 dprintf("Invalid error %d while scanning the ufs log.\n", err);
587 break;
588 }
589 }
590
591 static int
lufs_logscan_read(int32_t * addr,struct delta * d)592 lufs_logscan_read(int32_t *addr, struct delta *d)
593 {
594 *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
595
596 if (*addr == 0 ||
597 (int)d->d_typ < DT_NONE || d->d_typ > DT_MAX ||
598 d->d_nb >= odi.od_logsize)
599 return (0);
600
601 return (1);
602 }
603
604 static int
lufs_logscan_skip(int32_t * addr,struct delta * d)605 lufs_logscan_skip(int32_t *addr, struct delta *d)
606 {
607 switch (d->d_typ) {
608 case DT_COMMIT:
609 /*
610 * A DT_COMMIT delta has no size as such, but will
611 * always "fill up" the sector that contains it.
612 * The next delta header is found at the beginning
613 * of the next 512-Bytes sector, adjust "addr" to
614 * reflect that.
615 */
616 *addr += ((*addr & (DEV_BSIZE - 1))) ?
617 NB_LEFT_IN_SECTOR(*addr) +
618 sizeof (sect_trailer_t) : 0;
619 return (1);
620 case DT_CANCEL:
621 case DT_ABZERO:
622 /*
623 * These types of deltas occupy no space in the log
624 */
625 return (1);
626 default:
627 /*
628 * Skip over the delta contents.
629 */
630 *addr = lufs_read_log(*addr, NULL, d->d_nb);
631 }
632
633 return (*addr != 0);
634 }
635
636 static void
lufs_logscan_freecancel(void)637 lufs_logscan_freecancel(void)
638 {
639 lb_me_t **lh, *l, *lnext;
640 int i;
641
642 /*
643 * Walk the entire log hash and put cancelled entries
644 * onto the freelist. Corner cases:
645 * a) empty hash chain (*lh == NULL)
646 * b) only one entry in chain, and that is cancelled.
647 * If for every cancelled delta another one would've
648 * been added, this situation couldn't occur, but a
649 * DT_CANCEL delta can lead to this as it is never
650 * added.
651 */
652 for (i = 0; i < LB_HASHSIZE; i++) {
653 lh = &loghash[i];
654 l = *lh;
655 do {
656 if (*lh == (lb_me_t *)NULL)
657 break;
658 lnext = l->l_next;
659 if (l->l_flags & LB_ISCANCELLED) {
660 remlist(lh, l);
661 bzero((caddr_t)l, sizeof (lb_me_t));
662 l->l_next = lfreelist;
663 lfreelist = l;
664 /*
665 * Just removed the hash head. In order not
666 * to terminate the while loop, respin chain
667 * walk for this hash chain.
668 */
669 if (lnext == *lh) {
670 i--;
671 break;
672 }
673 }
674 l = lnext;
675 } while (l != *lh);
676 }
677 }
678
/*
 * Record delta "d", whose header was just read, in the logmap.
 *
 * addr - points to the log offset directly behind the delta header;
 *        it is stored as the entry's l_lof and not modified here.
 * d    - the delta header.
 *
 * DT_COMMIT releases all cancelled entries and starts a new transaction
 * ID. Deltas of the types listed below are entered into the hash chain
 * selected by their d_mof. All other delta types are ignored.
 *
 * Returns 1 on success. Returns 0 if no logmap entry could be allocated;
 * curtid is reset to 0 in that case.
 */
static int
lufs_logscan_addmap(int32_t *addr, struct delta *d)
{
	lb_me_t **lh, *l;

	switch (d->d_typ) {
	case DT_COMMIT:
		/*
		 * Handling DT_COMMIT deltas is special. We need to:
		 * 1. increase the transaction ID
		 * 2. remove cancelled entries.
		 */
		lufs_logscan_freecancel();
		curtid++;
		break;
	case DT_INODE:
		/*
		 * Deltas against parts of on-disk inodes are
		 * assumed to be timestamps. Ignore those.
		 */
		if (d->d_nb != sizeof (struct dinode))
			break;
		/* FALLTHROUGH */
	case DT_CANCEL:
	case DT_ABZERO:
	case DT_AB:
	case DT_DIR:
	case DT_FBI:
		/*
		 * These types of deltas contain and/or modify structural
		 * information that is needed for booting the system:
		 * - where to find a file (DT_DIR, DT_FBI)
		 * - the file itself (DT_INODE)
		 * - data blocks associated with a file (DT_AB, DT_ABZERO)
		 *
		 * Building the hash chains becomes complicated because there
		 * may exist an older (== previously added) entry that overlaps
		 * with the one we want to add.
		 * Four cases must be distinguished:
		 * 1. The new delta is an exact match for an existing one,
		 *    or is a superset of an existing one, and both
		 *    belong to the same transaction.
		 *    The new delta completely supersedes the old one, so
		 *    remove that and reuse the structure for the new.
		 *    Then add the new delta to the head of the hashchain.
		 * 2. The new delta is an exact match for an existing one,
		 *    or is a superset of an existing one, but the two
		 *    belong to different transactions (i.e. the old one is
		 *    committed).
		 *    The existing one is marked to be cancelled when the
		 *    next DT_COMMIT record is found, and the hash chain
		 *    walk is continued as there may be more existing entries
		 *    found which overlap the new delta (happens if that is
		 *    a superset of those in the log).
		 *    Once no more overlaps are found, goto 4.
		 * 3. An existing entry completely covers the new one.
		 *    The new delta is then added directly before this
		 *    existing one.
		 * 4. No (more) overlaps with existing entries are found.
		 *    Unless this is a DT_CANCEL delta, whose only purpose
		 *    is already handled by marking overlapping entries for
		 *    cancellation, add the new delta at the hash chain head.
		 *
		 * This strategy makes sure that the hash chains are properly
		 * ordered. lufs_merge_deltas() walks the hash chain backward,
		 * which then ensures that delta merging is done in the same
		 * order as those deltas occur in the log - remember, the
		 * log can only be read in one direction.
		 *
		 * NOTE(review): inslist() as defined in this file links an
		 * entry in directly in front of *lh without moving *lh. The
		 * "head" in cases 1 and 4 is therefore the position that
		 * lufs_merge_deltas() visits first on its backward walk
		 * from *lh - confirm this is the intended ordering.
		 */
		lh = &loghash[LB_HASHFUNC(d->d_mof)];
		l = *lh;
		do {
			/* Empty hash chain: nothing to compare against. */
			if (l == (lb_me_t *)NULL)
				break;
			/*
			 * This covers the first two cases above.
			 * If this is a perfect match from the same transaction,
			 * and it isn't already cancelled, we simply replace it
			 * with its newer incarnation.
			 * Otherwise, mark it for cancellation. Handling of
			 * DT_COMMIT is going to remove it, then.
			 */
			if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
				if (!(l->l_flags & LB_ISCANCELLED)) {
					if (l->l_tid == curtid &&
					    d->d_typ != DT_CANCEL) {
						/*
						 * 1st case - reuse the entry
						 * and move it to the insert
						 * position of the chain.
						 */
						remlist(lh, l);
						l->l_mof = d->d_mof;
						l->l_lof = *addr;
						l->l_nb = d->d_nb;
						l->l_typ = d->d_typ;
						l->l_flags = 0;
						l->l_tid = curtid;
						inslist(lh, l);
						return (1);
					} else {
						/*
						 * 2nd case - cancel only.
						 */
						l->l_flags |= LB_ISCANCELLED;
					}
				}
			} else if (WITHIN(d->d_mof, d->d_nb,
			    l->l_mof, l->l_nb)) {
				/*
				 * This is the third case above.
				 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
				 * this may happen - an existing previous delta
				 * is larger than the current one we're planning
				 * to add - DT_ABZERO deltas are supersets of
				 * DT_AB deltas, and likewise DT_FBI/DT_DIR.
				 * In order to do merging correctly, such deltas
				 * put up a barrier for new ones that overlap,
				 * and we have to add the new delta immediately
				 * before (!) the existing one.
				 *
				 * NOTE(review): a DT_CANCEL delta that is
				 * strictly contained in an existing entry also
				 * ends up here and is entered into the map,
				 * although DT_CANCEL occupies no log space
				 * (see lufs_logscan_skip()). Confirm that
				 * lufs_merge_deltas() cannot end up reading a
				 * payload for such an entry.
				 */
				lb_me_t *newl;
				newl = lufs_alloc_me();
				if (newl == (lb_me_t *)NULL) {
					/*
					 * No memory. Throw away everything
					 * and try booting without logging
					 * support.
					 */
					curtid = 0;
					return (0);
				}
				newl->l_mof = d->d_mof;
				newl->l_lof = *addr; /* "payload" address */
				newl->l_nb = d->d_nb;
				newl->l_typ = d->d_typ;
				newl->l_tid = curtid;
				/* Link newl in between l's predecessor and l. */
				newl->l_prev = l->l_prev;
				newl->l_next = l;
				l->l_prev->l_next = newl;
				l->l_prev = newl;
				/* Inserted in front of the head: newl is head now. */
				if (*lh == l)
					*lh = newl;
				return (1);
			}
			l = l->l_next;
		} while (l != *lh);

		/*
		 * This is case 4., add a new delta at the head of the chain.
		 *
		 * If the new delta is a DT_CANCEL entry, we handled it by
		 * marking everything it covered for cancellation. We can
		 * get by without actually adding the delta itself to the
		 * hash, as it'd need to be removed by the commit code anyway.
		 */
		if (d->d_typ == DT_CANCEL)
			break;

		l = lufs_alloc_me();
		if (l == (lb_me_t *)NULL) {
			/*
			 * No memory. Throw away everything
			 * and try booting without logging
			 * support.
			 */
			curtid = 0;
			return (0);
		}
		l->l_mof = d->d_mof;
		l->l_lof = *addr; /* this is the "payload" address */
		l->l_nb = d->d_nb;
		l->l_typ = d->d_typ;
		l->l_tid = curtid;
		inslist(lh, l);
		break;
	default:
		/* Delta types that are of no interest for booting. */
		break;
	}
	return (1);
}
856
857 static int
lufs_logscan_prescan(void)858 lufs_logscan_prescan(void)
859 {
860 /*
861 * Simulate a full log by setting the tail to be one sector
862 * behind the head. This will make the logscan read all
863 * of the log until an out-of-sequence sector ident is
864 * found.
865 */
866 odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
867 if (odi.od_tail_lof < odi.od_bol_lof)
868 odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
869 if (odi.od_tail_lof >= odi.od_eol_lof)
870 odi.od_tail_lof = odi.od_bol_lof;
871
872 /*
873 * While sector trailers maintain TID values, od_head_tid
874 * is not being updated by the kernel ufs logging support
875 * at this time. We therefore count transactions ourselves
876 * starting at zero - as does the kernel ufs logscan code.
877 */
878 curtid = 0;
879
880 if (!lufs_alloc_logbuf()) {
881 dprintf("Failed to allocate log buffer.\n");
882 return (0);
883 }
884
885 loghash = (lb_me_t **)lufs_alloc_from_logbuf(
886 LB_HASHSIZE * sizeof (lb_me_t *));
887 if (loghash == (lb_me_t **)NULL) {
888 dprintf("Can't allocate loghash[] array.");
889 return (0);
890 }
891 return (1);
892 }
893
894 /*
895 * This function must remove all uncommitted entries (l->l_tid == curtid)
896 * from the log hash. Doing this, we implicitly delete pending cancellations
897 * as well.
898 * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
899 * the check for entries that need to be removed is different.
900 */
901 static void
lufs_logscan_postscan(void)902 lufs_logscan_postscan(void)
903 {
904 lb_me_t **lh, *l, *lnext;
905 int i;
906
907 for (i = 0; i < LB_HASHSIZE; i++) {
908 lh = &loghash[i];
909 l = *lh;
910 do {
911 if (l == (lb_me_t *)NULL)
912 break;
913 lnext = l->l_next;
914 if (l->l_tid == curtid) {
915 remlist(lh, l);
916 bzero((caddr_t)l, sizeof (lb_me_t));
917 l->l_next = lfreelist;
918 lfreelist = l;
919 if (*lh == (lb_me_t *)NULL)
920 break;
921 /*
922 * Just removed the hash head. In order not
923 * to terminate the while loop, respin chain
924 * walk for this hash chain.
925 */
926 if (lnext == *lh) {
927 i--;
928 break;
929 }
930 } else {
931 l->l_flags &= ~(LB_ISCANCELLED);
932 }
933 l = lnext;
934 } while (l != *lh);
935 }
936 }
937
938 /*
939 * This function builds the log hash. It performs the same sequence
940 * of actions at logscan as the kernel ufs logging support:
941 * - Prepare the log for scanning by simulating a full log.
942 * - As long as sectors read from the log have contiguous idents, do:
943 * read the delta header
944 * add the delta to the logmap
945 * skip over the contents to the start of the next delta header
946 * - After terminating the scan, remove uncommitted entries.
947 *
948 * This function cannot fail except if mapping the logbuffer area
949 * during lufs_logscan_prescan() fails. If there is a structural
950 * integrity problem and the on-disk log cannot be read, we'll
951 * treat this as the same situation as an uncommitted transaction
952 * at the end of the log (or, corner case of that, an empty log
953 * with no committed transactions in it at all).
954 *
955 */
956 static int
lufs_logscan(void)957 lufs_logscan(void)
958 {
959 int32_t addr;
960 struct delta d;
961
962 if (!lufs_logscan_prescan())
963 return (LOG_IS_ERRORED);
964
965 addr = odi.od_head_lof;
966
967 /*
968 * Note that addr == od_tail_lof means a completely filled
969 * log. This almost never happens, so the common exit path
970 * from this loop is via one of the 'break's.
971 */
972 while (addr != odi.od_tail_lof) {
973 if (!lufs_logscan_read(&addr, &d))
974 break;
975 if (!lufs_logscan_addmap(&addr, &d))
976 return (LOG_IS_ERRORED);
977 if (!lufs_logscan_skip(&addr, &d))
978 break;
979 }
980
981 lufs_logscan_postscan();
982 /*
983 * Check whether the log contains data, and if so whether
984 * it contains committed data.
985 */
986 if (addr == odi.od_head_lof || curtid == 0) {
987 return (LOG_IS_EMPTY);
988 }
989 return (LOG_IS_OK);
990 }
991
992 /*
993 * A metadata block was read from disk. Check whether the logmap
994 * has a delta against this byte range, and if so read it in, since
995 * the data in the log is more recent than what was read from other
996 * places on the disk.
997 */
998 void
lufs_merge_deltas(fileid_t * fp)999 lufs_merge_deltas(fileid_t *fp)
1000 {
1001 int nb;
1002 int64_t bof;
1003 lb_me_t **lh, *l;
1004 int32_t skip;
1005
1006 /*
1007 * No logmap: Empty log. Nothing to do here.
1008 */
1009 if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
1010 return;
1011
1012 bof = ldbtob(fp->fi_blocknum);
1013 nb = fp->fi_count;
1014
1015 /*
1016 * Search the log hash.
1017 * Merge deltas if an overlap is found.
1018 */
1019
1020 lh = &loghash[LB_HASHFUNC(bof)];
1021
1022 if (*lh == (lb_me_t *)NULL)
1023 return;
1024
1025 l = *lh;
1026
1027 do {
1028 l = l->l_prev;
1029 if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
1030 /*
1031 * Found a delta in the log hash which overlaps
1032 * with the current metadata block. Read the
1033 * actual delta payload from the on-disk log
1034 * directly into the file buffer.
1035 */
1036 if (l->l_typ != DT_ABZERO) {
1037 /*
1038 * We have to actually read this part of the
1039 * log as it could contain a sector trailer, or
1040 * wrap around the end of the log.
1041 * If it did, the second offset generation would
1042 * be incorrect if we'd started at l->l_lof.
1043 */
1044 if (!(skip = lufs_read_log(l->l_lof, NULL,
1045 MAX(bof - l->l_mof, 0))))
1046 dprintf("scan/merge error, pre-skip\n");
1047 if (!(skip = lufs_read_log(skip,
1048 fp->fi_memp + MAX(l->l_mof - bof, 0),
1049 MIN(l->l_mof + l->l_nb, bof + nb) -
1050 MAX(l->l_mof, bof))))
1051 dprintf("scan/merge error, merge\n");
1052 } else {
1053 /*
1054 * DT_ABZERO requires no disk access, just
1055 * clear the byte range which overlaps with
1056 * the delta.
1057 */
1058 bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
1059 MIN(l->l_mof + l->l_nb, bof + nb) -
1060 MAX(l->l_mof, bof));
1061 }
1062 }
1063 } while (l->l_prev != (*lh)->l_prev);
1064
1065 printf("*\b");
1066 }
1067
1068 void
lufs_closeall(void)1069 lufs_closeall(void)
1070 {
1071 if (ufs_is_lufs) {
1072 bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
1073 bkmem_free((char *)logfp, sizeof (fileid_t));
1074 eb = (extent_block_t *)NULL;
1075 bzero((caddr_t)&odi, sizeof (ml_odunit_t));
1076 logfp = (fileid_t *)NULL;
1077 lufs_free_logbuf();
1078 ufs_is_lufs = 0;
1079 }
1080 }
1081