1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <sys/param.h>
29 #include <sys/vnode.h>
30 #include <sys/fs/ufs_fsdir.h>
31 #include <sys/fs/ufs_fs.h>
32 #include <sys/fs/ufs_inode.h>
33 #include <sys/fs/ufs_log.h>
34 #include <sys/sysmacros.h>
35 #include <sys/promif.h>
36 #include <sys/machparam.h>
37
38 #include <sys/stat.h>
39 #include <sys/bootdebug.h>
40 #include <sys/salib.h>
41 #include <sys/saio.h>
42 #include <sys/filep.h>
43
44
45 /*
46 * Big theory statement on how ufsboot makes use of the log
47 * in case the filesystem wasn't shut down cleanly.
48 *
49 * The structure of the ufs on-disk log looks like this:
50 *
51 * +-----------------+
52 * | SUPERBLOCK |
53 * | ... |
54 * | fs_logbno +--> +-----------------------+
55 * | ... | | EXTENT BLOCK |
56 * +-----------------+ | ... |
57 * | nextents |
58 * +----------------------+ extents[0].pbno |
59 * | | { extents[1].pbno } +------------+
60 * | | ... +--> ... |
61 * | +-----------------------+ |
62 * v |
63 * +-----------------------------+ \ |
64 * | ON-DISK LOG HEADER | | |
65 * | ... | | |
66 * | od_head_lof +--+ | |
67 * | ... | | | |
68 * +-----------------------------+ <|---|- od_bol_lof |
69 * | sector (may contain deltas) | | | (logical offset) |
70 * | +-------------------------+ | | |
71 * | | trailer (some ident#) | | > extents[0].nbno |
72 * +---+-------------------------+ | | blocks ("sectors") |
73 * . . | | |
74 * . . | | |
75 * +-----------------------------+<-+ | |
76 * | delta1 delta2 delta3 | | |
77 * | d +-------------------------+ | |
78 * | e | ident#: od_head_ident | | |
79 * +---+-------------------------+ / |
80 * |
81 * +-----------------------------+ <---------------------------+
82 * | lta4 delta5 delta6 de |
83 * | l +-------------------------+
84 * | t | ident#: od_head_ident+1 |
85 * +---+-------------------------+
86 * . .
87 * +-----------------------------+
88 * | sector (may contain deltas) |
89 * | +------------------+
90 * | | trailer (ident#) |
91 * +----------+------------------+ <-- od_eol_lof (logical offset)
92 *
93 * The ufs on-disk log has the following properties:
94 *
95 * 1. The log is made up from at least one extent. "fs_logbno" in
96 * the superblock points to where this is found.
97 * 2. Extents describe the logical layout.
98 * - Logical offset 0 is the on-disk log header. It's also
99 * at the beginning of the first physical block.
100 * - If there's more than one extent, the equation holds:
101 * extent[i+1].lbno == extent[i].lbno + extent[i].nbno
102 * i.e. logical offsets form a contiguous sequence. Yet on disk,
103 * two logically-adjacent offsets may be located in two
104 * physically disjoint extents, so logical offsets need to be
105 * translated into physical disk block addresses for access.
106 * - Various fields in the on-disk log header structure refer
107 * to such logical log offsets.
 * 3. The actual logical logspace begins after the log header, at
 *    the logical offset indicated by "od_bol_lof". Every 512 bytes
 *    (a "sector" in terms of ufs logging) end with a sector trailer
 *    which contains a sequence number, the sector ident.
112 * 4. Deltas are packed tight in the remaining space, i.e. a delta
113 * may be part of more than one sector. Reads from the logspace
114 * must be split at sector boundaries, since the trailer is never
115 * part of a delta. Delta sizes vary.
116 * 5. The field "od_head_lof" points to the start of the dirty part
117 * of the log, i.e. to the first delta header. Likewise, "od_head_ident"
118 * is the sequence number where the valid part of the log starts; if
119 * the sector pointed to by "od_head_lof" has a sector ident different
120 * from "od_head_ident", the log is empty.
 * 6. The valid part of the log extends for as many sectors as their ident
 *    numbers form a contiguous sequence. When reaching the logical end of
 *    the log, "od_eol_lof", logical offsets wrap around to "od_bol_lof",
 *    i.e. the log forms a circular buffer.
125 *
126 * For the strategy how to handle accessing the log, item 4. is the
127 * most important one - its consequence is that the log can only be
128 * read in one direction - forward, starting at the head.
129 *
 * The task of identifying whether a given metadata block is
 * actually in the log therefore requires reading the entire
 * log. Doing so is memory-efficient but kills speed if re-done
 * at every metadata read (64MB log size vs. 512 byte metadata
 * block size: about 128K times as much I/O, possibly only to
 * find out that this block was not in the log ...).
136 *
137 * First thought to speed this up is to let ufsboot roll the log.
138 * But this is not possible because:
139 * - ufsboot currently does not implement any write functionality,
140 * the boot-time ufs implementation is read-only.
141 * - firmware write interfaces may or may not be available, in any
142 * case, they're rarely used and untested for such a purpose.
143 * - that would duplicate a lot of code, since at the moment only
144 * kernel ufs logging implements log rolling.
145 * - the boot environment cannot be considered high-performance;
146 * rolling the log there would be slow.
147 * - boot device and root device could well be different, creating
148 * inconsistencies e.g. with a mirrored root if the log is rolled.
149 *
150 * Therefore, caching the log structural information (boot-relevant
151 * deltas and their logical log offset) is required for fast access
152 * to the data in the log. This code builds a logmap for that purpose.
153 *
154 * As a simple optimization, if we find the log is empty, we will not
155 * use it - log reader support for ufsboot has no noticeable overhead
156 * for clean logs, or for root filesystems that aren't logging.
157 */
158
/*
 * Geometry of the log hash: LB_HASHSIZE buckets, selected by
 * LB_HASHFUNC() from the bits of a delta's disk address that
 * lie directly above the low LB_HASHSHIFT bits.
 */
enum {
	LB_HASHSHIFT = 13,
	LB_HASHSIZE = 1 << LB_HASHSHIFT
};
#define	LB_HASHFUNC(mof)	(((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1))

/*
 * Size limits (in bytes) of the buffer holding the log hash and
 * its entries.
 */
enum {
	LOGBUF_MAXSIZE = 8 * 1024 * 1024,
	LOGBUF_MINSIZE = 256 * 1024
};

/*
 * Outcome of the log scan, as returned by lufs_logscan() and
 * evaluated by lufs_boot_init().
 */
enum {
	LOG_IS_EMPTY = 0,
	LOG_IS_OK = 1,
	LOG_IS_ERRORED = 2
};
169
170 /*
171 * We build a hashed logmap of those while scanning the log.
172 * sizeof(lb_map_t) is 40 on 64bit, 32 on 32bit; the max sized
173 * resalloc'ed buffer can accomodate around ~500k of those;
174 * this is approximately the maximum amount of deltas we'll
175 * see if a 64MB ufs log is completely filled. We'll make no
176 * attempt to free and reallocate the resalloc'ed buffer if
177 * we overflow, as conservative sizing should make that an
178 * impossibility. A future enhancement may allocate memory
179 * here as needed - once the boot time memory allocator
180 * supports that.
181 */
182 typedef struct lb_mapentry {
183 struct lb_mapentry *l_next; /* hash chaining */
184 struct lb_mapentry *l_prev; /* hash chaining */
185 int64_t l_mof; /* disk addr this delta is against */
186 int16_t l_nb; /* size of delta */
187 int16_t l_flags;
188 int32_t l_lof; /* log offset for delta header */
189 int32_t l_tid; /* transaction this delta is part of */
190 delta_t l_typ; /* see <sys/fs/ufs_trans.h> */
191 } lb_me_t;
192
193 #define LB_ISCANCELLED 1
194
/*
 * inslist(lh, l): link entry 'l' into the circular, doubly-linked
 * chain whose head pointer is '*lh'.
 *
 * If the chain is empty, 'l' becomes its only member and the head.
 * Otherwise 'l' is linked directly before the current head (i.e. as
 * the head's l_prev) and '*lh' is left unchanged.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves
 * like a single statement, e.g. as the unbraced body of an if/else.
 * Both arguments are evaluated more than once; do not pass
 * expressions with side effects.
 */
#define	inslist(lh, l)	do { \
	if ((*(lh))) { \
		(*(lh))->l_prev->l_next = (l); \
		(l)->l_next = (*(lh)); \
		(l)->l_prev = (*(lh))->l_prev; \
		(*(lh))->l_prev = (l); \
	} else { \
		(l)->l_next = (l); \
		(l)->l_prev = (l); \
		(*(lh)) = (l); \
	} \
} while (0)
205
/*
 * remlist(lh, l): unlink entry 'l' from the circular, doubly-linked
 * chain whose head pointer is '*lh'.
 *
 * If 'l' is the only member, the chain becomes empty ('*lh' is set to
 * NULL); a debug message is emitted if the head pointer or l_prev do
 * not agree with that. Otherwise 'l' is unlinked from its neighbours,
 * and if it was the head, its successor becomes the new head.
 * The links inside 'l' itself are not modified.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves
 * like a single statement, e.g. as the unbraced body of an if/else.
 * Both arguments are evaluated more than once; do not pass
 * expressions with side effects.
 */
#define	remlist(lh, l)	do { \
	if ((l)->l_next == (l)) { \
		if (*(lh) != (l) || (l)->l_prev != (l)) \
			dprintf("Logmap hash inconsistency.\n"); \
		*(lh) = (lb_me_t *)NULL; \
	} else { \
		if (*(lh) == (l)) \
			*(lh) = (l)->l_next; \
		(l)->l_prev->l_next = (l)->l_next; \
		(l)->l_next->l_prev = (l)->l_prev; \
	} \
} while (0)
217
/*
 * Allocate one logmap entry from the log buffer (or its freelist).
 * Evaluates to NULL if lufs_alloc_from_logbuf() fails.
 * The expansion is fully parenthesized so that the cast applies to
 * the call result even when the macro is used inside a larger
 * expression.
 */
#define	lufs_alloc_me()	\
	((lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)))
220
221 extern int boothowto;
222 static int ufs_is_lufs = 0;
223 static fileid_t *logfp = (fileid_t *)NULL;
224 static extent_block_t *eb = (extent_block_t *)NULL;
225 static ml_odunit_t odi;
226
227 static char logbuffer_min[LOGBUF_MINSIZE];
228 static caddr_t logbuffer = (caddr_t)NULL;
229 static caddr_t elogbuffer = (caddr_t)NULL;
230 static caddr_t logbuf_curptr;
231 static lb_me_t **loghash = (lb_me_t **)NULL;
232 static lb_me_t *lfreelist;
233
234 static uint32_t curtid;
235
236
237 int lufs_support = 1;
238
239 void lufs_boot_init(fileid_t *);
240 void lufs_closeall(void);
241 void lufs_merge_deltas(fileid_t *);
242
243 static int lufs_logscan(void);
244
245 extern int diskread(fileid_t *filep);
246 extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int);
247
248 #if defined(__sparcv9)
249 #define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE))
250 #endif
251
252 static int
lufs_alloc_logbuf(void)253 lufs_alloc_logbuf(void)
254 {
255 /*
256 * Allocate memory for caching the log. Since the logbuffer can
257 * potentially exceed the boot scratch memory limit, we use resalloc
258 * directly, passing the allocation to the low-level boot-time
259 * backend allocator. The chosen VA range is the top end of
260 * the kernel's segmap segment, so we're not interfering
261 * with the kernel because segmap is created at a time when
262 * the 2nd-stage boot has already been unloaded and this VA
263 * range was given back.
264 *
265 * On sparc platforms, the kernel cannot recover the memory
266 * obtained from resalloc because the page structs are allocated
267 * before the call to BOP_QUIESCE. To avoid leaking this
268 * memory, the logbuffer is allocated from a small bss array
269 * that should hold the logmap except in the most extreme cases.
270 * If the bss array is too small, the logbuffer is extended
271 * from resalloc 1 page at a time.
272 */
273
274 logbuffer = logbuffer_min;
275 elogbuffer = logbuffer+LOGBUF_MINSIZE;
276 logbuf_curptr = logbuffer;
277 lfreelist = (lb_me_t *)NULL;
278
279 if (logbuffer == (caddr_t)NULL)
280 return (0);
281
282 dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n",
283 logbuffer, elogbuffer-logbuffer);
284
285 return (1);
286 }
287
288 static void
lufs_free_logbuf()289 lufs_free_logbuf()
290 {
291 /*
292 * Solaris/x86 has no prom_free() routine at this time.
293 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86
294 * is done by the kernel startup itself, in hat_unload_prom()
295 * after the bootloader has been quiesced.
296 *
297 * Solaris on sparc has a prom_free() routine that will update
298 * the memlist properties to reflect the freeing of the
299 * logbuffer. However, the sparc kernel cannot recover
300 * the memory freed after the call to BOP_QUIESCE as the
301 * page struct have already been allocated. We call
302 * prom_free anyway so that the kernel can reclaim this
303 * memory in the future.
304 */
305 if (logbuffer == LOGBUF_BASEADDR)
306 prom_free(logbuffer, elogbuffer-logbuffer);
307 logbuffer = (caddr_t)NULL;
308 }
309
310 static caddr_t
lufs_alloc_from_logbuf(size_t sz)311 lufs_alloc_from_logbuf(size_t sz)
312 {
313 caddr_t tmpaddr;
314 lb_me_t *l;
315
316 /*
317 * Satisfy lb_me_t allocations from the freelist
318 * first if possible.
319 */
320 if ((sz == sizeof (lb_me_t)) && lfreelist) {
321 l = lfreelist;
322 lfreelist = lfreelist->l_next;
323 return ((caddr_t)l);
324 }
325 if (elogbuffer < logbuf_curptr + sz) {
326 caddr_t np;
327 size_t nsz;
328
329 /*
330 * Out of space in current chunk - try to add another.
331 */
332 if (logbuffer == logbuffer_min) {
333 np = LOGBUF_BASEADDR;
334 } else {
335 np = elogbuffer;
336 }
337 nsz = roundup(sz, PAGESIZE);
338 if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) {
339 return ((caddr_t)NULL);
340 }
341
342 np = resalloc(RES_CHILDVIRT, nsz, np, 0UL);
343 if (np == (caddr_t)NULL) {
344 return ((caddr_t)NULL);
345 }
346 if (logbuffer == logbuffer_min)
347 logbuffer = LOGBUF_BASEADDR;
348 logbuf_curptr = np;
349 elogbuffer = logbuf_curptr + nsz;
350 }
351
352 tmpaddr = logbuf_curptr;
353 logbuf_curptr += sz;
354 bzero(tmpaddr, sz);
355 return (tmpaddr);
356 }
357
358 static int32_t
lufs_read_log(int32_t addr,caddr_t va,int nb)359 lufs_read_log(int32_t addr, caddr_t va, int nb)
360 {
361 int i, fastpath = 0;
362 daddr_t pblk, lblk;
363 sect_trailer_t *st;
364 uint32_t ident;
365
366 /*
367 * Fast path for skipping the read if no target buffer
368 * is specified. Don't do this for the initial scan.
369 */
370 if (ufs_is_lufs && (va == (caddr_t)NULL))
371 fastpath = 1;
372
373 while (nb) {
374 /* log wraparound check */
375 if (addr == odi.od_eol_lof)
376 addr = odi.od_bol_lof;
377 if (fastpath)
378 goto read_done;
379
380 /*
381 * Translate logically-contiguous log offsets into physical
382 * block numbers. For a log consisting of a single extent:
383 * pbno = btodb(addr) - extents[0].lbno;
384 * Otherwise, search for the extent which contains addr.
385 */
386 pblk = 0;
387 lblk = btodb(addr);
388 for (i = 0; i < eb->nextents; i++) {
389 if (lblk >= eb->extents[i].lbno &&
390 lblk < eb->extents[i].lbno +
391 eb->extents[i].nbno) {
392 pblk = lblk - eb->extents[i].lbno +
393 eb->extents[i].pbno;
394 break;
395 }
396 }
397
398 if (pblk == 0) {
399 /*
400 * block #0 can never be in a log extent since this
401 * block always contains the primary superblock copy.
402 */
403 dprintf("No log extent found for log offset 0x%llx.\n",
404 addr);
405 return (0);
406 }
407
408 /*
409 * Check whether the block we want is cached from the last
410 * read. If not, read it in now.
411 */
412 if (logfp->fi_blocknum != pblk) {
413 logfp->fi_blocknum = pblk;
414 logfp->fi_memp = logfp->fi_buf;
415 logfp->fi_count = DEV_BSIZE;
416 logfp->fi_offset = 0;
417 if (diskread(logfp)) {
418 dprintf("I/O error reading the ufs log" \
419 " at block 0x%x.\n",
420 logfp->fi_blocknum);
421 return (0);
422 }
423 /*
424 * Log structure verification. The block which we just
425 * read has an ident number that must match its offset
426 * in blocks from the head of the log. Since the log
427 * can wrap around, we have to check for that to get the
428 * ident right. Out-of-sequence idents can happen after
429 * power failures, panics during a partial transaction,
430 * media errors, ... - in any case, they mark the end of
431 * the valid part of the log.
432 */
433 st = (sect_trailer_t *)(logfp->fi_memp +
434 LDL_USABLE_BSIZE);
435 /* od_head_ident is where the sequence starts */
436 ident = odi.od_head_ident;
437 if (lblk >= lbtodb(odi.od_head_lof)) {
438 /* no wraparound */
439 ident += (lblk - lbtodb(odi.od_head_lof));
440 } else {
441 /* log wrapped around the end */
442 ident += (lbtodb(odi.od_eol_lof) -
443 lbtodb(odi.od_head_lof));
444 ident += (lblk - lbtodb(odi.od_bol_lof));
445 }
446
447 if (ident != st->st_ident)
448 return (0);
449 }
450 read_done:
451 /*
452 * Copy the delta contents to the destination buffer if
453 * one was specified. Otherwise, just skip the contents.
454 */
455 i = MIN(NB_LEFT_IN_SECTOR(addr), nb);
456 if (va != NULL) {
457 bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))),
458 va, i);
459 va += i;
460 }
461 nb -= i;
462 addr += i;
463 /*
464 * Skip sector trailer if necessary.
465 */
466 if (NB_LEFT_IN_SECTOR(addr) == 0)
467 addr += sizeof (sect_trailer_t);
468 }
469 return (addr);
470 }
471
472 void
lufs_boot_init(fileid_t * filep)473 lufs_boot_init(fileid_t *filep)
474 {
475 struct fs *sb = (struct fs *)filep->fi_memp;
476 int err = 0;
477
478 /*
479 * boot_ufs_mountroot() should have called us with a
480 * filep pointing to the superblock. Verify that this
481 * is so first.
482 * Then check whether this filesystem has a dirty log.
483 * Also return if lufs support was disabled on request.
484 */
485 if (!lufs_support ||
486 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs ||
487 sb->fs_clean != FSLOG || sb->fs_logbno == NULL) {
488 return;
489 }
490
491 if (boothowto & RB_VERBOSE)
492 printf("The boot filesystem is logging.\n");
493
494 /*
495 * The filesystem is logging, there is a log area
496 * allocated for it. Check the log state and determine
497 * whether it'll be possible to use this log.
498 */
499
500 /*
501 * Allocate a private fileid_t for use when reading
502 * from the log.
503 */
504 eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize);
505 logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t));
506 logfp->fi_memp = logfp->fi_buf;
507 logfp->fi_devp = filep->fi_devp;
508
509 /*
510 * Read the extent block and verify that what we
511 * find there are actually lufs extents.
512 * Make it simple: the extent block including all
513 * extents cannot be larger than a filesystem block.
514 * So read a whole filesystem block, to make sure
515 * we have read all extents in the same operation.
516 */
517 logfp->fi_blocknum = sb->fs_logbno;
518 logfp->fi_count = sb->fs_bsize;
519 logfp->fi_memp = (caddr_t)eb;
520 logfp->fi_offset = 0;
521 if (diskread(logfp) || eb->type != LUFS_EXTENTS) {
522 dprintf("Failed to read log extent block.\n");
523 err = LOG_IS_ERRORED;
524 goto out;
525 }
526
527 /*
528 * Read the on disk log header. If that fails,
529 * try the backup copy on the adjacent block.
530 */
531 logfp->fi_blocknum = eb->extents[0].pbno;
532 logfp->fi_count = sizeof (ml_odunit_t);
533 logfp->fi_memp = (caddr_t)&odi;
534 logfp->fi_offset = 0;
535 if (diskread(logfp)) {
536 logfp->fi_blocknum = eb->extents[0].pbno + 1;
537 logfp->fi_count = sizeof (ml_odunit_t);
538 logfp->fi_memp = (caddr_t)&odi;
539 logfp->fi_offset = 0;
540 if (diskread(logfp)) {
541 dprintf("Failed to read on-disk log header.\n");
542 err = LOG_IS_ERRORED;
543 goto out;
544 }
545 }
546
547 /*
548 * Verify that we understand this log, and
549 * that the log isn't bad or empty.
550 */
551 if (odi.od_version != LUFS_VERSION_LATEST) {
552 dprintf("On-disk log format v%d != supported format v%d.\n",
553 odi.od_version, LUFS_VERSION_LATEST);
554 err = LOG_IS_ERRORED;
555 } else if (odi.od_badlog) {
556 dprintf("On-disk log is marked bad.\n");
557 err = LOG_IS_ERRORED;
558 } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) {
559 dprintf("On-disk log checksum %d != ident sum %d.\n",
560 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident);
561 err = LOG_IS_ERRORED;
562 } else {
563 /*
564 * All consistency checks ok. Scan the log, build the
565 * log hash. If this succeeds we'll be using the log
566 * when reading from this filesystem.
567 */
568 err = lufs_logscan();
569 }
570 out:
571 ufs_is_lufs = 1;
572 switch (err) {
573 case LOG_IS_EMPTY:
574 if (boothowto & RB_VERBOSE)
575 printf("The ufs log is empty and will not be used.\n");
576 lufs_closeall();
577 break;
578 case LOG_IS_OK:
579 if (boothowto & RB_VERBOSE)
580 printf("Using the ufs log.\n");
581 break;
582 case LOG_IS_ERRORED:
583 if (boothowto & RB_VERBOSE)
584 printf("Couldn't build log hash. Can't use ufs log.\n");
585 lufs_closeall();
586 break;
587 default:
588 dprintf("Invalid error %d while scanning the ufs log.\n", err);
589 break;
590 }
591 }
592
593 static int
lufs_logscan_read(int32_t * addr,struct delta * d)594 lufs_logscan_read(int32_t *addr, struct delta *d)
595 {
596 *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta));
597
598 if (*addr == 0 ||
599 d->d_typ < DT_NONE || d->d_typ > DT_MAX ||
600 d->d_nb >= odi.od_logsize)
601 return (0);
602
603 return (1);
604 }
605
606 static int
lufs_logscan_skip(int32_t * addr,struct delta * d)607 lufs_logscan_skip(int32_t *addr, struct delta *d)
608 {
609 switch (d->d_typ) {
610 case DT_COMMIT:
611 /*
612 * A DT_COMMIT delta has no size as such, but will
613 * always "fill up" the sector that contains it.
614 * The next delta header is found at the beginning
615 * of the next 512-Bytes sector, adjust "addr" to
616 * reflect that.
617 */
618 *addr += ((*addr & (DEV_BSIZE - 1))) ?
619 NB_LEFT_IN_SECTOR(*addr) +
620 sizeof (sect_trailer_t) : 0;
621 return (1);
622 case DT_CANCEL:
623 case DT_ABZERO:
624 /*
625 * These types of deltas occupy no space in the log
626 */
627 return (1);
628 default:
629 /*
630 * Skip over the delta contents.
631 */
632 *addr = lufs_read_log(*addr, NULL, d->d_nb);
633 }
634
635 return (*addr != NULL);
636 }
637
638 static void
lufs_logscan_freecancel(void)639 lufs_logscan_freecancel(void)
640 {
641 lb_me_t **lh, *l, *lnext;
642 int i;
643
644 /*
645 * Walk the entire log hash and put cancelled entries
646 * onto the freelist. Corner cases:
647 * a) empty hash chain (*lh == NULL)
648 * b) only one entry in chain, and that is cancelled.
649 * If for every cancelled delta another one would've
650 * been added, this situation couldn't occur, but a
651 * DT_CANCEL delta can lead to this as it is never
652 * added.
653 */
654 for (i = 0; i < LB_HASHSIZE; i++) {
655 lh = &loghash[i];
656 l = *lh;
657 do {
658 if (*lh == (lb_me_t *)NULL)
659 break;
660 lnext = l->l_next;
661 if (l->l_flags & LB_ISCANCELLED) {
662 remlist(lh, l);
663 bzero((caddr_t)l, sizeof (lb_me_t));
664 l->l_next = lfreelist;
665 lfreelist = l;
666 /*
667 * Just removed the hash head. In order not
668 * to terminate the while loop, respin chain
669 * walk for this hash chain.
670 */
671 if (lnext == *lh) {
672 i--;
673 break;
674 }
675 }
676 l = lnext;
677 } while (l != *lh);
678 }
679 }
680
/*
 * Account for one delta header 'd' in the logmap.
 *
 * addr	points to the log offset directly behind the delta header;
 *	it is only read here and recorded as the entry's l_lof
 * d	the delta header
 *
 * DT_COMMIT ends the current transaction: cancelled entries are freed
 * and curtid is incremented. DT_INODE deltas of a size other than
 * sizeof (struct dinode), and all delta types not listed in the switch,
 * are ignored. For the remaining types the hash chain selected by
 * d_mof is updated as described in the comment below.
 *
 * Returns 1 on success (including ignored deltas), and 0 if no
 * logmap entry could be allocated; curtid is reset to 0 in that case.
 */
static int
lufs_logscan_addmap(int32_t *addr, struct delta *d)
{
	lb_me_t **lh, *l;

	switch (d->d_typ) {
	case DT_COMMIT:
		/*
		 * Handling DT_COMMIT deltas is special. We need to:
		 * 1. increase the transaction ID
		 * 2. remove cancelled entries.
		 */
		lufs_logscan_freecancel();
		curtid++;
		break;
	case DT_INODE:
		/*
		 * Deltas against parts of on-disk inodes are
		 * assumed to be timestamps. Ignore those.
		 */
		if (d->d_nb != sizeof (struct dinode))
			break;
		/* FALLTHROUGH */
	case DT_CANCEL:
	case DT_ABZERO:
	case DT_AB:
	case DT_DIR:
	case DT_FBI:
		/*
		 * These types of deltas contain and/or modify structural
		 * information that is needed for booting the system:
		 * - where to find a file (DT_DIR, DT_FBI)
		 * - the file itself (DT_INODE)
		 * - data blocks associated with a file (DT_AB, DT_ABZERO)
		 *
		 * Building the hash chains becomes complicated because there
		 * may exist an older (== previously added) entry that overlaps
		 * with the one we want to add.
		 * Four cases must be distinguished:
		 * 1. The new delta is an exact match for an existing one,
		 *    or is a superset of an existing one, and both
		 *    belong to the same transaction.
		 *    The new delta completely supersedes the old one, so
		 *    remove that and reuse the structure for the new.
		 *    Then add the new delta to the head of the hashchain.
		 * 2. The new delta is an exact match for an existing one,
		 *    or is a superset of an existing one, but the two
		 *    belong to different transactions (i.e. the old one is
		 *    committed).
		 *    The existing one is marked to be cancelled when the
		 *    next DT_COMMIT record is found, and the hash chain
		 *    walk is continued as there may be more existing entries
		 *    found which overlap the new delta (happens if that is
		 *    a superset of those in the log).
		 *    Once no more overlaps are found, goto 4.
		 * 3. An existing entry completely covers the new one.
		 *    The new delta is then added directly before this
		 *    existing one.
		 * 4. No (more) overlaps with existing entries are found.
		 *    Unless this is a DT_CANCEL delta, whose only purpose
		 *    is already handled by marking overlapping entries for
		 *    cancellation, add the new delta at the hash chain head.
		 *
		 * This strategy makes sure that the hash chains are properly
		 * ordered. lufs_merge_deltas() walks the hash chain backward,
		 * which then ensures that delta merging is done in the same
		 * order as those deltas occur in the log - remember, the
		 * log can only be read in one direction.
		 *
		 * NOTE(review): for a non-empty chain, inslist() links the
		 * entry directly before the current head and leaves *lh
		 * unchanged. "Head of the hashchain" above therefore means
		 * the position that lufs_merge_deltas() visits first on
		 * its backward walk - confirm this is the intended reading.
		 */
		lh = &loghash[LB_HASHFUNC(d->d_mof)];
		l = *lh;
		do {
			/* Empty chain: nothing to compare against. */
			if (l == (lb_me_t *)NULL)
				break;
			/*
			 * This covers the first two cases above.
			 * If this is a perfect match from the same transaction,
			 * and it isn't already cancelled, we simply replace it
			 * with its newer incarnation.
			 * Otherwise, mark it for cancellation. Handling of
			 * DT_COMMIT is going to remove it, then.
			 */
			if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) {
				if (!(l->l_flags & LB_ISCANCELLED)) {
					if (l->l_tid == curtid &&
					    d->d_typ != DT_CANCEL) {
						/*
						 * 1st case - reuse the entry
						 * and re-insert it, the scan
						 * of this chain ends here.
						 */
						remlist(lh, l);
						l->l_mof = d->d_mof;
						l->l_lof = *addr;
						l->l_nb = d->d_nb;
						l->l_typ = d->d_typ;
						l->l_flags = 0;
						l->l_tid = curtid;
						inslist(lh, l);
						return (1);
					} else {
						/*
						 * 2nd case - cancel only.
						 */
						l->l_flags |= LB_ISCANCELLED;
					}
				}
			} else if (WITHIN(d->d_mof, d->d_nb,
			    l->l_mof, l->l_nb)) {
				/*
				 * This is the third case above.
				 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR
				 * this may happen - an existing previous delta
				 * is larger than the current one we're planning
				 * to add - DT_ABZERO deltas are supersets of
				 * DT_AB deltas, and likewise DT_FBI/DT_DIR.
				 * In order to do merging correctly, such deltas
				 * put up a barrier for new ones that overlap,
				 * and we have to add the new delta immediately
				 * before (!) the existing one.
				 *
				 * NOTE(review): this branch has no test for
				 * DT_CANCEL, so a cancel delta that is fully
				 * covered by an existing entry would be added
				 * as a regular map entry here - confirm
				 * whether that can happen and is intended.
				 */
				lb_me_t *newl;
				newl = lufs_alloc_me();
				if (newl == (lb_me_t *)NULL) {
					/*
					 * No memory. Throw away everything
					 * and try booting without logging
					 * support.
					 */
					curtid = 0;
					return (0);
				}
				/*
				 * l_flags is not assigned; the allocator
				 * hands out zero-filled entries.
				 */
				newl->l_mof = d->d_mof;
				newl->l_lof = *addr;	/* "payload" address */
				newl->l_nb = d->d_nb;
				newl->l_typ = d->d_typ;
				newl->l_tid = curtid;
				newl->l_prev = l->l_prev;
				newl->l_next = l;
				l->l_prev->l_next = newl;
				l->l_prev = newl;
				/* Keep the new entry ahead of the old head. */
				if (*lh == l)
					*lh = newl;
				return (1);
			}
			l = l->l_next;
		} while (l != *lh);

		/*
		 * This is case 4., add a new delta at the head of the chain.
		 *
		 * If the new delta is a DT_CANCEL entry, we handled it by
		 * marking everything it covered for cancellation. We can
		 * get by without actually adding the delta itself to the
		 * hash, as it'd need to be removed by the commit code anyway.
		 */
		if (d->d_typ == DT_CANCEL)
			break;

		l = lufs_alloc_me();
		if (l == (lb_me_t *)NULL) {
			/*
			 * No memory. Throw away everything
			 * and try booting without logging
			 * support.
			 */
			curtid = 0;
			return (0);
		}
		l->l_mof = d->d_mof;
		l->l_lof = *addr;	/* this is the "payload" address */
		l->l_nb = d->d_nb;
		l->l_typ = d->d_typ;
		l->l_tid = curtid;
		inslist(lh, l);
		break;
	default:
		break;
	}
	return (1);
}
858
859 static int
lufs_logscan_prescan(void)860 lufs_logscan_prescan(void)
861 {
862 /*
863 * Simulate a full log by setting the tail to be one sector
864 * behind the head. This will make the logscan read all
865 * of the log until an out-of-sequence sector ident is
866 * found.
867 */
868 odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE;
869 if (odi.od_tail_lof < odi.od_bol_lof)
870 odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE;
871 if (odi.od_tail_lof >= odi.od_eol_lof)
872 odi.od_tail_lof = odi.od_bol_lof;
873
874 /*
875 * While sector trailers maintain TID values, od_head_tid
876 * is not being updated by the kernel ufs logging support
877 * at this time. We therefore count transactions ourselves
878 * starting at zero - as does the kernel ufs logscan code.
879 */
880 curtid = 0;
881
882 if (!lufs_alloc_logbuf()) {
883 dprintf("Failed to allocate log buffer.\n");
884 return (0);
885 }
886
887 loghash = (lb_me_t **)lufs_alloc_from_logbuf(
888 LB_HASHSIZE * sizeof (lb_me_t *));
889 if (loghash == (lb_me_t **)NULL) {
890 dprintf("Can't allocate loghash[] array.");
891 return (0);
892 }
893 return (1);
894 }
895
896 /*
897 * This function must remove all uncommitted entries (l->l_tid == curtid)
898 * from the log hash. Doing this, we implicitly delete pending cancellations
899 * as well.
900 * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only
901 * the check for entries that need to be removed is different.
902 */
903 static void
lufs_logscan_postscan(void)904 lufs_logscan_postscan(void)
905 {
906 lb_me_t **lh, *l, *lnext;
907 int i;
908
909 for (i = 0; i < LB_HASHSIZE; i++) {
910 lh = &loghash[i];
911 l = *lh;
912 do {
913 if (l == (lb_me_t *)NULL)
914 break;
915 lnext = l->l_next;
916 if (l->l_tid == curtid) {
917 remlist(lh, l);
918 bzero((caddr_t)l, sizeof (lb_me_t));
919 l->l_next = lfreelist;
920 lfreelist = l;
921 if (*lh == (lb_me_t *)NULL)
922 break;
923 /*
924 * Just removed the hash head. In order not
925 * to terminate the while loop, respin chain
926 * walk for this hash chain.
927 */
928 if (lnext == *lh) {
929 i--;
930 break;
931 }
932 } else {
933 l->l_flags &= ~(LB_ISCANCELLED);
934 }
935 l = lnext;
936 } while (l != *lh);
937 }
938 }
939
940 /*
941 * This function builds the log hash. It performs the same sequence
942 * of actions at logscan as the kernel ufs logging support:
943 * - Prepare the log for scanning by simulating a full log.
944 * - As long as sectors read from the log have contiguous idents, do:
945 * read the delta header
946 * add the delta to the logmap
947 * skip over the contents to the start of the next delta header
948 * - After terminating the scan, remove uncommitted entries.
949 *
950 * This function cannot fail except if mapping the logbuffer area
951 * during lufs_logscan_prescan() fails. If there is a structural
952 * integrity problem and the on-disk log cannot be read, we'll
953 * treat this as the same situation as an uncommitted transaction
954 * at the end of the log (or, corner case of that, an empty log
955 * with no committed transactions in it at all).
956 *
957 */
958 static int
lufs_logscan(void)959 lufs_logscan(void)
960 {
961 int32_t addr;
962 struct delta d;
963
964 if (!lufs_logscan_prescan())
965 return (LOG_IS_ERRORED);
966
967 addr = odi.od_head_lof;
968
969 /*
970 * Note that addr == od_tail_lof means a completely filled
971 * log. This almost never happens, so the common exit path
972 * from this loop is via one of the 'break's.
973 */
974 while (addr != odi.od_tail_lof) {
975 if (!lufs_logscan_read(&addr, &d))
976 break;
977 if (!lufs_logscan_addmap(&addr, &d))
978 return (LOG_IS_ERRORED);
979 if (!lufs_logscan_skip(&addr, &d))
980 break;
981 }
982
983 lufs_logscan_postscan();
984 /*
985 * Check whether the log contains data, and if so whether
986 * it contains committed data.
987 */
988 if (addr == odi.od_head_lof || curtid == 0) {
989 return (LOG_IS_EMPTY);
990 }
991 return (LOG_IS_OK);
992 }
993
994 /*
995 * A metadata block was read from disk. Check whether the logmap
996 * has a delta against this byte range, and if so read it in, since
997 * the data in the log is more recent than what was read from other
998 * places on the disk.
999 */
1000 void
lufs_merge_deltas(fileid_t * fp)1001 lufs_merge_deltas(fileid_t *fp)
1002 {
1003 int nb;
1004 int64_t bof;
1005 lb_me_t **lh, *l;
1006 int32_t skip;
1007
1008 /*
1009 * No logmap: Empty log. Nothing to do here.
1010 */
1011 if (!ufs_is_lufs || logbuffer == (caddr_t)NULL)
1012 return;
1013
1014 bof = ldbtob(fp->fi_blocknum);
1015 nb = fp->fi_count;
1016
1017 /*
1018 * Search the log hash.
1019 * Merge deltas if an overlap is found.
1020 */
1021
1022 lh = &loghash[LB_HASHFUNC(bof)];
1023
1024 if (*lh == (lb_me_t *)NULL)
1025 return;
1026
1027 l = *lh;
1028
1029 do {
1030 l = l->l_prev;
1031 if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) {
1032 /*
1033 * Found a delta in the log hash which overlaps
1034 * with the current metadata block. Read the
1035 * actual delta payload from the on-disk log
1036 * directly into the file buffer.
1037 */
1038 if (l->l_typ != DT_ABZERO) {
1039 /*
1040 * We have to actually read this part of the
1041 * log as it could contain a sector trailer, or
1042 * wrap around the end of the log.
1043 * If it did, the second offset generation would
1044 * be incorrect if we'd started at l->l_lof.
1045 */
1046 if (!(skip = lufs_read_log(l->l_lof, NULL,
1047 MAX(bof - l->l_mof, 0))))
1048 dprintf("scan/merge error, pre-skip\n");
1049 if (!(skip = lufs_read_log(skip,
1050 fp->fi_memp + MAX(l->l_mof - bof, 0),
1051 MIN(l->l_mof + l->l_nb, bof + nb) -
1052 MAX(l->l_mof, bof))))
1053 dprintf("scan/merge error, merge\n");
1054 } else {
1055 /*
1056 * DT_ABZERO requires no disk access, just
1057 * clear the byte range which overlaps with
1058 * the delta.
1059 */
1060 bzero(fp->fi_memp + MAX(l->l_mof - bof, 0),
1061 MIN(l->l_mof + l->l_nb, bof + nb) -
1062 MAX(l->l_mof, bof));
1063 }
1064 }
1065 } while (l->l_prev != (*lh)->l_prev);
1066
1067 printf("*\b");
1068 }
1069
1070 void
lufs_closeall(void)1071 lufs_closeall(void)
1072 {
1073 if (ufs_is_lufs) {
1074 bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize);
1075 bkmem_free((char *)logfp, sizeof (fileid_t));
1076 eb = (extent_block_t *)NULL;
1077 bzero((caddr_t)&odi, sizeof (ml_odunit_t));
1078 logfp = (fileid_t *)NULL;
1079 lufs_free_logbuf();
1080 ufs_is_lufs = 0;
1081 }
1082 }
1083