1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/param.h> 27 #include <sys/vnode.h> 28 #include <sys/fs/ufs_fsdir.h> 29 #include <sys/fs/ufs_fs.h> 30 #include <sys/fs/ufs_inode.h> 31 #include <sys/fs/ufs_log.h> 32 #include <sys/sysmacros.h> 33 #include <sys/promif.h> 34 #include <sys/machparam.h> 35 36 #include <sys/stat.h> 37 #include <sys/bootdebug.h> 38 #include <sys/salib.h> 39 #include <sys/saio.h> 40 #include <sys/filep.h> 41 42 43 /* 44 * Big theory statement on how ufsboot makes use of the log 45 * in case the filesystem wasn't shut down cleanly. 46 * 47 * The structure of the ufs on-disk log looks like this: 48 * 49 * +-----------------+ 50 * | SUPERBLOCK | 51 * | ... | 52 * | fs_logbno +--> +-----------------------+ 53 * | ... | | EXTENT BLOCK | 54 * +-----------------+ | ... | 55 * | nextents | 56 * +----------------------+ extents[0].pbno | 57 * | | { extents[1].pbno } +------------+ 58 * | | ... +--> ... | 59 * | +-----------------------+ | 60 * v | 61 * +-----------------------------+ \ | 62 * | ON-DISK LOG HEADER | | | 63 * | ... | | | 64 * | od_head_lof +--+ | | 65 * | ... | | | | 66 * +-----------------------------+ <|---|- od_bol_lof | 67 * | sector (may contain deltas) | | | (logical offset) | 68 * | +-------------------------+ | | | 69 * | | trailer (some ident#) | | > extents[0].nbno | 70 * +---+-------------------------+ | | blocks ("sectors") | 71 * . . | | | 72 * . . | | | 73 * +-----------------------------+<-+ | | 74 * | delta1 delta2 delta3 | | | 75 * | d +-------------------------+ | | 76 * | e | ident#: od_head_ident | | | 77 * +---+-------------------------+ / | 78 * | 79 * +-----------------------------+ <---------------------------+ 80 * | lta4 delta5 delta6 de | 81 * | l +-------------------------+ 82 * | t | ident#: od_head_ident+1 | 83 * +---+-------------------------+ 84 * . . 85 * +-----------------------------+ 86 * | sector (may contain deltas) | 87 * | +------------------+ 88 * | | trailer (ident#) | 89 * +----------+------------------+ <-- od_eol_lof (logical offset) 90 * 91 * The ufs on-disk log has the following properties: 92 * 93 * 1. The log is made up from at least one extent. "fs_logbno" in 94 * the superblock points to where this is found. 95 * 2. Extents describe the logical layout. 96 * - Logical offset 0 is the on-disk log header. It's also 97 * at the beginning of the first physical block. 98 * - If there's more than one extent, the equation holds: 99 * extent[i+1].lbno == extent[i].lbno + extent[i].nbno 100 * i.e. logical offsets form a contiguous sequence. Yet on disk, 101 * two logically-adjacent offsets may be located in two 102 * physically disjoint extents, so logical offsets need to be 103 * translated into physical disk block addresses for access. 104 * - Various fields in the on-disk log header structure refer 105 * to such logical log offsets. 106 * 3. The actual logical logspace begins after the log header, at 107 * the logical offset indicated by "od_bol_lof". Every 512 Bytes 108 * (a "sector" in terms of ufs logging) is a sector trailer which 109 * contains a sequence number, the sector ident. 110 * 4. Deltas are packed tight in the remaining space, i.e. a delta 111 * may be part of more than one sector. Reads from the logspace 112 * must be split at sector boundaries, since the trailer is never 113 * part of a delta. Delta sizes vary. 114 * 5. The field "od_head_lof" points to the start of the dirty part 115 * of the log, i.e. to the first delta header. Likewise, "od_head_ident" 116 * is the sequence number where the valid part of the log starts; if 117 * the sector pointed to by "od_head_lof" has a sector ident different 118 * from "od_head_ident", the log is empty. 119 * 6. The valid part of the log extends for as many sectors as their ident 120 * numbers form a contiguous sequence. When reaching the logical end of 121 * the log, "od_bol_lof", logical offsets wrap around to "od_bol_lof", 122 * i.e. the log forms a circular buffer. 123 * 124 * For the strategy how to handle accessing the log, item 4. is the 125 * most important one - its consequence is that the log can only be 126 * read in one direction - forward, starting at the head. 127 * 128 * The task of identifying whether a given metadata block is 129 * actually in the log therefore requires reading the entire 130 * log. Doing so is memory-efficient but kills speed if re-done 131 * at every metadata read (64MB log size vs. 512 byte metadata 132 * block size: 128 times as much I/O, possibly only to find out 133 * that this block was not in the log ...). 134 * 135 * First thought to speed this up is to let ufsboot roll the log. 136 * But this is not possible because: 137 * - ufsboot currently does not implement any write functionality, 138 * the boot-time ufs implementation is read-only. 139 * - firmware write interfaces may or may not be available, in any 140 * case, they're rarely used and untested for such a purpose. 141 * - that would duplicate a lot of code, since at the moment only 142 * kernel ufs logging implements log rolling. 143 * - the boot environment cannot be considered high-performance; 144 * rolling the log there would be slow. 145 * - boot device and root device could well be different, creating 146 * inconsistencies e.g. with a mirrored root if the log is rolled. 147 * 148 * Therefore, caching the log structural information (boot-relevant 149 * deltas and their logical log offset) is required for fast access 150 * to the data in the log. This code builds a logmap for that purpose. 151 * 152 * As a simple optimization, if we find the log is empty, we will not 153 * use it - log reader support for ufsboot has no noticeable overhead 154 * for clean logs, or for root filesystems that aren't logging. 155 */ 156 157 #define LB_HASHSHIFT 13 158 #define LB_HASHSIZE (1 << LB_HASHSHIFT) 159 #define LB_HASHFUNC(mof) (((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1)) 160 161 #define LOGBUF_MAXSIZE (8*1024*1024) 162 #define LOGBUF_MINSIZE (256*1024) 163 164 #define LOG_IS_EMPTY 0 165 #define LOG_IS_OK 1 166 #define LOG_IS_ERRORED 2 167 168 /* 169 * We build a hashed logmap of those while scanning the log. 170 * sizeof(lb_map_t) is 40 on 64bit, 32 on 32bit; the max sized 171 * resalloc'ed buffer can accomodate around ~500k of those; 172 * this is approximately the maximum amount of deltas we'll 173 * see if a 64MB ufs log is completely filled. We'll make no 174 * attempt to free and reallocate the resalloc'ed buffer if 175 * we overflow, as conservative sizing should make that an 176 * impossibility. A future enhancement may allocate memory 177 * here as needed - once the boot time memory allocator 178 * supports that. 179 */ 180 typedef struct lb_mapentry { 181 struct lb_mapentry *l_next; /* hash chaining */ 182 struct lb_mapentry *l_prev; /* hash chaining */ 183 int64_t l_mof; /* disk addr this delta is against */ 184 int16_t l_nb; /* size of delta */ 185 int16_t l_flags; 186 int32_t l_lof; /* log offset for delta header */ 187 int32_t l_tid; /* transaction this delta is part of */ 188 delta_t l_typ; /* see <sys/fs/ufs_trans.h> */ 189 } lb_me_t; 190 191 #define LB_ISCANCELLED 1 192 193 #define inslist(lh, l) if ((*(lh))) { \ 194 (*(lh))->l_prev->l_next = (l); \ 195 (l)->l_next = (*(lh)); \ 196 (l)->l_prev = (*(lh))->l_prev; \ 197 (*(lh))->l_prev = (l); \ 198 } else { \ 199 (l)->l_next = (l); \ 200 (l)->l_prev = (l); \ 201 (*(lh)) = l; \ 202 } 203 204 #define remlist(lh, l) \ 205 if ((l)->l_next == (l)) { \ 206 if (*(lh) != (l) || (l)->l_prev != (l)) \ 207 dprintf("Logmap hash inconsistency.\n"); \ 208 *(lh) = (lb_me_t *)NULL; \ 209 } else { \ 210 if (*(lh) == (l)) \ 211 *(lh) = (l)->l_next; \ 212 (l)->l_prev->l_next = (l)->l_next; \ 213 (l)->l_next->l_prev = (l)->l_prev; \ 214 } 215 216 #define lufs_alloc_me() \ 217 (lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)) 218 219 extern int boothowto; 220 static int ufs_is_lufs = 0; 221 static fileid_t *logfp = (fileid_t *)NULL; 222 static extent_block_t *eb = (extent_block_t *)NULL; 223 static ml_odunit_t odi; 224 225 static char logbuffer_min[LOGBUF_MINSIZE]; 226 static caddr_t logbuffer = (caddr_t)NULL; 227 static caddr_t elogbuffer = (caddr_t)NULL; 228 static caddr_t logbuf_curptr; 229 static lb_me_t **loghash = (lb_me_t **)NULL; 230 static lb_me_t *lfreelist; 231 232 static uint32_t curtid; 233 234 235 int lufs_support = 1; 236 237 void lufs_boot_init(fileid_t *); 238 void lufs_closeall(void); 239 void lufs_merge_deltas(fileid_t *); 240 241 static int lufs_logscan(void); 242 243 extern int diskread(fileid_t *filep); 244 extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int); 245 246 #if defined(__sparcv9) 247 #define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE)) 248 #endif 249 250 static int 251 lufs_alloc_logbuf(void) 252 { 253 /* 254 * Allocate memory for caching the log. Since the logbuffer can 255 * potentially exceed the boot scratch memory limit, we use resalloc 256 * directly, passing the allocation to the low-level boot-time 257 * backend allocator. The chosen VA range is the top end of 258 * the kernel's segmap segment, so we're not interfering 259 * with the kernel because segmap is created at a time when 260 * the 2nd-stage boot has already been unloaded and this VA 261 * range was given back. 262 * 263 * On sparc platforms, the kernel cannot recover the memory 264 * obtained from resalloc because the page structs are allocated 265 * before the call to BOP_QUIESCE. To avoid leaking this 266 * memory, the logbuffer is allocated from a small bss array 267 * that should hold the logmap except in the most extreme cases. 268 * If the bss array is too small, the logbuffer is extended 269 * from resalloc 1 page at a time. 270 */ 271 272 logbuffer = logbuffer_min; 273 elogbuffer = logbuffer+LOGBUF_MINSIZE; 274 logbuf_curptr = logbuffer; 275 lfreelist = (lb_me_t *)NULL; 276 277 if (logbuffer == (caddr_t)NULL) 278 return (0); 279 280 dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n", 281 logbuffer, elogbuffer-logbuffer); 282 283 return (1); 284 } 285 286 static void 287 lufs_free_logbuf() 288 { 289 /* 290 * Solaris/x86 has no prom_free() routine at this time. 291 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86 292 * is done by the kernel startup itself, in hat_unload_prom() 293 * after the bootloader has been quiesced. 294 * 295 * Solaris on sparc has a prom_free() routine that will update 296 * the memlist properties to reflect the freeing of the 297 * logbuffer. However, the sparc kernel cannot recover 298 * the memory freed after the call to BOP_QUIESCE as the 299 * page struct have already been allocated. We call 300 * prom_free anyway so that the kernel can reclaim this 301 * memory in the future. 302 */ 303 if (logbuffer == LOGBUF_BASEADDR) 304 prom_free(logbuffer, elogbuffer-logbuffer); 305 logbuffer = (caddr_t)NULL; 306 } 307 308 static caddr_t 309 lufs_alloc_from_logbuf(size_t sz) 310 { 311 caddr_t tmpaddr; 312 lb_me_t *l; 313 314 /* 315 * Satisfy lb_me_t allocations from the freelist 316 * first if possible. 317 */ 318 if ((sz == sizeof (lb_me_t)) && lfreelist) { 319 l = lfreelist; 320 lfreelist = lfreelist->l_next; 321 return ((caddr_t)l); 322 } 323 if (elogbuffer < logbuf_curptr + sz) { 324 caddr_t np; 325 size_t nsz; 326 327 /* 328 * Out of space in current chunk - try to add another. 329 */ 330 if (logbuffer == logbuffer_min) { 331 np = LOGBUF_BASEADDR; 332 } else { 333 np = elogbuffer; 334 } 335 nsz = roundup(sz, PAGESIZE); 336 if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) { 337 return ((caddr_t)NULL); 338 } 339 340 np = resalloc(RES_CHILDVIRT, nsz, np, 0UL); 341 if (np == (caddr_t)NULL) { 342 return ((caddr_t)NULL); 343 } 344 if (logbuffer == logbuffer_min) 345 logbuffer = LOGBUF_BASEADDR; 346 logbuf_curptr = np; 347 elogbuffer = logbuf_curptr + nsz; 348 } 349 350 tmpaddr = logbuf_curptr; 351 logbuf_curptr += sz; 352 bzero(tmpaddr, sz); 353 return (tmpaddr); 354 } 355 356 static int32_t 357 lufs_read_log(int32_t addr, caddr_t va, int nb) 358 { 359 int i, fastpath = 0; 360 daddr_t pblk, lblk; 361 sect_trailer_t *st; 362 uint32_t ident; 363 364 /* 365 * Fast path for skipping the read if no target buffer 366 * is specified. Don't do this for the initial scan. 367 */ 368 if (ufs_is_lufs && (va == (caddr_t)NULL)) 369 fastpath = 1; 370 371 while (nb) { 372 /* log wraparound check */ 373 if (addr == odi.od_eol_lof) 374 addr = odi.od_bol_lof; 375 if (fastpath) 376 goto read_done; 377 378 /* 379 * Translate logically-contiguous log offsets into physical 380 * block numbers. For a log consisting of a single extent: 381 * pbno = btodb(addr) - extents[0].lbno; 382 * Otherwise, search for the extent which contains addr. 383 */ 384 pblk = 0; 385 lblk = btodb(addr); 386 for (i = 0; i < eb->nextents; i++) { 387 if (lblk >= eb->extents[i].lbno && 388 lblk < eb->extents[i].lbno + 389 eb->extents[i].nbno) { 390 pblk = lblk - eb->extents[i].lbno + 391 eb->extents[i].pbno; 392 break; 393 } 394 } 395 396 if (pblk == 0) { 397 /* 398 * block #0 can never be in a log extent since this 399 * block always contains the primary superblock copy. 400 */ 401 dprintf("No log extent found for log offset 0x%llx.\n", 402 addr); 403 return (0); 404 } 405 406 /* 407 * Check whether the block we want is cached from the last 408 * read. If not, read it in now. 409 */ 410 if (logfp->fi_blocknum != pblk) { 411 logfp->fi_blocknum = pblk; 412 logfp->fi_memp = logfp->fi_buf; 413 logfp->fi_count = DEV_BSIZE; 414 logfp->fi_offset = 0; 415 if (diskread(logfp)) { 416 dprintf("I/O error reading the ufs log" \ 417 " at block 0x%x.\n", 418 logfp->fi_blocknum); 419 return (0); 420 } 421 /* 422 * Log structure verification. The block which we just 423 * read has an ident number that must match its offset 424 * in blocks from the head of the log. Since the log 425 * can wrap around, we have to check for that to get the 426 * ident right. Out-of-sequence idents can happen after 427 * power failures, panics during a partial transaction, 428 * media errors, ... - in any case, they mark the end of 429 * the valid part of the log. 430 */ 431 st = (sect_trailer_t *)(logfp->fi_memp + 432 LDL_USABLE_BSIZE); 433 /* od_head_ident is where the sequence starts */ 434 ident = odi.od_head_ident; 435 if (lblk >= lbtodb(odi.od_head_lof)) { 436 /* no wraparound */ 437 ident += (lblk - lbtodb(odi.od_head_lof)); 438 } else { 439 /* log wrapped around the end */ 440 ident += (lbtodb(odi.od_eol_lof) - 441 lbtodb(odi.od_head_lof)); 442 ident += (lblk - lbtodb(odi.od_bol_lof)); 443 } 444 445 if (ident != st->st_ident) 446 return (0); 447 } 448 read_done: 449 /* 450 * Copy the delta contents to the destination buffer if 451 * one was specified. Otherwise, just skip the contents. 452 */ 453 i = MIN(NB_LEFT_IN_SECTOR(addr), nb); 454 if (va != NULL) { 455 bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))), 456 va, i); 457 va += i; 458 } 459 nb -= i; 460 addr += i; 461 /* 462 * Skip sector trailer if necessary. 463 */ 464 if (NB_LEFT_IN_SECTOR(addr) == 0) 465 addr += sizeof (sect_trailer_t); 466 } 467 return (addr); 468 } 469 470 void 471 lufs_boot_init(fileid_t *filep) 472 { 473 struct fs *sb = (struct fs *)filep->fi_memp; 474 int err = 0; 475 476 /* 477 * boot_ufs_mountroot() should have called us with a 478 * filep pointing to the superblock. Verify that this 479 * is so first. 480 * Then check whether this filesystem has a dirty log. 481 * Also return if lufs support was disabled on request. 482 */ 483 if (!lufs_support || 484 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs || 485 sb->fs_clean != FSLOG || sb->fs_logbno == 0) { 486 return; 487 } 488 489 if (boothowto & RB_VERBOSE) 490 printf("The boot filesystem is logging.\n"); 491 492 /* 493 * The filesystem is logging, there is a log area 494 * allocated for it. Check the log state and determine 495 * whether it'll be possible to use this log. 496 */ 497 498 /* 499 * Allocate a private fileid_t for use when reading 500 * from the log. 501 */ 502 eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize); 503 logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t)); 504 logfp->fi_memp = logfp->fi_buf; 505 logfp->fi_devp = filep->fi_devp; 506 507 /* 508 * Read the extent block and verify that what we 509 * find there are actually lufs extents. 510 * Make it simple: the extent block including all 511 * extents cannot be larger than a filesystem block. 512 * So read a whole filesystem block, to make sure 513 * we have read all extents in the same operation. 514 */ 515 logfp->fi_blocknum = sb->fs_logbno; 516 logfp->fi_count = sb->fs_bsize; 517 logfp->fi_memp = (caddr_t)eb; 518 logfp->fi_offset = 0; 519 if (diskread(logfp) || eb->type != LUFS_EXTENTS) { 520 dprintf("Failed to read log extent block.\n"); 521 err = LOG_IS_ERRORED; 522 goto out; 523 } 524 525 /* 526 * Read the on disk log header. If that fails, 527 * try the backup copy on the adjacent block. 528 */ 529 logfp->fi_blocknum = eb->extents[0].pbno; 530 logfp->fi_count = sizeof (ml_odunit_t); 531 logfp->fi_memp = (caddr_t)&odi; 532 logfp->fi_offset = 0; 533 if (diskread(logfp)) { 534 logfp->fi_blocknum = eb->extents[0].pbno + 1; 535 logfp->fi_count = sizeof (ml_odunit_t); 536 logfp->fi_memp = (caddr_t)&odi; 537 logfp->fi_offset = 0; 538 if (diskread(logfp)) { 539 dprintf("Failed to read on-disk log header.\n"); 540 err = LOG_IS_ERRORED; 541 goto out; 542 } 543 } 544 545 /* 546 * Verify that we understand this log, and 547 * that the log isn't bad or empty. 548 */ 549 if (odi.od_version != LUFS_VERSION_LATEST) { 550 dprintf("On-disk log format v%d != supported format v%d.\n", 551 odi.od_version, LUFS_VERSION_LATEST); 552 err = LOG_IS_ERRORED; 553 } else if (odi.od_badlog) { 554 dprintf("On-disk log is marked bad.\n"); 555 err = LOG_IS_ERRORED; 556 } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) { 557 dprintf("On-disk log checksum %d != ident sum %d.\n", 558 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident); 559 err = LOG_IS_ERRORED; 560 } else { 561 /* 562 * All consistency checks ok. Scan the log, build the 563 * log hash. If this succeeds we'll be using the log 564 * when reading from this filesystem. 565 */ 566 err = lufs_logscan(); 567 } 568 out: 569 ufs_is_lufs = 1; 570 switch (err) { 571 case LOG_IS_EMPTY: 572 if (boothowto & RB_VERBOSE) 573 printf("The ufs log is empty and will not be used.\n"); 574 lufs_closeall(); 575 break; 576 case LOG_IS_OK: 577 if (boothowto & RB_VERBOSE) 578 printf("Using the ufs log.\n"); 579 break; 580 case LOG_IS_ERRORED: 581 if (boothowto & RB_VERBOSE) 582 printf("Couldn't build log hash. Can't use ufs log.\n"); 583 lufs_closeall(); 584 break; 585 default: 586 dprintf("Invalid error %d while scanning the ufs log.\n", err); 587 break; 588 } 589 } 590 591 static int 592 lufs_logscan_read(int32_t *addr, struct delta *d) 593 { 594 *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta)); 595 596 if (*addr == 0 || 597 (int)d->d_typ < DT_NONE || d->d_typ > DT_MAX || 598 d->d_nb >= odi.od_logsize) 599 return (0); 600 601 return (1); 602 } 603 604 static int 605 lufs_logscan_skip(int32_t *addr, struct delta *d) 606 { 607 switch (d->d_typ) { 608 case DT_COMMIT: 609 /* 610 * A DT_COMMIT delta has no size as such, but will 611 * always "fill up" the sector that contains it. 612 * The next delta header is found at the beginning 613 * of the next 512-Bytes sector, adjust "addr" to 614 * reflect that. 615 */ 616 *addr += ((*addr & (DEV_BSIZE - 1))) ? 617 NB_LEFT_IN_SECTOR(*addr) + 618 sizeof (sect_trailer_t) : 0; 619 return (1); 620 case DT_CANCEL: 621 case DT_ABZERO: 622 /* 623 * These types of deltas occupy no space in the log 624 */ 625 return (1); 626 default: 627 /* 628 * Skip over the delta contents. 629 */ 630 *addr = lufs_read_log(*addr, NULL, d->d_nb); 631 } 632 633 return (*addr != 0); 634 } 635 636 static void 637 lufs_logscan_freecancel(void) 638 { 639 lb_me_t **lh, *l, *lnext; 640 int i; 641 642 /* 643 * Walk the entire log hash and put cancelled entries 644 * onto the freelist. Corner cases: 645 * a) empty hash chain (*lh == NULL) 646 * b) only one entry in chain, and that is cancelled. 647 * If for every cancelled delta another one would've 648 * been added, this situation couldn't occur, but a 649 * DT_CANCEL delta can lead to this as it is never 650 * added. 651 */ 652 for (i = 0; i < LB_HASHSIZE; i++) { 653 lh = &loghash[i]; 654 l = *lh; 655 do { 656 if (*lh == (lb_me_t *)NULL) 657 break; 658 lnext = l->l_next; 659 if (l->l_flags & LB_ISCANCELLED) { 660 remlist(lh, l); 661 bzero((caddr_t)l, sizeof (lb_me_t)); 662 l->l_next = lfreelist; 663 lfreelist = l; 664 /* 665 * Just removed the hash head. In order not 666 * to terminate the while loop, respin chain 667 * walk for this hash chain. 668 */ 669 if (lnext == *lh) { 670 i--; 671 break; 672 } 673 } 674 l = lnext; 675 } while (l != *lh); 676 } 677 } 678 679 static int 680 lufs_logscan_addmap(int32_t *addr, struct delta *d) 681 { 682 lb_me_t **lh, *l; 683 684 switch (d->d_typ) { 685 case DT_COMMIT: 686 /* 687 * Handling DT_COMMIT deltas is special. We need to: 688 * 1. increase the transaction ID 689 * 2. remove cancelled entries. 690 */ 691 lufs_logscan_freecancel(); 692 curtid++; 693 break; 694 case DT_INODE: 695 /* 696 * Deltas against parts of on-disk inodes are 697 * assumed to be timestamps. Ignore those. 698 */ 699 if (d->d_nb != sizeof (struct dinode)) 700 break; 701 /* FALLTHROUGH */ 702 case DT_CANCEL: 703 case DT_ABZERO: 704 case DT_AB: 705 case DT_DIR: 706 case DT_FBI: 707 /* 708 * These types of deltas contain and/or modify structural 709 * information that is needed for booting the system: 710 * - where to find a file (DT_DIR, DT_FBI) 711 * - the file itself (DT_INODE) 712 * - data blocks associated with a file (DT_AB, DT_ABZERO) 713 * 714 * Building the hash chains becomes complicated because there 715 * may exist an older (== previously added) entry that overlaps 716 * with the one we want to add. 717 * Four cases must be distinguished: 718 * 1. The new delta is an exact match for an existing one, 719 * or is a superset of an existing one, and both 720 * belong to the same transaction. 721 * The new delta completely supersedes the old one, so 722 * remove that and reuse the structure for the new. 723 * Then add the new delta to the head of the hashchain. 724 * 2. The new delta is an exact match for an existing one, 725 * or is a superset of an existing one, but the two 726 * belong to different transactions (i.e. the old one is 727 * committed). 728 * The existing one is marked to be cancelled when the 729 * next DT_COMMIT record is found, and the hash chain 730 * walk is continued as there may be more existing entries 731 * found which overlap the new delta (happens if that is 732 * a superset of those in the log). 733 * Once no more overlaps are found, goto 4. 734 * 3. An existing entry completely covers the new one. 735 * The new delta is then added directly before this 736 * existing one. 737 * 4. No (more) overlaps with existing entries are found. 738 * Unless this is a DT_CANCEL delta, whose only purpose 739 * is already handled by marking overlapping entries for 740 * cancellation, add the new delta at the hash chain head. 741 * 742 * This strategy makes sure that the hash chains are properly 743 * ordered. lufs_merge_deltas() walks the hash chain backward, 744 * which then ensures that delta merging is done in the same 745 * order as those deltas occur in the log - remember, the 746 * log can only be read in one direction. 747 * 748 */ 749 lh = &loghash[LB_HASHFUNC(d->d_mof)]; 750 l = *lh; 751 do { 752 if (l == (lb_me_t *)NULL) 753 break; 754 /* 755 * This covers the first two cases above. 756 * If this is a perfect match from the same transaction, 757 * and it isn't already cancelled, we simply replace it 758 * with its newer incarnation. 759 * Otherwise, mark it for cancellation. Handling of 760 * DT_COMMIT is going to remove it, then. 761 */ 762 if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) { 763 if (!(l->l_flags & LB_ISCANCELLED)) { 764 if (l->l_tid == curtid && 765 d->d_typ != DT_CANCEL) { 766 remlist(lh, l); 767 l->l_mof = d->d_mof; 768 l->l_lof = *addr; 769 l->l_nb = d->d_nb; 770 l->l_typ = d->d_typ; 771 l->l_flags = 0; 772 l->l_tid = curtid; 773 inslist(lh, l); 774 return (1); 775 } else { 776 /* 777 * 2nd case - cancel only. 778 */ 779 l->l_flags |= LB_ISCANCELLED; 780 } 781 } 782 } else if (WITHIN(d->d_mof, d->d_nb, 783 l->l_mof, l->l_nb)) { 784 /* 785 * This is the third case above. 786 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR 787 * this may happen - an existing previous delta 788 * is larger than the current one we're planning 789 * to add - DT_ABZERO deltas are supersets of 790 * DT_AB deltas, and likewise DT_FBI/DT_DIR. 791 * In order to do merging correctly, such deltas 792 * put up a barrier for new ones that overlap, 793 * and we have to add the new delta immediately 794 * before (!) the existing one. 795 */ 796 lb_me_t *newl; 797 newl = lufs_alloc_me(); 798 if (newl == (lb_me_t *)NULL) { 799 /* 800 * No memory. Throw away everything 801 * and try booting without logging 802 * support. 803 */ 804 curtid = 0; 805 return (0); 806 } 807 newl->l_mof = d->d_mof; 808 newl->l_lof = *addr; /* "payload" address */ 809 newl->l_nb = d->d_nb; 810 newl->l_typ = d->d_typ; 811 newl->l_tid = curtid; 812 newl->l_prev = l->l_prev; 813 newl->l_next = l; 814 l->l_prev->l_next = newl; 815 l->l_prev = newl; 816 if (*lh == l) 817 *lh = newl; 818 return (1); 819 } 820 l = l->l_next; 821 } while (l != *lh); 822 823 /* 824 * This is case 4., add a new delta at the head of the chain. 825 * 826 * If the new delta is a DT_CANCEL entry, we handled it by 827 * marking everything it covered for cancellation. We can 828 * get by without actually adding the delta itself to the 829 * hash, as it'd need to be removed by the commit code anyway. 830 */ 831 if (d->d_typ == DT_CANCEL) 832 break; 833 834 l = lufs_alloc_me(); 835 if (l == (lb_me_t *)NULL) { 836 /* 837 * No memory. Throw away everything 838 * and try booting without logging 839 * support. 840 */ 841 curtid = 0; 842 return (0); 843 } 844 l->l_mof = d->d_mof; 845 l->l_lof = *addr; /* this is the "payload" address */ 846 l->l_nb = d->d_nb; 847 l->l_typ = d->d_typ; 848 l->l_tid = curtid; 849 inslist(lh, l); 850 break; 851 default: 852 break; 853 } 854 return (1); 855 } 856 857 static int 858 lufs_logscan_prescan(void) 859 { 860 /* 861 * Simulate a full log by setting the tail to be one sector 862 * behind the head. This will make the logscan read all 863 * of the log until an out-of-sequence sector ident is 864 * found. 865 */ 866 odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE; 867 if (odi.od_tail_lof < odi.od_bol_lof) 868 odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE; 869 if (odi.od_tail_lof >= odi.od_eol_lof) 870 odi.od_tail_lof = odi.od_bol_lof; 871 872 /* 873 * While sector trailers maintain TID values, od_head_tid 874 * is not being updated by the kernel ufs logging support 875 * at this time. We therefore count transactions ourselves 876 * starting at zero - as does the kernel ufs logscan code. 877 */ 878 curtid = 0; 879 880 if (!lufs_alloc_logbuf()) { 881 dprintf("Failed to allocate log buffer.\n"); 882 return (0); 883 } 884 885 loghash = (lb_me_t **)lufs_alloc_from_logbuf( 886 LB_HASHSIZE * sizeof (lb_me_t *)); 887 if (loghash == (lb_me_t **)NULL) { 888 dprintf("Can't allocate loghash[] array."); 889 return (0); 890 } 891 return (1); 892 } 893 894 /* 895 * This function must remove all uncommitted entries (l->l_tid == curtid) 896 * from the log hash. Doing this, we implicitly delete pending cancellations 897 * as well. 898 * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only 899 * the check for entries that need to be removed is different. 900 */ 901 static void 902 lufs_logscan_postscan(void) 903 { 904 lb_me_t **lh, *l, *lnext; 905 int i; 906 907 for (i = 0; i < LB_HASHSIZE; i++) { 908 lh = &loghash[i]; 909 l = *lh; 910 do { 911 if (l == (lb_me_t *)NULL) 912 break; 913 lnext = l->l_next; 914 if (l->l_tid == curtid) { 915 remlist(lh, l); 916 bzero((caddr_t)l, sizeof (lb_me_t)); 917 l->l_next = lfreelist; 918 lfreelist = l; 919 if (*lh == (lb_me_t *)NULL) 920 break; 921 /* 922 * Just removed the hash head. In order not 923 * to terminate the while loop, respin chain 924 * walk for this hash chain. 925 */ 926 if (lnext == *lh) { 927 i--; 928 break; 929 } 930 } else { 931 l->l_flags &= ~(LB_ISCANCELLED); 932 } 933 l = lnext; 934 } while (l != *lh); 935 } 936 } 937 938 /* 939 * This function builds the log hash. It performs the same sequence 940 * of actions at logscan as the kernel ufs logging support: 941 * - Prepare the log for scanning by simulating a full log. 942 * - As long as sectors read from the log have contiguous idents, do: 943 * read the delta header 944 * add the delta to the logmap 945 * skip over the contents to the start of the next delta header 946 * - After terminating the scan, remove uncommitted entries. 947 * 948 * This function cannot fail except if mapping the logbuffer area 949 * during lufs_logscan_prescan() fails. If there is a structural 950 * integrity problem and the on-disk log cannot be read, we'll 951 * treat this as the same situation as an uncommitted transaction 952 * at the end of the log (or, corner case of that, an empty log 953 * with no committed transactions in it at all). 954 * 955 */ 956 static int 957 lufs_logscan(void) 958 { 959 int32_t addr; 960 struct delta d; 961 962 if (!lufs_logscan_prescan()) 963 return (LOG_IS_ERRORED); 964 965 addr = odi.od_head_lof; 966 967 /* 968 * Note that addr == od_tail_lof means a completely filled 969 * log. This almost never happens, so the common exit path 970 * from this loop is via one of the 'break's. 971 */ 972 while (addr != odi.od_tail_lof) { 973 if (!lufs_logscan_read(&addr, &d)) 974 break; 975 if (!lufs_logscan_addmap(&addr, &d)) 976 return (LOG_IS_ERRORED); 977 if (!lufs_logscan_skip(&addr, &d)) 978 break; 979 } 980 981 lufs_logscan_postscan(); 982 /* 983 * Check whether the log contains data, and if so whether 984 * it contains committed data. 985 */ 986 if (addr == odi.od_head_lof || curtid == 0) { 987 return (LOG_IS_EMPTY); 988 } 989 return (LOG_IS_OK); 990 } 991 992 /* 993 * A metadata block was read from disk. Check whether the logmap 994 * has a delta against this byte range, and if so read it in, since 995 * the data in the log is more recent than what was read from other 996 * places on the disk. 997 */ 998 void 999 lufs_merge_deltas(fileid_t *fp) 1000 { 1001 int nb; 1002 int64_t bof; 1003 lb_me_t **lh, *l; 1004 int32_t skip; 1005 1006 /* 1007 * No logmap: Empty log. Nothing to do here. 1008 */ 1009 if (!ufs_is_lufs || logbuffer == (caddr_t)NULL) 1010 return; 1011 1012 bof = ldbtob(fp->fi_blocknum); 1013 nb = fp->fi_count; 1014 1015 /* 1016 * Search the log hash. 1017 * Merge deltas if an overlap is found. 1018 */ 1019 1020 lh = &loghash[LB_HASHFUNC(bof)]; 1021 1022 if (*lh == (lb_me_t *)NULL) 1023 return; 1024 1025 l = *lh; 1026 1027 do { 1028 l = l->l_prev; 1029 if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) { 1030 /* 1031 * Found a delta in the log hash which overlaps 1032 * with the current metadata block. Read the 1033 * actual delta payload from the on-disk log 1034 * directly into the file buffer. 1035 */ 1036 if (l->l_typ != DT_ABZERO) { 1037 /* 1038 * We have to actually read this part of the 1039 * log as it could contain a sector trailer, or 1040 * wrap around the end of the log. 1041 * If it did, the second offset generation would 1042 * be incorrect if we'd started at l->l_lof. 1043 */ 1044 if (!(skip = lufs_read_log(l->l_lof, NULL, 1045 MAX(bof - l->l_mof, 0)))) 1046 dprintf("scan/merge error, pre-skip\n"); 1047 if (!(skip = lufs_read_log(skip, 1048 fp->fi_memp + MAX(l->l_mof - bof, 0), 1049 MIN(l->l_mof + l->l_nb, bof + nb) - 1050 MAX(l->l_mof, bof)))) 1051 dprintf("scan/merge error, merge\n"); 1052 } else { 1053 /* 1054 * DT_ABZERO requires no disk access, just 1055 * clear the byte range which overlaps with 1056 * the delta. 1057 */ 1058 bzero(fp->fi_memp + MAX(l->l_mof - bof, 0), 1059 MIN(l->l_mof + l->l_nb, bof + nb) - 1060 MAX(l->l_mof, bof)); 1061 } 1062 } 1063 } while (l->l_prev != (*lh)->l_prev); 1064 1065 printf("*\b"); 1066 } 1067 1068 void 1069 lufs_closeall(void) 1070 { 1071 if (ufs_is_lufs) { 1072 bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize); 1073 bkmem_free((char *)logfp, sizeof (fileid_t)); 1074 eb = (extent_block_t *)NULL; 1075 bzero((caddr_t)&odi, sizeof (ml_odunit_t)); 1076 logfp = (fileid_t *)NULL; 1077 lufs_free_logbuf(); 1078 ufs_is_lufs = 0; 1079 } 1080 } 1081