1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/vnode.h> 30 #include <sys/fs/ufs_fsdir.h> 31 #include <sys/fs/ufs_fs.h> 32 #include <sys/fs/ufs_inode.h> 33 #include <sys/fs/ufs_log.h> 34 #include <sys/sysmacros.h> 35 #include <sys/promif.h> 36 #include <sys/machparam.h> 37 38 #include <sys/stat.h> 39 #include <sys/bootdebug.h> 40 #include <sys/salib.h> 41 #include <sys/saio.h> 42 #include <sys/filep.h> 43 44 45 /* 46 * Big theory statement on how ufsboot makes use of the log 47 * in case the filesystem wasn't shut down cleanly. 48 * 49 * The structure of the ufs on-disk log looks like this: 50 * 51 * +-----------------+ 52 * | SUPERBLOCK | 53 * | ... | 54 * | fs_logbno +--> +-----------------------+ 55 * | ... | | EXTENT BLOCK | 56 * +-----------------+ | ... | 57 * | nextents | 58 * +----------------------+ extents[0].pbno | 59 * | | { extents[1].pbno } +------------+ 60 * | | ... +--> ... | 61 * | +-----------------------+ | 62 * v | 63 * +-----------------------------+ \ | 64 * | ON-DISK LOG HEADER | | | 65 * | ... | | | 66 * | od_head_lof +--+ | | 67 * | ... | | | | 68 * +-----------------------------+ <|---|- od_bol_lof | 69 * | sector (may contain deltas) | | | (logical offset) | 70 * | +-------------------------+ | | | 71 * | | trailer (some ident#) | | > extents[0].nbno | 72 * +---+-------------------------+ | | blocks ("sectors") | 73 * . . | | | 74 * . . | | | 75 * +-----------------------------+<-+ | | 76 * | delta1 delta2 delta3 | | | 77 * | d +-------------------------+ | | 78 * | e | ident#: od_head_ident | | | 79 * +---+-------------------------+ / | 80 * | 81 * +-----------------------------+ <---------------------------+ 82 * | lta4 delta5 delta6 de | 83 * | l +-------------------------+ 84 * | t | ident#: od_head_ident+1 | 85 * +---+-------------------------+ 86 * . . 87 * +-----------------------------+ 88 * | sector (may contain deltas) | 89 * | +------------------+ 90 * | | trailer (ident#) | 91 * +----------+------------------+ <-- od_eol_lof (logical offset) 92 * 93 * The ufs on-disk log has the following properties: 94 * 95 * 1. The log is made up from at least one extent. "fs_logbno" in 96 * the superblock points to where this is found. 97 * 2. Extents describe the logical layout. 98 * - Logical offset 0 is the on-disk log header. It's also 99 * at the beginning of the first physical block. 100 * - If there's more than one extent, the equation holds: 101 * extent[i+1].lbno == extent[i].lbno + extent[i].nbno 102 * i.e. logical offsets form a contiguous sequence. Yet on disk, 103 * two logically-adjacent offsets may be located in two 104 * physically disjoint extents, so logical offsets need to be 105 * translated into physical disk block addresses for access. 106 * - Various fields in the on-disk log header structure refer 107 * to such logical log offsets. 108 * 3. The actual logical logspace begins after the log header, at 109 * the logical offset indicated by "od_bol_lof". Every 512 Bytes 110 * (a "sector" in terms of ufs logging) is a sector trailer which 111 * contains a sequence number, the sector ident. 112 * 4. Deltas are packed tight in the remaining space, i.e. a delta 113 * may be part of more than one sector. Reads from the logspace 114 * must be split at sector boundaries, since the trailer is never 115 * part of a delta. Delta sizes vary. 116 * 5. The field "od_head_lof" points to the start of the dirty part 117 * of the log, i.e. to the first delta header. Likewise, "od_head_ident" 118 * is the sequence number where the valid part of the log starts; if 119 * the sector pointed to by "od_head_lof" has a sector ident different 120 * from "od_head_ident", the log is empty. 121 * 6. The valid part of the log extends for as many sectors as their ident 122 * numbers form a contiguous sequence. When reaching the logical end of 123 * the log, "od_bol_lof", logical offsets wrap around to "od_bol_lof", 124 * i.e. the log forms a circular buffer. 125 * 126 * For the strategy how to handle accessing the log, item 4. is the 127 * most important one - its consequence is that the log can only be 128 * read in one direction - forward, starting at the head. 129 * 130 * The task of identifying whether a given metadata block is 131 * actually in the log therefore requires reading the entire 132 * log. Doing so is memory-efficient but kills speed if re-done 133 * at every metadata read (64MB log size vs. 512 byte metadata 134 * block size: 128 times as much I/O, possibly only to find out 135 * that this block was not in the log ...). 136 * 137 * First thought to speed this up is to let ufsboot roll the log. 138 * But this is not possible because: 139 * - ufsboot currently does not implement any write functionality, 140 * the boot-time ufs implementation is read-only. 141 * - firmware write interfaces may or may not be available, in any 142 * case, they're rarely used and untested for such a purpose. 143 * - that would duplicate a lot of code, since at the moment only 144 * kernel ufs logging implements log rolling. 145 * - the boot environment cannot be considered high-performance; 146 * rolling the log there would be slow. 147 * - boot device and root device could well be different, creating 148 * inconsistencies e.g. with a mirrored root if the log is rolled. 149 * 150 * Therefore, caching the log structural information (boot-relevant 151 * deltas and their logical log offset) is required for fast access 152 * to the data in the log. This code builds a logmap for that purpose. 153 * 154 * As a simple optimization, if we find the log is empty, we will not 155 * use it - log reader support for ufsboot has no noticeable overhead 156 * for clean logs, or for root filesystems that aren't logging. 157 */ 158 159 #define LB_HASHSHIFT 13 160 #define LB_HASHSIZE (1 << LB_HASHSHIFT) 161 #define LB_HASHFUNC(mof) (((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1)) 162 163 #define LOGBUF_MAXSIZE (8*1024*1024) 164 #define LOGBUF_MINSIZE (256*1024) 165 166 #define LOG_IS_EMPTY 0 167 #define LOG_IS_OK 1 168 #define LOG_IS_ERRORED 2 169 170 /* 171 * We build a hashed logmap of those while scanning the log. 172 * sizeof(lb_map_t) is 40 on 64bit, 32 on 32bit; the max sized 173 * resalloc'ed buffer can accomodate around ~500k of those; 174 * this is approximately the maximum amount of deltas we'll 175 * see if a 64MB ufs log is completely filled. We'll make no 176 * attempt to free and reallocate the resalloc'ed buffer if 177 * we overflow, as conservative sizing should make that an 178 * impossibility. A future enhancement may allocate memory 179 * here as needed - once the boot time memory allocator 180 * supports that. 181 */ 182 typedef struct lb_mapentry { 183 struct lb_mapentry *l_next; /* hash chaining */ 184 struct lb_mapentry *l_prev; /* hash chaining */ 185 int64_t l_mof; /* disk addr this delta is against */ 186 int16_t l_nb; /* size of delta */ 187 int16_t l_flags; 188 int32_t l_lof; /* log offset for delta header */ 189 int32_t l_tid; /* transaction this delta is part of */ 190 delta_t l_typ; /* see <sys/fs/ufs_trans.h> */ 191 } lb_me_t; 192 193 #define LB_ISCANCELLED 1 194 195 #define inslist(lh, l) if ((*(lh))) { \ 196 (*(lh))->l_prev->l_next = (l); \ 197 (l)->l_next = (*(lh)); \ 198 (l)->l_prev = (*(lh))->l_prev; \ 199 (*(lh))->l_prev = (l); \ 200 } else { \ 201 (l)->l_next = (l); \ 202 (l)->l_prev = (l); \ 203 (*(lh)) = l; \ 204 } 205 206 #define remlist(lh, l) \ 207 if ((l)->l_next == (l)) { \ 208 if (*(lh) != (l) || (l)->l_prev != (l)) \ 209 dprintf("Logmap hash inconsistency.\n"); \ 210 *(lh) = (lb_me_t *)NULL; \ 211 } else { \ 212 if (*(lh) == (l)) \ 213 *(lh) = (l)->l_next; \ 214 (l)->l_prev->l_next = (l)->l_next; \ 215 (l)->l_next->l_prev = (l)->l_prev; \ 216 } 217 218 #define lufs_alloc_me() \ 219 (lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)) 220 221 extern int boothowto; 222 static int ufs_is_lufs = 0; 223 static fileid_t *logfp = (fileid_t *)NULL; 224 static extent_block_t *eb = (extent_block_t *)NULL; 225 static ml_odunit_t odi; 226 227 static char logbuffer_min[LOGBUF_MINSIZE]; 228 static caddr_t logbuffer = (caddr_t)NULL; 229 static caddr_t elogbuffer = (caddr_t)NULL; 230 static caddr_t logbuf_curptr; 231 static lb_me_t **loghash = (lb_me_t **)NULL; 232 static lb_me_t *lfreelist; 233 234 static uint32_t curtid; 235 236 237 int lufs_support = 1; 238 239 void lufs_boot_init(fileid_t *); 240 void lufs_closeall(void); 241 void lufs_merge_deltas(fileid_t *); 242 243 static int lufs_logscan(void); 244 245 extern int diskread(fileid_t *filep); 246 extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int); 247 248 #if defined(__sparcv9) 249 #define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE)) 250 #endif 251 252 static int 253 lufs_alloc_logbuf(void) 254 { 255 /* 256 * Allocate memory for caching the log. Since the logbuffer can 257 * potentially exceed the boot scratch memory limit, we use resalloc 258 * directly, passing the allocation to the low-level boot-time 259 * backend allocator. The chosen VA range is the top end of 260 * the kernel's segmap segment, so we're not interfering 261 * with the kernel because segmap is created at a time when 262 * the 2nd-stage boot has already been unloaded and this VA 263 * range was given back. 264 * 265 * On sparc platforms, the kernel cannot recover the memory 266 * obtained from resalloc because the page structs are allocated 267 * before the call to BOP_QUIESCE. To avoid leaking this 268 * memory, the logbuffer is allocated from a small bss array 269 * that should hold the logmap except in the most extreme cases. 270 * If the bss array is too small, the logbuffer is extended 271 * from resalloc 1 page at a time. 272 */ 273 274 logbuffer = logbuffer_min; 275 elogbuffer = logbuffer+LOGBUF_MINSIZE; 276 logbuf_curptr = logbuffer; 277 lfreelist = (lb_me_t *)NULL; 278 279 if (logbuffer == (caddr_t)NULL) 280 return (0); 281 282 dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n", 283 logbuffer, elogbuffer-logbuffer); 284 285 return (1); 286 } 287 288 static void 289 lufs_free_logbuf() 290 { 291 /* 292 * Solaris/x86 has no prom_free() routine at this time. 293 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86 294 * is done by the kernel startup itself, in hat_unload_prom() 295 * after the bootloader has been quiesced. 296 * 297 * Solaris on sparc has a prom_free() routine that will update 298 * the memlist properties to reflect the freeing of the 299 * logbuffer. However, the sparc kernel cannot recover 300 * the memory freed after the call to BOP_QUIESCE as the 301 * page struct have already been allocated. We call 302 * prom_free anyway so that the kernel can reclaim this 303 * memory in the future. 304 */ 305 if (logbuffer == LOGBUF_BASEADDR) 306 prom_free(logbuffer, elogbuffer-logbuffer); 307 logbuffer = (caddr_t)NULL; 308 } 309 310 static caddr_t 311 lufs_alloc_from_logbuf(size_t sz) 312 { 313 caddr_t tmpaddr; 314 lb_me_t *l; 315 316 /* 317 * Satisfy lb_me_t allocations from the freelist 318 * first if possible. 319 */ 320 if ((sz == sizeof (lb_me_t)) && lfreelist) { 321 l = lfreelist; 322 lfreelist = lfreelist->l_next; 323 return ((caddr_t)l); 324 } 325 if (elogbuffer < logbuf_curptr + sz) { 326 caddr_t np; 327 size_t nsz; 328 329 /* 330 * Out of space in current chunk - try to add another. 331 */ 332 if (logbuffer == logbuffer_min) { 333 np = LOGBUF_BASEADDR; 334 } else { 335 np = elogbuffer; 336 } 337 nsz = roundup(sz, PAGESIZE); 338 if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) { 339 return ((caddr_t)NULL); 340 } 341 342 np = resalloc(RES_CHILDVIRT, nsz, np, 0UL); 343 if (np == (caddr_t)NULL) { 344 return ((caddr_t)NULL); 345 } 346 if (logbuffer == logbuffer_min) 347 logbuffer = LOGBUF_BASEADDR; 348 logbuf_curptr = np; 349 elogbuffer = logbuf_curptr + nsz; 350 } 351 352 tmpaddr = logbuf_curptr; 353 logbuf_curptr += sz; 354 bzero(tmpaddr, sz); 355 return (tmpaddr); 356 } 357 358 static int32_t 359 lufs_read_log(int32_t addr, caddr_t va, int nb) 360 { 361 int i, fastpath = 0; 362 daddr_t pblk, lblk; 363 sect_trailer_t *st; 364 uint32_t ident; 365 366 /* 367 * Fast path for skipping the read if no target buffer 368 * is specified. Don't do this for the initial scan. 369 */ 370 if (ufs_is_lufs && (va == (caddr_t)NULL)) 371 fastpath = 1; 372 373 while (nb) { 374 /* log wraparound check */ 375 if (addr == odi.od_eol_lof) 376 addr = odi.od_bol_lof; 377 if (fastpath) 378 goto read_done; 379 380 /* 381 * Translate logically-contiguous log offsets into physical 382 * block numbers. For a log consisting of a single extent: 383 * pbno = btodb(addr) - extents[0].lbno; 384 * Otherwise, search for the extent which contains addr. 385 */ 386 pblk = 0; 387 lblk = btodb(addr); 388 for (i = 0; i < eb->nextents; i++) { 389 if (lblk >= eb->extents[i].lbno && 390 lblk < eb->extents[i].lbno + 391 eb->extents[i].nbno) { 392 pblk = lblk - eb->extents[i].lbno + 393 eb->extents[i].pbno; 394 break; 395 } 396 } 397 398 if (pblk == 0) { 399 /* 400 * block #0 can never be in a log extent since this 401 * block always contains the primary superblock copy. 402 */ 403 dprintf("No log extent found for log offset 0x%llx.\n", 404 addr); 405 return (0); 406 } 407 408 /* 409 * Check whether the block we want is cached from the last 410 * read. If not, read it in now. 411 */ 412 if (logfp->fi_blocknum != pblk) { 413 logfp->fi_blocknum = pblk; 414 logfp->fi_memp = logfp->fi_buf; 415 logfp->fi_count = DEV_BSIZE; 416 logfp->fi_offset = 0; 417 if (diskread(logfp)) { 418 dprintf("I/O error reading the ufs log" \ 419 " at block 0x%x.\n", 420 logfp->fi_blocknum); 421 return (0); 422 } 423 /* 424 * Log structure verification. The block which we just 425 * read has an ident number that must match its offset 426 * in blocks from the head of the log. Since the log 427 * can wrap around, we have to check for that to get the 428 * ident right. Out-of-sequence idents can happen after 429 * power failures, panics during a partial transaction, 430 * media errors, ... - in any case, they mark the end of 431 * the valid part of the log. 432 */ 433 st = (sect_trailer_t *)(logfp->fi_memp + 434 LDL_USABLE_BSIZE); 435 /* od_head_ident is where the sequence starts */ 436 ident = odi.od_head_ident; 437 if (lblk >= lbtodb(odi.od_head_lof)) { 438 /* no wraparound */ 439 ident += (lblk - lbtodb(odi.od_head_lof)); 440 } else { 441 /* log wrapped around the end */ 442 ident += (lbtodb(odi.od_eol_lof) - 443 lbtodb(odi.od_head_lof)); 444 ident += (lblk - lbtodb(odi.od_bol_lof)); 445 } 446 447 if (ident != st->st_ident) 448 return (0); 449 } 450 read_done: 451 /* 452 * Copy the delta contents to the destination buffer if 453 * one was specified. Otherwise, just skip the contents. 454 */ 455 i = MIN(NB_LEFT_IN_SECTOR(addr), nb); 456 if (va != NULL) { 457 bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))), 458 va, i); 459 va += i; 460 } 461 nb -= i; 462 addr += i; 463 /* 464 * Skip sector trailer if necessary. 465 */ 466 if (NB_LEFT_IN_SECTOR(addr) == 0) 467 addr += sizeof (sect_trailer_t); 468 } 469 return (addr); 470 } 471 472 void 473 lufs_boot_init(fileid_t *filep) 474 { 475 struct fs *sb = (struct fs *)filep->fi_memp; 476 int err = 0; 477 478 /* 479 * boot_ufs_mountroot() should have called us with a 480 * filep pointing to the superblock. Verify that this 481 * is so first. 482 * Then check whether this filesystem has a dirty log. 483 * Also return if lufs support was disabled on request. 484 */ 485 if (!lufs_support || 486 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs || 487 sb->fs_clean != FSLOG || sb->fs_logbno == NULL) { 488 return; 489 } 490 491 if (boothowto & RB_VERBOSE) 492 printf("The boot filesystem is logging.\n"); 493 494 /* 495 * The filesystem is logging, there is a log area 496 * allocated for it. Check the log state and determine 497 * whether it'll be possible to use this log. 498 */ 499 500 /* 501 * Allocate a private fileid_t for use when reading 502 * from the log. 503 */ 504 eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize); 505 logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t)); 506 logfp->fi_memp = logfp->fi_buf; 507 logfp->fi_devp = filep->fi_devp; 508 509 /* 510 * Read the extent block and verify that what we 511 * find there are actually lufs extents. 512 * Make it simple: the extent block including all 513 * extents cannot be larger than a filesystem block. 514 * So read a whole filesystem block, to make sure 515 * we have read all extents in the same operation. 516 */ 517 logfp->fi_blocknum = sb->fs_logbno; 518 logfp->fi_count = sb->fs_bsize; 519 logfp->fi_memp = (caddr_t)eb; 520 logfp->fi_offset = 0; 521 if (diskread(logfp) || eb->type != LUFS_EXTENTS) { 522 dprintf("Failed to read log extent block.\n"); 523 err = LOG_IS_ERRORED; 524 goto out; 525 } 526 527 /* 528 * Read the on disk log header. If that fails, 529 * try the backup copy on the adjacent block. 530 */ 531 logfp->fi_blocknum = eb->extents[0].pbno; 532 logfp->fi_count = sizeof (ml_odunit_t); 533 logfp->fi_memp = (caddr_t)&odi; 534 logfp->fi_offset = 0; 535 if (diskread(logfp)) { 536 logfp->fi_blocknum = eb->extents[0].pbno + 1; 537 logfp->fi_count = sizeof (ml_odunit_t); 538 logfp->fi_memp = (caddr_t)&odi; 539 logfp->fi_offset = 0; 540 if (diskread(logfp)) { 541 dprintf("Failed to read on-disk log header.\n"); 542 err = LOG_IS_ERRORED; 543 goto out; 544 } 545 } 546 547 /* 548 * Verify that we understand this log, and 549 * that the log isn't bad or empty. 550 */ 551 if (odi.od_version != LUFS_VERSION_LATEST) { 552 dprintf("On-disk log format v%d != supported format v%d.\n", 553 odi.od_version, LUFS_VERSION_LATEST); 554 err = LOG_IS_ERRORED; 555 } else if (odi.od_badlog) { 556 dprintf("On-disk log is marked bad.\n"); 557 err = LOG_IS_ERRORED; 558 } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) { 559 dprintf("On-disk log checksum %d != ident sum %d.\n", 560 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident); 561 err = LOG_IS_ERRORED; 562 } else { 563 /* 564 * All consistency checks ok. Scan the log, build the 565 * log hash. If this succeeds we'll be using the log 566 * when reading from this filesystem. 567 */ 568 err = lufs_logscan(); 569 } 570 out: 571 ufs_is_lufs = 1; 572 switch (err) { 573 case LOG_IS_EMPTY: 574 if (boothowto & RB_VERBOSE) 575 printf("The ufs log is empty and will not be used.\n"); 576 lufs_closeall(); 577 break; 578 case LOG_IS_OK: 579 if (boothowto & RB_VERBOSE) 580 printf("Using the ufs log.\n"); 581 break; 582 case LOG_IS_ERRORED: 583 if (boothowto & RB_VERBOSE) 584 printf("Couldn't build log hash. Can't use ufs log.\n"); 585 lufs_closeall(); 586 break; 587 default: 588 dprintf("Invalid error %d while scanning the ufs log.\n", err); 589 break; 590 } 591 } 592 593 static int 594 lufs_logscan_read(int32_t *addr, struct delta *d) 595 { 596 *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta)); 597 598 if (*addr == 0 || 599 d->d_typ < DT_NONE || d->d_typ > DT_MAX || 600 d->d_nb >= odi.od_logsize) 601 return (0); 602 603 return (1); 604 } 605 606 static int 607 lufs_logscan_skip(int32_t *addr, struct delta *d) 608 { 609 switch (d->d_typ) { 610 case DT_COMMIT: 611 /* 612 * A DT_COMMIT delta has no size as such, but will 613 * always "fill up" the sector that contains it. 614 * The next delta header is found at the beginning 615 * of the next 512-Bytes sector, adjust "addr" to 616 * reflect that. 617 */ 618 *addr += ((*addr & (DEV_BSIZE - 1))) ? 619 NB_LEFT_IN_SECTOR(*addr) + 620 sizeof (sect_trailer_t) : 0; 621 return (1); 622 case DT_CANCEL: 623 case DT_ABZERO: 624 /* 625 * These types of deltas occupy no space in the log 626 */ 627 return (1); 628 default: 629 /* 630 * Skip over the delta contents. 631 */ 632 *addr = lufs_read_log(*addr, NULL, d->d_nb); 633 } 634 635 return (*addr != NULL); 636 } 637 638 static void 639 lufs_logscan_freecancel(void) 640 { 641 lb_me_t **lh, *l, *lnext; 642 int i; 643 644 /* 645 * Walk the entire log hash and put cancelled entries 646 * onto the freelist. Corner cases: 647 * a) empty hash chain (*lh == NULL) 648 * b) only one entry in chain, and that is cancelled. 649 * If for every cancelled delta another one would've 650 * been added, this situation couldn't occur, but a 651 * DT_CANCEL delta can lead to this as it is never 652 * added. 653 */ 654 for (i = 0; i < LB_HASHSIZE; i++) { 655 lh = &loghash[i]; 656 l = *lh; 657 do { 658 if (*lh == (lb_me_t *)NULL) 659 break; 660 lnext = l->l_next; 661 if (l->l_flags & LB_ISCANCELLED) { 662 remlist(lh, l); 663 bzero((caddr_t)l, sizeof (lb_me_t)); 664 l->l_next = lfreelist; 665 lfreelist = l; 666 /* 667 * Just removed the hash head. In order not 668 * to terminate the while loop, respin chain 669 * walk for this hash chain. 670 */ 671 if (lnext == *lh) { 672 i--; 673 break; 674 } 675 } 676 l = lnext; 677 } while (l != *lh); 678 } 679 } 680 681 static int 682 lufs_logscan_addmap(int32_t *addr, struct delta *d) 683 { 684 lb_me_t **lh, *l; 685 686 switch (d->d_typ) { 687 case DT_COMMIT: 688 /* 689 * Handling DT_COMMIT deltas is special. We need to: 690 * 1. increase the transaction ID 691 * 2. remove cancelled entries. 692 */ 693 lufs_logscan_freecancel(); 694 curtid++; 695 break; 696 case DT_INODE: 697 /* 698 * Deltas against parts of on-disk inodes are 699 * assumed to be timestamps. Ignore those. 700 */ 701 if (d->d_nb != sizeof (struct dinode)) 702 break; 703 /* FALLTHROUGH */ 704 case DT_CANCEL: 705 case DT_ABZERO: 706 case DT_AB: 707 case DT_DIR: 708 case DT_FBI: 709 /* 710 * These types of deltas contain and/or modify structural 711 * information that is needed for booting the system: 712 * - where to find a file (DT_DIR, DT_FBI) 713 * - the file itself (DT_INODE) 714 * - data blocks associated with a file (DT_AB, DT_ABZERO) 715 * 716 * Building the hash chains becomes complicated because there 717 * may exist an older (== previously added) entry that overlaps 718 * with the one we want to add. 719 * Four cases must be distinguished: 720 * 1. The new delta is an exact match for an existing one, 721 * or is a superset of an existing one, and both 722 * belong to the same transaction. 723 * The new delta completely supersedes the old one, so 724 * remove that and reuse the structure for the new. 725 * Then add the new delta to the head of the hashchain. 726 * 2. The new delta is an exact match for an existing one, 727 * or is a superset of an existing one, but the two 728 * belong to different transactions (i.e. the old one is 729 * committed). 730 * The existing one is marked to be cancelled when the 731 * next DT_COMMIT record is found, and the hash chain 732 * walk is continued as there may be more existing entries 733 * found which overlap the new delta (happens if that is 734 * a superset of those in the log). 735 * Once no more overlaps are found, goto 4. 736 * 3. An existing entry completely covers the new one. 737 * The new delta is then added directly before this 738 * existing one. 739 * 4. No (more) overlaps with existing entries are found. 740 * Unless this is a DT_CANCEL delta, whose only purpose 741 * is already handled by marking overlapping entries for 742 * cancellation, add the new delta at the hash chain head. 743 * 744 * This strategy makes sure that the hash chains are properly 745 * ordered. lufs_merge_deltas() walks the hash chain backward, 746 * which then ensures that delta merging is done in the same 747 * order as those deltas occur in the log - remember, the 748 * log can only be read in one direction. 749 * 750 */ 751 lh = &loghash[LB_HASHFUNC(d->d_mof)]; 752 l = *lh; 753 do { 754 if (l == (lb_me_t *)NULL) 755 break; 756 /* 757 * This covers the first two cases above. 758 * If this is a perfect match from the same transaction, 759 * and it isn't already cancelled, we simply replace it 760 * with its newer incarnation. 761 * Otherwise, mark it for cancellation. Handling of 762 * DT_COMMIT is going to remove it, then. 763 */ 764 if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) { 765 if (!(l->l_flags & LB_ISCANCELLED)) { 766 if (l->l_tid == curtid && 767 d->d_typ != DT_CANCEL) { 768 remlist(lh, l); 769 l->l_mof = d->d_mof; 770 l->l_lof = *addr; 771 l->l_nb = d->d_nb; 772 l->l_typ = d->d_typ; 773 l->l_flags = 0; 774 l->l_tid = curtid; 775 inslist(lh, l); 776 return (1); 777 } else { 778 /* 779 * 2nd case - cancel only. 780 */ 781 l->l_flags |= LB_ISCANCELLED; 782 } 783 } 784 } else if (WITHIN(d->d_mof, d->d_nb, 785 l->l_mof, l->l_nb)) { 786 /* 787 * This is the third case above. 788 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR 789 * this may happen - an existing previous delta 790 * is larger than the current one we're planning 791 * to add - DT_ABZERO deltas are supersets of 792 * DT_AB deltas, and likewise DT_FBI/DT_DIR. 793 * In order to do merging correctly, such deltas 794 * put up a barrier for new ones that overlap, 795 * and we have to add the new delta immediately 796 * before (!) the existing one. 797 */ 798 lb_me_t *newl; 799 newl = lufs_alloc_me(); 800 if (newl == (lb_me_t *)NULL) { 801 /* 802 * No memory. Throw away everything 803 * and try booting without logging 804 * support. 805 */ 806 curtid = 0; 807 return (0); 808 } 809 newl->l_mof = d->d_mof; 810 newl->l_lof = *addr; /* "payload" address */ 811 newl->l_nb = d->d_nb; 812 newl->l_typ = d->d_typ; 813 newl->l_tid = curtid; 814 newl->l_prev = l->l_prev; 815 newl->l_next = l; 816 l->l_prev->l_next = newl; 817 l->l_prev = newl; 818 if (*lh == l) 819 *lh = newl; 820 return (1); 821 } 822 l = l->l_next; 823 } while (l != *lh); 824 825 /* 826 * This is case 4., add a new delta at the head of the chain. 827 * 828 * If the new delta is a DT_CANCEL entry, we handled it by 829 * marking everything it covered for cancellation. We can 830 * get by without actually adding the delta itself to the 831 * hash, as it'd need to be removed by the commit code anyway. 832 */ 833 if (d->d_typ == DT_CANCEL) 834 break; 835 836 l = lufs_alloc_me(); 837 if (l == (lb_me_t *)NULL) { 838 /* 839 * No memory. Throw away everything 840 * and try booting without logging 841 * support. 842 */ 843 curtid = 0; 844 return (0); 845 } 846 l->l_mof = d->d_mof; 847 l->l_lof = *addr; /* this is the "payload" address */ 848 l->l_nb = d->d_nb; 849 l->l_typ = d->d_typ; 850 l->l_tid = curtid; 851 inslist(lh, l); 852 break; 853 default: 854 break; 855 } 856 return (1); 857 } 858 859 static int 860 lufs_logscan_prescan(void) 861 { 862 /* 863 * Simulate a full log by setting the tail to be one sector 864 * behind the head. This will make the logscan read all 865 * of the log until an out-of-sequence sector ident is 866 * found. 867 */ 868 odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE; 869 if (odi.od_tail_lof < odi.od_bol_lof) 870 odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE; 871 if (odi.od_tail_lof >= odi.od_eol_lof) 872 odi.od_tail_lof = odi.od_bol_lof; 873 874 /* 875 * While sector trailers maintain TID values, od_head_tid 876 * is not being updated by the kernel ufs logging support 877 * at this time. We therefore count transactions ourselves 878 * starting at zero - as does the kernel ufs logscan code. 879 */ 880 curtid = 0; 881 882 if (!lufs_alloc_logbuf()) { 883 dprintf("Failed to allocate log buffer.\n"); 884 return (0); 885 } 886 887 loghash = (lb_me_t **)lufs_alloc_from_logbuf( 888 LB_HASHSIZE * sizeof (lb_me_t *)); 889 if (loghash == (lb_me_t **)NULL) { 890 dprintf("Can't allocate loghash[] array."); 891 return (0); 892 } 893 return (1); 894 } 895 896 /* 897 * This function must remove all uncommitted entries (l->l_tid == curtid) 898 * from the log hash. Doing this, we implicitly delete pending cancellations 899 * as well. 900 * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only 901 * the check for entries that need to be removed is different. 902 */ 903 static void 904 lufs_logscan_postscan(void) 905 { 906 lb_me_t **lh, *l, *lnext; 907 int i; 908 909 for (i = 0; i < LB_HASHSIZE; i++) { 910 lh = &loghash[i]; 911 l = *lh; 912 do { 913 if (l == (lb_me_t *)NULL) 914 break; 915 lnext = l->l_next; 916 if (l->l_tid == curtid) { 917 remlist(lh, l); 918 bzero((caddr_t)l, sizeof (lb_me_t)); 919 l->l_next = lfreelist; 920 lfreelist = l; 921 if (*lh == (lb_me_t *)NULL) 922 break; 923 /* 924 * Just removed the hash head. In order not 925 * to terminate the while loop, respin chain 926 * walk for this hash chain. 927 */ 928 if (lnext == *lh) { 929 i--; 930 break; 931 } 932 } else { 933 l->l_flags &= ~(LB_ISCANCELLED); 934 } 935 l = lnext; 936 } while (l != *lh); 937 } 938 } 939 940 /* 941 * This function builds the log hash. It performs the same sequence 942 * of actions at logscan as the kernel ufs logging support: 943 * - Prepare the log for scanning by simulating a full log. 944 * - As long as sectors read from the log have contiguous idents, do: 945 * read the delta header 946 * add the delta to the logmap 947 * skip over the contents to the start of the next delta header 948 * - After terminating the scan, remove uncommitted entries. 949 * 950 * This function cannot fail except if mapping the logbuffer area 951 * during lufs_logscan_prescan() fails. If there is a structural 952 * integrity problem and the on-disk log cannot be read, we'll 953 * treat this as the same situation as an uncommitted transaction 954 * at the end of the log (or, corner case of that, an empty log 955 * with no committed transactions in it at all). 956 * 957 */ 958 static int 959 lufs_logscan(void) 960 { 961 int32_t addr; 962 struct delta d; 963 964 if (!lufs_logscan_prescan()) 965 return (LOG_IS_ERRORED); 966 967 addr = odi.od_head_lof; 968 969 /* 970 * Note that addr == od_tail_lof means a completely filled 971 * log. This almost never happens, so the common exit path 972 * from this loop is via one of the 'break's. 973 */ 974 while (addr != odi.od_tail_lof) { 975 if (!lufs_logscan_read(&addr, &d)) 976 break; 977 if (!lufs_logscan_addmap(&addr, &d)) 978 return (LOG_IS_ERRORED); 979 if (!lufs_logscan_skip(&addr, &d)) 980 break; 981 } 982 983 lufs_logscan_postscan(); 984 /* 985 * Check whether the log contains data, and if so whether 986 * it contains committed data. 987 */ 988 if (addr == odi.od_head_lof || curtid == 0) { 989 return (LOG_IS_EMPTY); 990 } 991 return (LOG_IS_OK); 992 } 993 994 /* 995 * A metadata block was read from disk. Check whether the logmap 996 * has a delta against this byte range, and if so read it in, since 997 * the data in the log is more recent than what was read from other 998 * places on the disk. 999 */ 1000 void 1001 lufs_merge_deltas(fileid_t *fp) 1002 { 1003 int nb; 1004 int64_t bof; 1005 lb_me_t **lh, *l; 1006 int32_t skip; 1007 1008 /* 1009 * No logmap: Empty log. Nothing to do here. 1010 */ 1011 if (!ufs_is_lufs || logbuffer == (caddr_t)NULL) 1012 return; 1013 1014 bof = ldbtob(fp->fi_blocknum); 1015 nb = fp->fi_count; 1016 1017 /* 1018 * Search the log hash. 1019 * Merge deltas if an overlap is found. 1020 */ 1021 1022 lh = &loghash[LB_HASHFUNC(bof)]; 1023 1024 if (*lh == (lb_me_t *)NULL) 1025 return; 1026 1027 l = *lh; 1028 1029 do { 1030 l = l->l_prev; 1031 if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) { 1032 /* 1033 * Found a delta in the log hash which overlaps 1034 * with the current metadata block. Read the 1035 * actual delta payload from the on-disk log 1036 * directly into the file buffer. 1037 */ 1038 if (l->l_typ != DT_ABZERO) { 1039 /* 1040 * We have to actually read this part of the 1041 * log as it could contain a sector trailer, or 1042 * wrap around the end of the log. 1043 * If it did, the second offset generation would 1044 * be incorrect if we'd started at l->l_lof. 1045 */ 1046 if (!(skip = lufs_read_log(l->l_lof, NULL, 1047 MAX(bof - l->l_mof, 0)))) 1048 dprintf("scan/merge error, pre-skip\n"); 1049 if (!(skip = lufs_read_log(skip, 1050 fp->fi_memp + MAX(l->l_mof - bof, 0), 1051 MIN(l->l_mof + l->l_nb, bof + nb) - 1052 MAX(l->l_mof, bof)))) 1053 dprintf("scan/merge error, merge\n"); 1054 } else { 1055 /* 1056 * DT_ABZERO requires no disk access, just 1057 * clear the byte range which overlaps with 1058 * the delta. 1059 */ 1060 bzero(fp->fi_memp + MAX(l->l_mof - bof, 0), 1061 MIN(l->l_mof + l->l_nb, bof + nb) - 1062 MAX(l->l_mof, bof)); 1063 } 1064 } 1065 } while (l->l_prev != (*lh)->l_prev); 1066 1067 printf("*\b"); 1068 } 1069 1070 void 1071 lufs_closeall(void) 1072 { 1073 if (ufs_is_lufs) { 1074 bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize); 1075 bkmem_free((char *)logfp, sizeof (fileid_t)); 1076 eb = (extent_block_t *)NULL; 1077 bzero((caddr_t)&odi, sizeof (ml_odunit_t)); 1078 logfp = (fileid_t *)NULL; 1079 lufs_free_logbuf(); 1080 ufs_is_lufs = 0; 1081 } 1082 } 1083