1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/param.h> 30 #include <sys/vnode.h> 31 #include <sys/fs/ufs_fsdir.h> 32 #include <sys/fs/ufs_fs.h> 33 #include <sys/fs/ufs_inode.h> 34 #include <sys/fs/ufs_log.h> 35 #include <sys/sysmacros.h> 36 #include <sys/promif.h> 37 #include <sys/machparam.h> 38 39 #include <sys/stat.h> 40 #include <sys/bootdebug.h> 41 #include <sys/salib.h> 42 #include <sys/saio.h> 43 #include <sys/filep.h> 44 45 46 /* 47 * Big theory statement on how ufsboot makes use of the log 48 * in case the filesystem wasn't shut down cleanly. 49 * 50 * The structure of the ufs on-disk log looks like this: 51 * 52 * +-----------------+ 53 * | SUPERBLOCK | 54 * | ... | 55 * | fs_logbno +--> +-----------------------+ 56 * | ... | | EXTENT BLOCK | 57 * +-----------------+ | ... | 58 * | nextents | 59 * +----------------------+ extents[0].pbno | 60 * | | { extents[1].pbno } +------------+ 61 * | | ... +--> ... | 62 * | +-----------------------+ | 63 * v | 64 * +-----------------------------+ \ | 65 * | ON-DISK LOG HEADER | | | 66 * | ... | | | 67 * | od_head_lof +--+ | | 68 * | ... | | | | 69 * +-----------------------------+ <|---|- od_bol_lof | 70 * | sector (may contain deltas) | | | (logical offset) | 71 * | +-------------------------+ | | | 72 * | | trailer (some ident#) | | > extents[0].nbno | 73 * +---+-------------------------+ | | blocks ("sectors") | 74 * . . | | | 75 * . . | | | 76 * +-----------------------------+<-+ | | 77 * | delta1 delta2 delta3 | | | 78 * | d +-------------------------+ | | 79 * | e | ident#: od_head_ident | | | 80 * +---+-------------------------+ / | 81 * | 82 * +-----------------------------+ <---------------------------+ 83 * | lta4 delta5 delta6 de | 84 * | l +-------------------------+ 85 * | t | ident#: od_head_ident+1 | 86 * +---+-------------------------+ 87 * . . 88 * +-----------------------------+ 89 * | sector (may contain deltas) | 90 * | +------------------+ 91 * | | trailer (ident#) | 92 * +----------+------------------+ <-- od_eol_lof (logical offset) 93 * 94 * The ufs on-disk log has the following properties: 95 * 96 * 1. The log is made up from at least one extent. "fs_logbno" in 97 * the superblock points to where this is found. 98 * 2. Extents describe the logical layout. 99 * - Logical offset 0 is the on-disk log header. It's also 100 * at the beginning of the first physical block. 101 * - If there's more than one extent, the equation holds: 102 * extent[i+1].lbno == extent[i].lbno + extent[i].nbno 103 * i.e. logical offsets form a contiguous sequence. Yet on disk, 104 * two logically-adjacent offsets may be located in two 105 * physically disjoint extents, so logical offsets need to be 106 * translated into physical disk block addresses for access. 107 * - Various fields in the on-disk log header structure refer 108 * to such logical log offsets. 109 * 3. The actual logical logspace begins after the log header, at 110 * the logical offset indicated by "od_bol_lof". Every 512 Bytes 111 * (a "sector" in terms of ufs logging) is a sector trailer which 112 * contains a sequence number, the sector ident. 113 * 4. Deltas are packed tight in the remaining space, i.e. a delta 114 * may be part of more than one sector. Reads from the logspace 115 * must be split at sector boundaries, since the trailer is never 116 * part of a delta. Delta sizes vary. 117 * 5. The field "od_head_lof" points to the start of the dirty part 118 * of the log, i.e. to the first delta header. Likewise, "od_head_ident" 119 * is the sequence number where the valid part of the log starts; if 120 * the sector pointed to by "od_head_lof" has a sector ident different 121 * from "od_head_ident", the log is empty. 122 * 6. The valid part of the log extends for as many sectors as their ident 123 * numbers form a contiguous sequence. When reaching the logical end of 124 * the log, "od_bol_lof", logical offsets wrap around to "od_bol_lof", 125 * i.e. the log forms a circular buffer. 126 * 127 * For the strategy how to handle accessing the log, item 4. is the 128 * most important one - its consequence is that the log can only be 129 * read in one direction - forward, starting at the head. 130 * 131 * The task of identifying whether a given metadata block is 132 * actually in the log therefore requires reading the entire 133 * log. Doing so is memory-efficient but kills speed if re-done 134 * at every metadata read (64MB log size vs. 512 byte metadata 135 * block size: 128 times as much I/O, possibly only to find out 136 * that this block was not in the log ...). 137 * 138 * First thought to speed this up is to let ufsboot roll the log. 139 * But this is not possible because: 140 * - ufsboot currently does not implement any write functionality, 141 * the boot-time ufs implementation is read-only. 142 * - firmware write interfaces may or may not be available, in any 143 * case, they're rarely used and untested for such a purpose. 144 * - that would duplicate a lot of code, since at the moment only 145 * kernel ufs logging implements log rolling. 146 * - the boot environment cannot be considered high-performance; 147 * rolling the log there would be slow. 148 * - boot device and root device could well be different, creating 149 * inconsistencies e.g. with a mirrored root if the log is rolled. 150 * 151 * Therefore, caching the log structural information (boot-relevant 152 * deltas and their logical log offset) is required for fast access 153 * to the data in the log. This code builds a logmap for that purpose. 154 * 155 * As a simple optimization, if we find the log is empty, we will not 156 * use it - log reader support for ufsboot has no noticeable overhead 157 * for clean logs, or for root filesystems that aren't logging. 158 */ 159 160 #define LB_HASHSHIFT 13 161 #define LB_HASHSIZE (1 << LB_HASHSHIFT) 162 #define LB_HASHFUNC(mof) (((mof) >> LB_HASHSHIFT) & (LB_HASHSIZE - 1)) 163 164 #define LOGBUF_MAXSIZE (8*1024*1024) 165 #define LOGBUF_MINSIZE (256*1024) 166 167 #define LOG_IS_EMPTY 0 168 #define LOG_IS_OK 1 169 #define LOG_IS_ERRORED 2 170 171 /* 172 * We build a hashed logmap of those while scanning the log. 173 * sizeof(lb_map_t) is 40 on 64bit, 32 on 32bit; the max sized 174 * resalloc'ed buffer can accomodate around ~500k of those; 175 * this is approximately the maximum amount of deltas we'll 176 * see if a 64MB ufs log is completely filled. We'll make no 177 * attempt to free and reallocate the resalloc'ed buffer if 178 * we overflow, as conservative sizing should make that an 179 * impossibility. A future enhancement may allocate memory 180 * here as needed - once the boot time memory allocator 181 * supports that. 182 */ 183 typedef struct lb_mapentry { 184 struct lb_mapentry *l_next; /* hash chaining */ 185 struct lb_mapentry *l_prev; /* hash chaining */ 186 int64_t l_mof; /* disk addr this delta is against */ 187 int16_t l_nb; /* size of delta */ 188 int16_t l_flags; 189 int32_t l_lof; /* log offset for delta header */ 190 int32_t l_tid; /* transaction this delta is part of */ 191 delta_t l_typ; /* see <sys/fs/ufs_trans.h> */ 192 } lb_me_t; 193 194 #define LB_ISCANCELLED 1 195 196 #define inslist(lh, l) if ((*(lh))) { \ 197 (*(lh))->l_prev->l_next = (l); \ 198 (l)->l_next = (*(lh)); \ 199 (l)->l_prev = (*(lh))->l_prev; \ 200 (*(lh))->l_prev = (l); \ 201 } else { \ 202 (l)->l_next = (l); \ 203 (l)->l_prev = (l); \ 204 (*(lh)) = l; \ 205 } 206 207 #define remlist(lh, l) \ 208 if ((l)->l_next == (l)) { \ 209 if (*(lh) != (l) || (l)->l_prev != (l)) \ 210 dprintf("Logmap hash inconsistency.\n"); \ 211 *(lh) = (lb_me_t *)NULL; \ 212 } else { \ 213 if (*(lh) == (l)) \ 214 *(lh) = (l)->l_next; \ 215 (l)->l_prev->l_next = (l)->l_next; \ 216 (l)->l_next->l_prev = (l)->l_prev; \ 217 } 218 219 #define lufs_alloc_me() \ 220 (lb_me_t *)lufs_alloc_from_logbuf(sizeof (lb_me_t)) 221 222 extern int boothowto; 223 static int ufs_is_lufs = 0; 224 static fileid_t *logfp = (fileid_t *)NULL; 225 static extent_block_t *eb = (extent_block_t *)NULL; 226 static ml_odunit_t odi; 227 228 #ifndef i386 229 static char logbuffer_min[LOGBUF_MINSIZE]; 230 #endif 231 static caddr_t logbuffer = (caddr_t)NULL; 232 static caddr_t elogbuffer = (caddr_t)NULL; 233 static caddr_t logbuf_curptr; 234 static lb_me_t **loghash = (lb_me_t **)NULL; 235 static lb_me_t *lfreelist; 236 237 static uint32_t curtid; 238 239 240 int lufs_support = 1; 241 242 void lufs_boot_init(fileid_t *); 243 void lufs_closeall(void); 244 void lufs_merge_deltas(fileid_t *); 245 246 static int lufs_logscan(void); 247 248 extern int diskread(fileid_t *filep); 249 extern caddr_t resalloc(enum RESOURCES, size_t, caddr_t, int); 250 251 #if defined(i386) 252 #define LOGBUF_BASEADDR ((caddr_t)(KERNEL_TEXT - LOGBUF_MAXSIZE)) 253 #elif defined(__sparcv9) 254 #define LOGBUF_BASEADDR ((caddr_t)(SYSBASE - LOGBUF_MAXSIZE)) 255 #endif 256 257 static int 258 lufs_alloc_logbuf(void) 259 { 260 /* 261 * Allocate memory for caching the log. Since the logbuffer can 262 * potentially exceed the boot scratch memory limit, we use resalloc 263 * directly, passing the allocation to the low-level boot-time 264 * backend allocator. The chosen VA range is the top end of 265 * the kernel's segmap segment, so we're not interfering 266 * with the kernel because segmap is created at a time when 267 * the 2nd-stage boot has already been unloaded and this VA 268 * range was given back. 269 * 270 * On sparc platforms, the kernel cannot recover the memory 271 * obtained from resalloc because the page structs are allocated 272 * before the call to BOP_QUIESCE. To avoid leaking this 273 * memory, the logbuffer is allocated from a small bss array 274 * that should hold the logmap except in the most extreme cases. 275 * If the bss array is too small, the logbuffer is extended 276 * from resalloc 1 page at a time. 277 */ 278 279 #ifdef i386 280 logbuffer = resalloc(RES_CHILDVIRT, LOGBUF_MAXSIZE, 281 LOGBUF_BASEADDR, 0UL); 282 elogbuffer = logbuffer+LOGBUF_MAXSIZE; 283 #else 284 logbuffer = logbuffer_min; 285 elogbuffer = logbuffer+LOGBUF_MINSIZE; 286 #endif 287 logbuf_curptr = logbuffer; 288 lfreelist = (lb_me_t *)NULL; 289 290 if (logbuffer == (caddr_t)NULL) 291 return (0); 292 293 dprintf("Buffer for boot loader logging support: 0x%p, size 0x%x\n", 294 logbuffer, elogbuffer-logbuffer); 295 296 return (1); 297 } 298 299 static void 300 lufs_free_logbuf() 301 { 302 /* 303 * Solaris/x86 has no prom_free() routine at this time. 304 * Reclaiming the VA range below KERNEL_TEXT on Solaris/x86 305 * is done by the kernel startup itself, in hat_unload_prom() 306 * after the bootloader has been quiesced. 307 * 308 * Solaris on sparc has a prom_free() routine that will update 309 * the memlist properties to reflect the freeing of the 310 * logbuffer. However, the sparc kernel cannot recover 311 * the memory freed after the call to BOP_QUIESCE as the 312 * page struct have already been allocated. We call 313 * prom_free anyway so that the kernel can reclaim this 314 * memory in the future. 315 */ 316 #ifndef i386 317 if (logbuffer == LOGBUF_BASEADDR) 318 prom_free(logbuffer, elogbuffer-logbuffer); 319 #endif 320 logbuffer = (caddr_t)NULL; 321 } 322 323 static caddr_t 324 lufs_alloc_from_logbuf(size_t sz) 325 { 326 caddr_t tmpaddr; 327 lb_me_t *l; 328 329 /* 330 * Satisfy lb_me_t allocations from the freelist 331 * first if possible. 332 */ 333 if ((sz == sizeof (lb_me_t)) && lfreelist) { 334 l = lfreelist; 335 lfreelist = lfreelist->l_next; 336 return ((caddr_t)l); 337 } 338 if (elogbuffer < logbuf_curptr + sz) { 339 #ifdef i386 340 return ((caddr_t)NULL); 341 #else 342 caddr_t np; 343 size_t nsz; 344 345 /* 346 * Out of space in current chunk - try to add another. 347 */ 348 if (logbuffer == logbuffer_min) { 349 np = LOGBUF_BASEADDR; 350 } else { 351 np = elogbuffer; 352 } 353 nsz = roundup(sz, PAGESIZE); 354 if (np + nsz > LOGBUF_BASEADDR + LOGBUF_MAXSIZE) { 355 return ((caddr_t)NULL); 356 } 357 358 np = resalloc(RES_CHILDVIRT, nsz, np, 0UL); 359 if (np == (caddr_t)NULL) { 360 return ((caddr_t)NULL); 361 } 362 if (logbuffer == logbuffer_min) 363 logbuffer = LOGBUF_BASEADDR; 364 logbuf_curptr = np; 365 elogbuffer = logbuf_curptr + nsz; 366 #endif 367 } 368 369 tmpaddr = logbuf_curptr; 370 logbuf_curptr += sz; 371 bzero(tmpaddr, sz); 372 return (tmpaddr); 373 } 374 375 static int32_t 376 lufs_read_log(int32_t addr, caddr_t va, int nb) 377 { 378 int i, fastpath = 0; 379 daddr_t pblk, lblk; 380 sect_trailer_t *st; 381 uint32_t ident; 382 383 /* 384 * Fast path for skipping the read if no target buffer 385 * is specified. Don't do this for the initial scan. 386 */ 387 if (ufs_is_lufs && (va == (caddr_t)NULL)) 388 fastpath = 1; 389 390 while (nb) { 391 /* log wraparound check */ 392 if (addr == odi.od_eol_lof) 393 addr = odi.od_bol_lof; 394 if (fastpath) 395 goto read_done; 396 397 /* 398 * Translate logically-contiguous log offsets into physical 399 * block numbers. For a log consisting of a single extent: 400 * pbno = btodb(addr) - extents[0].lbno; 401 * Otherwise, search for the extent which contains addr. 402 */ 403 pblk = 0; 404 lblk = btodb(addr); 405 for (i = 0; i < eb->nextents; i++) { 406 if (lblk >= eb->extents[i].lbno && 407 lblk < eb->extents[i].lbno + 408 eb->extents[i].nbno) { 409 pblk = lblk - eb->extents[i].lbno + 410 eb->extents[i].pbno; 411 break; 412 } 413 } 414 415 if (pblk == 0) { 416 /* 417 * block #0 can never be in a log extent since this 418 * block always contains the primary superblock copy. 419 */ 420 dprintf("No log extent found for log offset 0x%llx.\n", 421 addr); 422 return (0); 423 } 424 425 /* 426 * Check whether the block we want is cached from the last 427 * read. If not, read it in now. 428 */ 429 if (logfp->fi_blocknum != pblk) { 430 logfp->fi_blocknum = pblk; 431 logfp->fi_memp = logfp->fi_buf; 432 logfp->fi_count = DEV_BSIZE; 433 logfp->fi_offset = 0; 434 if (diskread(logfp)) { 435 dprintf("I/O error reading the ufs log" \ 436 " at block 0x%x.\n", 437 logfp->fi_blocknum); 438 return (0); 439 } 440 /* 441 * Log structure verification. The block which we just 442 * read has an ident number that must match its offset 443 * in blocks from the head of the log. Since the log 444 * can wrap around, we have to check for that to get the 445 * ident right. Out-of-sequence idents can happen after 446 * power failures, panics during a partial transaction, 447 * media errors, ... - in any case, they mark the end of 448 * the valid part of the log. 449 */ 450 st = (sect_trailer_t *)(logfp->fi_memp + 451 LDL_USABLE_BSIZE); 452 /* od_head_ident is where the sequence starts */ 453 ident = odi.od_head_ident; 454 if (lblk >= lbtodb(odi.od_head_lof)) { 455 /* no wraparound */ 456 ident += (lblk - lbtodb(odi.od_head_lof)); 457 } else { 458 /* log wrapped around the end */ 459 ident += (lbtodb(odi.od_eol_lof) - 460 lbtodb(odi.od_head_lof)); 461 ident += (lblk - lbtodb(odi.od_bol_lof)); 462 } 463 464 if (ident != st->st_ident) 465 return (0); 466 } 467 read_done: 468 /* 469 * Copy the delta contents to the destination buffer if 470 * one was specified. Otherwise, just skip the contents. 471 */ 472 i = MIN(NB_LEFT_IN_SECTOR(addr), nb); 473 if (va != NULL) { 474 bcopy(logfp->fi_buf + (addr - ldbtob(lbtodb(addr))), 475 va, i); 476 va += i; 477 } 478 nb -= i; 479 addr += i; 480 /* 481 * Skip sector trailer if necessary. 482 */ 483 if (NB_LEFT_IN_SECTOR(addr) == 0) 484 addr += sizeof (sect_trailer_t); 485 } 486 return (addr); 487 } 488 489 void 490 lufs_boot_init(fileid_t *filep) 491 { 492 struct fs *sb = (struct fs *)filep->fi_memp; 493 int err = 0; 494 495 /* 496 * boot_ufs_mountroot() should have called us with a 497 * filep pointing to the superblock. Verify that this 498 * is so first. 499 * Then check whether this filesystem has a dirty log. 500 * Also return if lufs support was disabled on request. 501 */ 502 if (!lufs_support || 503 sb != (struct fs *)&filep->fi_devp->un_fs.di_fs || 504 sb->fs_clean != FSLOG || sb->fs_logbno == NULL) { 505 return; 506 } 507 508 if (boothowto & RB_VERBOSE) 509 printf("The boot filesystem is logging.\n"); 510 511 /* 512 * The filesystem is logging, there is a log area 513 * allocated for it. Check the log state and determine 514 * whether it'll be possible to use this log. 515 */ 516 517 /* 518 * Allocate a private fileid_t for use when reading 519 * from the log. 520 */ 521 eb = (extent_block_t *)bkmem_zalloc(sb->fs_bsize); 522 logfp = (fileid_t *)bkmem_zalloc(sizeof (fileid_t)); 523 logfp->fi_memp = logfp->fi_buf; 524 logfp->fi_devp = filep->fi_devp; 525 526 /* 527 * Read the extent block and verify that what we 528 * find there are actually lufs extents. 529 * Make it simple: the extent block including all 530 * extents cannot be larger than a filesystem block. 531 * So read a whole filesystem block, to make sure 532 * we have read all extents in the same operation. 533 */ 534 logfp->fi_blocknum = sb->fs_logbno; 535 logfp->fi_count = sb->fs_bsize; 536 logfp->fi_memp = (caddr_t)eb; 537 logfp->fi_offset = 0; 538 if (diskread(logfp) || eb->type != LUFS_EXTENTS) { 539 dprintf("Failed to read log extent block.\n"); 540 err = LOG_IS_ERRORED; 541 goto out; 542 } 543 544 /* 545 * Read the on disk log header. If that fails, 546 * try the backup copy on the adjacent block. 547 */ 548 logfp->fi_blocknum = eb->extents[0].pbno; 549 logfp->fi_count = sizeof (ml_odunit_t); 550 logfp->fi_memp = (caddr_t)&odi; 551 logfp->fi_offset = 0; 552 if (diskread(logfp)) { 553 logfp->fi_blocknum = eb->extents[0].pbno + 1; 554 logfp->fi_count = sizeof (ml_odunit_t); 555 logfp->fi_memp = (caddr_t)&odi; 556 logfp->fi_offset = 0; 557 if (diskread(logfp)) { 558 dprintf("Failed to read on-disk log header.\n"); 559 err = LOG_IS_ERRORED; 560 goto out; 561 } 562 } 563 564 /* 565 * Verify that we understand this log, and 566 * that the log isn't bad or empty. 567 */ 568 if (odi.od_version != LUFS_VERSION_LATEST) { 569 dprintf("On-disk log format v%d != supported format v%d.\n", 570 odi.od_version, LUFS_VERSION_LATEST); 571 err = LOG_IS_ERRORED; 572 } else if (odi.od_badlog) { 573 dprintf("On-disk log is marked bad.\n"); 574 err = LOG_IS_ERRORED; 575 } else if (odi.od_chksum != odi.od_head_ident + odi.od_tail_ident) { 576 dprintf("On-disk log checksum %d != ident sum %d.\n", 577 odi.od_chksum, odi.od_head_ident + odi.od_tail_ident); 578 err = LOG_IS_ERRORED; 579 } else { 580 /* 581 * All consistency checks ok. Scan the log, build the 582 * log hash. If this succeeds we'll be using the log 583 * when reading from this filesystem. 584 */ 585 err = lufs_logscan(); 586 } 587 out: 588 ufs_is_lufs = 1; 589 switch (err) { 590 case LOG_IS_EMPTY: 591 if (boothowto & RB_VERBOSE) 592 printf("The ufs log is empty and will not be used.\n"); 593 lufs_closeall(); 594 break; 595 case LOG_IS_OK: 596 if (boothowto & RB_VERBOSE) 597 printf("Using the ufs log.\n"); 598 break; 599 case LOG_IS_ERRORED: 600 if (boothowto & RB_VERBOSE) 601 printf("Couldn't build log hash. Can't use ufs log.\n"); 602 lufs_closeall(); 603 break; 604 default: 605 dprintf("Invalid error %d while scanning the ufs log.\n", err); 606 break; 607 } 608 } 609 610 static int 611 lufs_logscan_read(int32_t *addr, struct delta *d) 612 { 613 *addr = lufs_read_log(*addr, (caddr_t)d, sizeof (struct delta)); 614 615 if (*addr == 0 || 616 d->d_typ < DT_NONE || d->d_typ > DT_MAX || 617 d->d_nb >= odi.od_logsize) 618 return (0); 619 620 return (1); 621 } 622 623 static int 624 lufs_logscan_skip(int32_t *addr, struct delta *d) 625 { 626 switch (d->d_typ) { 627 case DT_COMMIT: 628 /* 629 * A DT_COMMIT delta has no size as such, but will 630 * always "fill up" the sector that contains it. 631 * The next delta header is found at the beginning 632 * of the next 512-Bytes sector, adjust "addr" to 633 * reflect that. 634 */ 635 *addr += ((*addr & (DEV_BSIZE - 1))) ? 636 NB_LEFT_IN_SECTOR(*addr) + 637 sizeof (sect_trailer_t) : 0; 638 return (1); 639 case DT_CANCEL: 640 case DT_ABZERO: 641 /* 642 * These types of deltas occupy no space in the log 643 */ 644 return (1); 645 default: 646 /* 647 * Skip over the delta contents. 648 */ 649 *addr = lufs_read_log(*addr, NULL, d->d_nb); 650 } 651 652 return (*addr != NULL); 653 } 654 655 static void 656 lufs_logscan_freecancel(void) 657 { 658 lb_me_t **lh, *l, *lnext; 659 int i; 660 661 /* 662 * Walk the entire log hash and put cancelled entries 663 * onto the freelist. Corner cases: 664 * a) empty hash chain (*lh == NULL) 665 * b) only one entry in chain, and that is cancelled. 666 * If for every cancelled delta another one would've 667 * been added, this situation couldn't occur, but a 668 * DT_CANCEL delta can lead to this as it is never 669 * added. 670 */ 671 for (i = 0; i < LB_HASHSIZE; i++) { 672 lh = &loghash[i]; 673 l = *lh; 674 do { 675 if (*lh == (lb_me_t *)NULL) 676 break; 677 lnext = l->l_next; 678 if (l->l_flags & LB_ISCANCELLED) { 679 remlist(lh, l); 680 bzero((caddr_t)l, sizeof (lb_me_t)); 681 l->l_next = lfreelist; 682 lfreelist = l; 683 /* 684 * Just removed the hash head. In order not 685 * to terminate the while loop, respin chain 686 * walk for this hash chain. 687 */ 688 if (lnext == *lh) { 689 i--; 690 break; 691 } 692 } 693 l = lnext; 694 } while (l != *lh); 695 } 696 } 697 698 static int 699 lufs_logscan_addmap(int32_t *addr, struct delta *d) 700 { 701 lb_me_t **lh, *l; 702 703 switch (d->d_typ) { 704 case DT_COMMIT: 705 /* 706 * Handling DT_COMMIT deltas is special. We need to: 707 * 1. increase the transaction ID 708 * 2. remove cancelled entries. 709 */ 710 lufs_logscan_freecancel(); 711 curtid++; 712 break; 713 case DT_INODE: 714 /* 715 * Deltas against parts of on-disk inodes are 716 * assumed to be timestamps. Ignore those. 717 */ 718 if (d->d_nb != sizeof (struct dinode)) 719 break; 720 /* FALLTHROUGH */ 721 case DT_CANCEL: 722 case DT_ABZERO: 723 case DT_AB: 724 case DT_DIR: 725 case DT_FBI: 726 /* 727 * These types of deltas contain and/or modify structural 728 * information that is needed for booting the system: 729 * - where to find a file (DT_DIR, DT_FBI) 730 * - the file itself (DT_INODE) 731 * - data blocks associated with a file (DT_AB, DT_ABZERO) 732 * 733 * Building the hash chains becomes complicated because there 734 * may exist an older (== previously added) entry that overlaps 735 * with the one we want to add. 736 * Four cases must be distinguished: 737 * 1. The new delta is an exact match for an existing one, 738 * or is a superset of an existing one, and both 739 * belong to the same transaction. 740 * The new delta completely supersedes the old one, so 741 * remove that and reuse the structure for the new. 742 * Then add the new delta to the head of the hashchain. 743 * 2. The new delta is an exact match for an existing one, 744 * or is a superset of an existing one, but the two 745 * belong to different transactions (i.e. the old one is 746 * committed). 747 * The existing one is marked to be cancelled when the 748 * next DT_COMMIT record is found, and the hash chain 749 * walk is continued as there may be more existing entries 750 * found which overlap the new delta (happens if that is 751 * a superset of those in the log). 752 * Once no more overlaps are found, goto 4. 753 * 3. An existing entry completely covers the new one. 754 * The new delta is then added directly before this 755 * existing one. 756 * 4. No (more) overlaps with existing entries are found. 757 * Unless this is a DT_CANCEL delta, whose only purpose 758 * is already handled by marking overlapping entries for 759 * cancellation, add the new delta at the hash chain head. 760 * 761 * This strategy makes sure that the hash chains are properly 762 * ordered. lufs_merge_deltas() walks the hash chain backward, 763 * which then ensures that delta merging is done in the same 764 * order as those deltas occur in the log - remember, the 765 * log can only be read in one direction. 766 * 767 */ 768 lh = &loghash[LB_HASHFUNC(d->d_mof)]; 769 l = *lh; 770 do { 771 if (l == (lb_me_t *)NULL) 772 break; 773 /* 774 * This covers the first two cases above. 775 * If this is a perfect match from the same transaction, 776 * and it isn't already cancelled, we simply replace it 777 * with its newer incarnation. 778 * Otherwise, mark it for cancellation. Handling of 779 * DT_COMMIT is going to remove it, then. 780 */ 781 if (WITHIN(l->l_mof, l->l_nb, d->d_mof, d->d_nb)) { 782 if (!(l->l_flags & LB_ISCANCELLED)) { 783 if (l->l_tid == curtid && 784 d->d_typ != DT_CANCEL) { 785 remlist(lh, l); 786 l->l_mof = d->d_mof; 787 l->l_lof = *addr; 788 l->l_nb = d->d_nb; 789 l->l_typ = d->d_typ; 790 l->l_flags = 0; 791 l->l_tid = curtid; 792 inslist(lh, l); 793 return (1); 794 } else { 795 /* 796 * 2nd case - cancel only. 797 */ 798 l->l_flags |= LB_ISCANCELLED; 799 } 800 } 801 } else if (WITHIN(d->d_mof, d->d_nb, 802 l->l_mof, l->l_nb)) { 803 /* 804 * This is the third case above. 805 * With deltas DT_ABZERO/DT_AB and DT_FBI/DT_DIR 806 * this may happen - an existing previous delta 807 * is larger than the current one we're planning 808 * to add - DT_ABZERO deltas are supersets of 809 * DT_AB deltas, and likewise DT_FBI/DT_DIR. 810 * In order to do merging correctly, such deltas 811 * put up a barrier for new ones that overlap, 812 * and we have to add the new delta immediately 813 * before (!) the existing one. 814 */ 815 lb_me_t *newl; 816 newl = lufs_alloc_me(); 817 if (newl == (lb_me_t *)NULL) { 818 /* 819 * No memory. Throw away everything 820 * and try booting without logging 821 * support. 822 */ 823 curtid = 0; 824 return (0); 825 } 826 newl->l_mof = d->d_mof; 827 newl->l_lof = *addr; /* "payload" address */ 828 newl->l_nb = d->d_nb; 829 newl->l_typ = d->d_typ; 830 newl->l_tid = curtid; 831 newl->l_prev = l->l_prev; 832 newl->l_next = l; 833 l->l_prev->l_next = newl; 834 l->l_prev = newl; 835 if (*lh == l) 836 *lh = newl; 837 return (1); 838 } 839 l = l->l_next; 840 } while (l != *lh); 841 842 /* 843 * This is case 4., add a new delta at the head of the chain. 844 * 845 * If the new delta is a DT_CANCEL entry, we handled it by 846 * marking everything it covered for cancellation. We can 847 * get by without actually adding the delta itself to the 848 * hash, as it'd need to be removed by the commit code anyway. 849 */ 850 if (d->d_typ == DT_CANCEL) 851 break; 852 853 l = lufs_alloc_me(); 854 if (l == (lb_me_t *)NULL) { 855 /* 856 * No memory. Throw away everything 857 * and try booting without logging 858 * support. 859 */ 860 curtid = 0; 861 return (0); 862 } 863 l->l_mof = d->d_mof; 864 l->l_lof = *addr; /* this is the "payload" address */ 865 l->l_nb = d->d_nb; 866 l->l_typ = d->d_typ; 867 l->l_tid = curtid; 868 inslist(lh, l); 869 break; 870 default: 871 break; 872 } 873 return (1); 874 } 875 876 static int 877 lufs_logscan_prescan(void) 878 { 879 /* 880 * Simulate a full log by setting the tail to be one sector 881 * behind the head. This will make the logscan read all 882 * of the log until an out-of-sequence sector ident is 883 * found. 884 */ 885 odi.od_tail_lof = dbtob(btodb(odi.od_head_lof)) - DEV_BSIZE; 886 if (odi.od_tail_lof < odi.od_bol_lof) 887 odi.od_tail_lof = odi.od_eol_lof - DEV_BSIZE; 888 if (odi.od_tail_lof >= odi.od_eol_lof) 889 odi.od_tail_lof = odi.od_bol_lof; 890 891 /* 892 * While sector trailers maintain TID values, od_head_tid 893 * is not being updated by the kernel ufs logging support 894 * at this time. We therefore count transactions ourselves 895 * starting at zero - as does the kernel ufs logscan code. 896 */ 897 curtid = 0; 898 899 if (!lufs_alloc_logbuf()) { 900 dprintf("Failed to allocate log buffer.\n"); 901 return (0); 902 } 903 904 loghash = (lb_me_t **)lufs_alloc_from_logbuf( 905 LB_HASHSIZE * sizeof (lb_me_t *)); 906 if (loghash == (lb_me_t **)NULL) { 907 dprintf("Can't allocate loghash[] array."); 908 return (0); 909 } 910 return (1); 911 } 912 913 /* 914 * This function must remove all uncommitted entries (l->l_tid == curtid) 915 * from the log hash. Doing this, we implicitly delete pending cancellations 916 * as well. 917 * It uses the same hash walk algorithm as lufs_logscan_freecancel(). Only 918 * the check for entries that need to be removed is different. 919 */ 920 static void 921 lufs_logscan_postscan(void) 922 { 923 lb_me_t **lh, *l, *lnext; 924 int i; 925 926 for (i = 0; i < LB_HASHSIZE; i++) { 927 lh = &loghash[i]; 928 l = *lh; 929 do { 930 if (l == (lb_me_t *)NULL) 931 break; 932 lnext = l->l_next; 933 if (l->l_tid == curtid) { 934 remlist(lh, l); 935 bzero((caddr_t)l, sizeof (lb_me_t)); 936 l->l_next = lfreelist; 937 lfreelist = l; 938 if (*lh == (lb_me_t *)NULL) 939 break; 940 /* 941 * Just removed the hash head. In order not 942 * to terminate the while loop, respin chain 943 * walk for this hash chain. 944 */ 945 if (lnext == *lh) { 946 i--; 947 break; 948 } 949 } else { 950 l->l_flags &= ~(LB_ISCANCELLED); 951 } 952 l = lnext; 953 } while (l != *lh); 954 } 955 } 956 957 /* 958 * This function builds the log hash. It performs the same sequence 959 * of actions at logscan as the kernel ufs logging support: 960 * - Prepare the log for scanning by simulating a full log. 961 * - As long as sectors read from the log have contiguous idents, do: 962 * read the delta header 963 * add the delta to the logmap 964 * skip over the contents to the start of the next delta header 965 * - After terminating the scan, remove uncommitted entries. 966 * 967 * This function cannot fail except if mapping the logbuffer area 968 * during lufs_logscan_prescan() fails. If there is a structural 969 * integrity problem and the on-disk log cannot be read, we'll 970 * treat this as the same situation as an uncommitted transaction 971 * at the end of the log (or, corner case of that, an empty log 972 * with no committed transactions in it at all). 973 * 974 */ 975 static int 976 lufs_logscan(void) 977 { 978 int32_t addr; 979 struct delta d; 980 981 if (!lufs_logscan_prescan()) 982 return (LOG_IS_ERRORED); 983 984 addr = odi.od_head_lof; 985 986 /* 987 * Note that addr == od_tail_lof means a completely filled 988 * log. This almost never happens, so the common exit path 989 * from this loop is via one of the 'break's. 990 */ 991 while (addr != odi.od_tail_lof) { 992 if (!lufs_logscan_read(&addr, &d)) 993 break; 994 if (!lufs_logscan_addmap(&addr, &d)) 995 return (LOG_IS_ERRORED); 996 if (!lufs_logscan_skip(&addr, &d)) 997 break; 998 } 999 1000 lufs_logscan_postscan(); 1001 /* 1002 * Check whether the log contains data, and if so whether 1003 * it contains committed data. 1004 */ 1005 if (addr == odi.od_head_lof || curtid == 0) { 1006 return (LOG_IS_EMPTY); 1007 } 1008 return (LOG_IS_OK); 1009 } 1010 1011 /* 1012 * A metadata block was read from disk. Check whether the logmap 1013 * has a delta against this byte range, and if so read it in, since 1014 * the data in the log is more recent than what was read from other 1015 * places on the disk. 1016 */ 1017 void 1018 lufs_merge_deltas(fileid_t *fp) 1019 { 1020 int nb; 1021 int64_t bof; 1022 lb_me_t **lh, *l; 1023 int32_t skip; 1024 1025 /* 1026 * No logmap: Empty log. Nothing to do here. 1027 */ 1028 if (!ufs_is_lufs || logbuffer == (caddr_t)NULL) 1029 return; 1030 1031 bof = ldbtob(fp->fi_blocknum); 1032 nb = fp->fi_count; 1033 1034 /* 1035 * Search the log hash. 1036 * Merge deltas if an overlap is found. 1037 */ 1038 1039 lh = &loghash[LB_HASHFUNC(bof)]; 1040 1041 if (*lh == (lb_me_t *)NULL) 1042 return; 1043 1044 l = *lh; 1045 1046 do { 1047 l = l->l_prev; 1048 if (OVERLAP(l->l_mof, l->l_nb, bof, nb)) { 1049 /* 1050 * Found a delta in the log hash which overlaps 1051 * with the current metadata block. Read the 1052 * actual delta payload from the on-disk log 1053 * directly into the file buffer. 1054 */ 1055 if (l->l_typ != DT_ABZERO) { 1056 /* 1057 * We have to actually read this part of the 1058 * log as it could contain a sector trailer, or 1059 * wrap around the end of the log. 1060 * If it did, the second offset generation would 1061 * be incorrect if we'd started at l->l_lof. 1062 */ 1063 if (!(skip = lufs_read_log(l->l_lof, NULL, 1064 MAX(bof - l->l_mof, 0)))) 1065 dprintf("scan/merge error, pre-skip\n"); 1066 if (!(skip = lufs_read_log(skip, 1067 fp->fi_memp + MAX(l->l_mof - bof, 0), 1068 MIN(l->l_mof + l->l_nb, bof + nb) - 1069 MAX(l->l_mof, bof)))) 1070 dprintf("scan/merge error, merge\n"); 1071 } else { 1072 /* 1073 * DT_ABZERO requires no disk access, just 1074 * clear the byte range which overlaps with 1075 * the delta. 1076 */ 1077 bzero(fp->fi_memp + MAX(l->l_mof - bof, 0), 1078 MIN(l->l_mof + l->l_nb, bof + nb) - 1079 MAX(l->l_mof, bof)); 1080 } 1081 } 1082 } while (l->l_prev != (*lh)->l_prev); 1083 1084 printf("*\b"); 1085 } 1086 1087 void 1088 lufs_closeall(void) 1089 { 1090 if (ufs_is_lufs) { 1091 bkmem_free((char *)eb, logfp->fi_devp->un_fs.di_fs.fs_bsize); 1092 bkmem_free((char *)logfp, sizeof (fileid_t)); 1093 eb = (extent_block_t *)NULL; 1094 bzero((caddr_t)&odi, sizeof (ml_odunit_t)); 1095 logfp = (fileid_t *)NULL; 1096 lufs_free_logbuf(); 1097 ufs_is_lufs = 0; 1098 } 1099 } 1100