1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/disk.h> 34 #include <sys/disklabel.h> 35 #include <sys/mount.h> 36 #include <sys/stat.h> 37 38 #include <ufs/ufs/ufsmount.h> 39 #include <ufs/ufs/dinode.h> 40 #include <ufs/ufs/dir.h> 41 #include <ufs/ffs/fs.h> 42 43 #include <assert.h> 44 #include <err.h> 45 #include <setjmp.h> 46 #include <stdarg.h> 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <stdint.h> 50 #include <libufs.h> 51 #include <string.h> 52 #include <strings.h> 53 #include <sysexits.h> 54 #include <time.h> 55 56 #include "fsck.h" 57 58 #define DOTDOT_OFFSET DIRECTSIZ(1) 59 #define SUJ_HASHSIZE 2048 60 #define SUJ_HASHMASK (SUJ_HASHSIZE - 1) 61 #define SUJ_HASH(x) ((x * 2654435761) & SUJ_HASHMASK) 62 63 struct suj_seg { 64 TAILQ_ENTRY(suj_seg) ss_next; 65 struct jsegrec ss_rec; 66 uint8_t *ss_blk; 67 }; 68 69 struct suj_rec { 70 TAILQ_ENTRY(suj_rec) sr_next; 71 union jrec *sr_rec; 72 }; 73 TAILQ_HEAD(srechd, suj_rec); 74 75 struct suj_ino { 76 LIST_ENTRY(suj_ino) si_next; 77 struct srechd si_recs; 78 struct srechd si_newrecs; 79 struct srechd si_movs; 80 struct jtrncrec *si_trunc; 81 ino_t si_ino; 82 char si_skipparent; 83 char si_hasrecs; 84 char si_blkadj; 85 char si_linkadj; 86 int si_mode; 87 nlink_t si_nlinkadj; 88 nlink_t si_nlink; 89 nlink_t si_dotlinks; 90 }; 91 LIST_HEAD(inohd, suj_ino); 92 93 struct suj_blk { 94 LIST_ENTRY(suj_blk) sb_next; 95 struct srechd sb_recs; 96 ufs2_daddr_t sb_blk; 97 }; 98 LIST_HEAD(blkhd, suj_blk); 99 100 struct data_blk { 101 LIST_ENTRY(data_blk) db_next; 102 uint8_t *db_buf; 103 ufs2_daddr_t db_blk; 104 int db_size; 105 int db_dirty; 106 }; 107 108 struct ino_blk { 109 LIST_ENTRY(ino_blk) ib_next; 110 uint8_t *ib_buf; 111 int ib_dirty; 112 ino_t ib_startinginum; 113 ufs2_daddr_t ib_blk; 114 }; 115 LIST_HEAD(iblkhd, ino_blk); 116 117 struct suj_cg { 118 LIST_ENTRY(suj_cg) sc_next; 119 struct blkhd sc_blkhash[SUJ_HASHSIZE]; 120 struct inohd sc_inohash[SUJ_HASHSIZE]; 121 struct iblkhd sc_iblkhash[SUJ_HASHSIZE]; 122 struct ino_blk *sc_lastiblk; 123 struct suj_ino *sc_lastino; 124 struct suj_blk *sc_lastblk; 125 uint8_t *sc_cgbuf; 126 struct cg *sc_cgp; 127 int sc_dirty; 128 int sc_cgx; 129 }; 130 131 static LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE]; 132 static LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE]; 133 static struct suj_cg *lastcg; 134 static struct data_blk *lastblk; 135 136 static TAILQ_HEAD(seghd, suj_seg) allsegs; 137 static uint64_t oldseq; 138 static struct fs *fs = NULL; 139 static ino_t sujino; 140 141 /* 142 * Summary statistics. 143 */ 144 static uint64_t freefrags; 145 static uint64_t freeblocks; 146 static uint64_t freeinos; 147 static uint64_t freedir; 148 static uint64_t jbytes; 149 static uint64_t jrecs; 150 151 static jmp_buf jmpbuf; 152 153 typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); 154 static void err_suj(const char *, ...) __dead2; 155 static void ino_trunc(ino_t, off_t); 156 static void ino_decr(ino_t); 157 static void ino_adjust(struct suj_ino *); 158 static void ino_build(struct suj_ino *); 159 static int blk_isfree(ufs2_daddr_t); 160 static void initsuj(void); 161 static void ino_dirty(ino_t); 162 163 static void * 164 errmalloc(size_t n) 165 { 166 void *a; 167 168 a = Malloc(n); 169 if (a == NULL) 170 err(EX_OSERR, "malloc(%zu)", n); 171 return (a); 172 } 173 174 /* 175 * When hit a fatal error in journalling check, print out 176 * the error and then offer to fallback to normal fsck. 177 */ 178 static void 179 err_suj(const char * restrict fmt, ...) 180 { 181 va_list ap; 182 183 if (preen) 184 (void)fprintf(stdout, "%s: ", cdevname); 185 186 va_start(ap, fmt); 187 (void)vfprintf(stdout, fmt, ap); 188 va_end(ap); 189 190 longjmp(jmpbuf, -1); 191 } 192 193 /* 194 * Mark file system as clean, write the super-block back, close the disk. 195 */ 196 static void 197 closedisk(const char *devnam) 198 { 199 struct csum *cgsum; 200 uint32_t i; 201 202 /* 203 * Recompute the fs summary info from correct cs summaries. 204 */ 205 bzero(&fs->fs_cstotal, sizeof(struct csum_total)); 206 for (i = 0; i < fs->fs_ncg; i++) { 207 cgsum = &fs->fs_cs(fs, i); 208 fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; 209 fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; 210 fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; 211 fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; 212 } 213 fs->fs_pendinginodes = 0; 214 fs->fs_pendingblocks = 0; 215 fs->fs_clean = 1; 216 fs->fs_time = time(NULL); 217 fs->fs_mtime = time(NULL); 218 if (sbput(disk.d_fd, fs, 0) == -1) 219 err(EX_OSERR, "sbput(%s)", devnam); 220 if (ufs_disk_close(&disk) == -1) 221 err(EX_OSERR, "ufs_disk_close(%s)", devnam); 222 fs = NULL; 223 } 224 225 /* 226 * Lookup a cg by number in the hash so we can keep track of which cgs 227 * need stats rebuilt. 228 */ 229 static struct suj_cg * 230 cg_lookup(int cgx) 231 { 232 struct cghd *hd; 233 struct suj_cg *sc; 234 235 if (cgx < 0 || cgx >= fs->fs_ncg) 236 err_suj("Bad cg number %d\n", cgx); 237 if (lastcg && lastcg->sc_cgx == cgx) 238 return (lastcg); 239 hd = &cghash[SUJ_HASH(cgx)]; 240 LIST_FOREACH(sc, hd, sc_next) 241 if (sc->sc_cgx == cgx) { 242 lastcg = sc; 243 return (sc); 244 } 245 sc = errmalloc(sizeof(*sc)); 246 bzero(sc, sizeof(*sc)); 247 sc->sc_cgbuf = errmalloc(fs->fs_bsize); 248 sc->sc_cgp = (struct cg *)sc->sc_cgbuf; 249 sc->sc_cgx = cgx; 250 LIST_INSERT_HEAD(hd, sc, sc_next); 251 /* 252 * Use bread() here rather than cgget() because the cylinder group 253 * may be corrupted but we want it anyway so we can fix it. 254 */ 255 if (bread(&disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, 256 fs->fs_bsize) == -1) 257 err_suj("Unable to read cylinder group %d\n", sc->sc_cgx); 258 259 return (sc); 260 } 261 262 /* 263 * Lookup an inode number in the hash and allocate a suj_ino if it does 264 * not exist. 265 */ 266 static struct suj_ino * 267 ino_lookup(ino_t ino, int creat) 268 { 269 struct suj_ino *sino; 270 struct inohd *hd; 271 struct suj_cg *sc; 272 273 sc = cg_lookup(ino_to_cg(fs, ino)); 274 if (sc->sc_lastino && sc->sc_lastino->si_ino == ino) 275 return (sc->sc_lastino); 276 hd = &sc->sc_inohash[SUJ_HASH(ino)]; 277 LIST_FOREACH(sino, hd, si_next) 278 if (sino->si_ino == ino) 279 return (sino); 280 if (creat == 0) 281 return (NULL); 282 sino = errmalloc(sizeof(*sino)); 283 bzero(sino, sizeof(*sino)); 284 sino->si_ino = ino; 285 TAILQ_INIT(&sino->si_recs); 286 TAILQ_INIT(&sino->si_newrecs); 287 TAILQ_INIT(&sino->si_movs); 288 LIST_INSERT_HEAD(hd, sino, si_next); 289 290 return (sino); 291 } 292 293 /* 294 * Lookup a block number in the hash and allocate a suj_blk if it does 295 * not exist. 296 */ 297 static struct suj_blk * 298 blk_lookup(ufs2_daddr_t blk, int creat) 299 { 300 struct suj_blk *sblk; 301 struct suj_cg *sc; 302 struct blkhd *hd; 303 304 sc = cg_lookup(dtog(fs, blk)); 305 if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk) 306 return (sc->sc_lastblk); 307 hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))]; 308 LIST_FOREACH(sblk, hd, sb_next) 309 if (sblk->sb_blk == blk) 310 return (sblk); 311 if (creat == 0) 312 return (NULL); 313 sblk = errmalloc(sizeof(*sblk)); 314 bzero(sblk, sizeof(*sblk)); 315 sblk->sb_blk = blk; 316 TAILQ_INIT(&sblk->sb_recs); 317 LIST_INSERT_HEAD(hd, sblk, sb_next); 318 319 return (sblk); 320 } 321 322 static struct data_blk * 323 dblk_lookup(ufs2_daddr_t blk) 324 { 325 struct data_blk *dblk; 326 struct dblkhd *hd; 327 328 hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))]; 329 if (lastblk && lastblk->db_blk == blk) 330 return (lastblk); 331 LIST_FOREACH(dblk, hd, db_next) 332 if (dblk->db_blk == blk) 333 return (dblk); 334 /* 335 * The inode block wasn't located, allocate a new one. 336 */ 337 dblk = errmalloc(sizeof(*dblk)); 338 bzero(dblk, sizeof(*dblk)); 339 LIST_INSERT_HEAD(hd, dblk, db_next); 340 dblk->db_blk = blk; 341 return (dblk); 342 } 343 344 static uint8_t * 345 dblk_read(ufs2_daddr_t blk, int size) 346 { 347 struct data_blk *dblk; 348 349 dblk = dblk_lookup(blk); 350 /* 351 * I doubt size mismatches can happen in practice but it is trivial 352 * to handle. 353 */ 354 if (size != dblk->db_size) { 355 if (dblk->db_buf) 356 free(dblk->db_buf); 357 dblk->db_buf = errmalloc(size); 358 dblk->db_size = size; 359 if (bread(&disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1) 360 err_suj("Failed to read data block %jd\n", blk); 361 } 362 return (dblk->db_buf); 363 } 364 365 static void 366 dblk_dirty(ufs2_daddr_t blk) 367 { 368 struct data_blk *dblk; 369 370 dblk = dblk_lookup(blk); 371 dblk->db_dirty = 1; 372 } 373 374 static void 375 dblk_write(void) 376 { 377 struct data_blk *dblk; 378 int i; 379 380 for (i = 0; i < SUJ_HASHSIZE; i++) { 381 LIST_FOREACH(dblk, &dbhash[i], db_next) { 382 if (dblk->db_dirty == 0 || dblk->db_size == 0) 383 continue; 384 if (bwrite(&disk, fsbtodb(fs, dblk->db_blk), 385 dblk->db_buf, dblk->db_size) == -1) 386 err_suj("Unable to write block %jd\n", 387 dblk->db_blk); 388 } 389 } 390 } 391 392 static union dinode * 393 ino_read(ino_t ino) 394 { 395 struct ino_blk *iblk; 396 struct iblkhd *hd; 397 struct suj_cg *sc; 398 ufs2_daddr_t blk; 399 union dinode *dp; 400 int off; 401 402 blk = ino_to_fsba(fs, ino); 403 sc = cg_lookup(ino_to_cg(fs, ino)); 404 iblk = sc->sc_lastiblk; 405 if (iblk && iblk->ib_blk == blk) 406 goto found; 407 hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; 408 LIST_FOREACH(iblk, hd, ib_next) 409 if (iblk->ib_blk == blk) 410 goto found; 411 /* 412 * The inode block wasn't located, allocate a new one. 413 */ 414 iblk = errmalloc(sizeof(*iblk)); 415 bzero(iblk, sizeof(*iblk)); 416 iblk->ib_buf = errmalloc(fs->fs_bsize); 417 iblk->ib_blk = blk; 418 iblk->ib_startinginum = rounddown(ino, INOPB(fs)); 419 LIST_INSERT_HEAD(hd, iblk, ib_next); 420 if (bread(&disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1) 421 err_suj("Failed to read inode block %jd\n", blk); 422 found: 423 sc->sc_lastiblk = iblk; 424 off = ino_to_fsbo(fs, ino); 425 if (fs->fs_magic == FS_UFS1_MAGIC) 426 return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off]; 427 dp = (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off]; 428 if (debug && 429 ffs_verify_dinode_ckhash(fs, (struct ufs2_dinode *)dp) != 0) { 430 pwarn("ino_read: INODE CHECK-HASH FAILED"); 431 prtinode(ino, dp); 432 if (preen || reply("FIX") != 0) { 433 if (preen) 434 printf(" (FIXED)\n"); 435 ino_dirty(ino); 436 } 437 } 438 return (dp); 439 } 440 441 static void 442 ino_dirty(ino_t ino) 443 { 444 struct ino_blk *iblk; 445 struct iblkhd *hd; 446 struct suj_cg *sc; 447 ufs2_daddr_t blk; 448 int off; 449 450 blk = ino_to_fsba(fs, ino); 451 sc = cg_lookup(ino_to_cg(fs, ino)); 452 iblk = sc->sc_lastiblk; 453 if (iblk && iblk->ib_blk == blk) { 454 if (fs->fs_magic == FS_UFS2_MAGIC) { 455 off = ino_to_fsbo(fs, ino); 456 ffs_update_dinode_ckhash(fs, 457 &((struct ufs2_dinode *)iblk->ib_buf)[off]); 458 } 459 iblk->ib_dirty = 1; 460 return; 461 } 462 hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; 463 LIST_FOREACH(iblk, hd, ib_next) { 464 if (iblk->ib_blk == blk) { 465 if (fs->fs_magic == FS_UFS2_MAGIC) { 466 off = ino_to_fsbo(fs, ino); 467 ffs_update_dinode_ckhash(fs, 468 &((struct ufs2_dinode *)iblk->ib_buf)[off]); 469 } 470 iblk->ib_dirty = 1; 471 return; 472 } 473 } 474 ino_read(ino); 475 ino_dirty(ino); 476 } 477 478 static void 479 iblk_write(struct ino_blk *iblk) 480 { 481 struct ufs2_dinode *dp; 482 int i; 483 484 if (iblk->ib_dirty == 0) 485 return; 486 if (debug && fs->fs_magic == FS_UFS2_MAGIC) { 487 dp = (struct ufs2_dinode *)iblk->ib_buf; 488 for (i = 0; i < INOPB(fs); dp++, i++) { 489 if (ffs_verify_dinode_ckhash(fs, dp) == 0) 490 continue; 491 pwarn("iblk_write: INODE CHECK-HASH FAILED"); 492 prtinode(iblk->ib_startinginum + i, (union dinode *)dp); 493 if (preen || reply("FIX") != 0) { 494 if (preen) 495 printf(" (FIXED)\n"); 496 ino_dirty(iblk->ib_startinginum + i); 497 } 498 } 499 } 500 if (bwrite(&disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf, 501 fs->fs_bsize) == -1) 502 err_suj("Failed to write inode block %jd\n", iblk->ib_blk); 503 } 504 505 static int 506 blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) 507 { 508 ufs2_daddr_t bstart; 509 ufs2_daddr_t bend; 510 ufs2_daddr_t end; 511 512 end = start + frags; 513 bstart = brec->jb_blkno + brec->jb_oldfrags; 514 bend = bstart + brec->jb_frags; 515 if (start < bend && end > bstart) 516 return (1); 517 return (0); 518 } 519 520 static int 521 blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, 522 int frags) 523 { 524 525 if (brec->jb_ino != ino || brec->jb_lbn != lbn) 526 return (0); 527 if (brec->jb_blkno + brec->jb_oldfrags != start) 528 return (0); 529 if (brec->jb_frags < frags) 530 return (0); 531 return (1); 532 } 533 534 static void 535 blk_setmask(struct jblkrec *brec, int *mask) 536 { 537 int i; 538 539 for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) 540 *mask |= 1 << i; 541 } 542 543 /* 544 * Determine whether a given block has been reallocated to a new location. 545 * Returns a mask of overlapping bits if any frags have been reused or 546 * zero if the block has not been re-used and the contents can be trusted. 547 * 548 * This is used to ensure that an orphaned pointer due to truncate is safe 549 * to be freed. The mask value can be used to free partial blocks. 550 */ 551 static int 552 blk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags) 553 { 554 struct suj_blk *sblk; 555 struct suj_rec *srec; 556 struct jblkrec *brec; 557 int mask; 558 int off; 559 560 /* 561 * To be certain we're not freeing a reallocated block we lookup 562 * this block in the blk hash and see if there is an allocation 563 * journal record that overlaps with any fragments in the block 564 * we're concerned with. If any fragments have ben reallocated 565 * the block has already been freed and re-used for another purpose. 566 */ 567 mask = 0; 568 sblk = blk_lookup(blknum(fs, blk), 0); 569 if (sblk == NULL) 570 return (0); 571 off = blk - sblk->sb_blk; 572 TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { 573 brec = (struct jblkrec *)srec->sr_rec; 574 /* 575 * If the block overlaps but does not match 576 * exactly this record refers to the current 577 * location. 578 */ 579 if (blk_overlaps(brec, blk, frags) == 0) 580 continue; 581 if (blk_equals(brec, ino, lbn, blk, frags) == 1) 582 mask = 0; 583 else 584 blk_setmask(brec, &mask); 585 } 586 if (debug) 587 printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n", 588 blk, sblk->sb_blk, off, mask); 589 return (mask >> off); 590 } 591 592 /* 593 * Determine whether it is safe to follow an indirect. It is not safe 594 * if any part of the indirect has been reallocated or the last journal 595 * entry was an allocation. Just allocated indirects may not have valid 596 * pointers yet and all of their children will have their own records. 597 * It is also not safe to follow an indirect if the cg bitmap has been 598 * cleared as a new allocation may write to the block prior to the journal 599 * being written. 600 * 601 * Returns 1 if it's safe to follow the indirect and 0 otherwise. 602 */ 603 static int 604 blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) 605 { 606 struct suj_blk *sblk; 607 struct jblkrec *brec; 608 609 sblk = blk_lookup(blk, 0); 610 if (sblk == NULL) 611 return (1); 612 if (TAILQ_EMPTY(&sblk->sb_recs)) 613 return (1); 614 brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; 615 if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) 616 if (brec->jb_op == JOP_FREEBLK) 617 return (!blk_isfree(blk)); 618 return (0); 619 } 620 621 /* 622 * Clear an inode from the cg bitmap. If the inode was already clear return 623 * 0 so the caller knows it does not have to check the inode contents. 624 */ 625 static int 626 ino_free(ino_t ino, int mode) 627 { 628 struct suj_cg *sc; 629 uint8_t *inosused; 630 struct cg *cgp; 631 int cg; 632 633 cg = ino_to_cg(fs, ino); 634 ino = ino % fs->fs_ipg; 635 sc = cg_lookup(cg); 636 cgp = sc->sc_cgp; 637 inosused = cg_inosused(cgp); 638 /* 639 * The bitmap may never have made it to the disk so we have to 640 * conditionally clear. We can avoid writing the cg in this case. 641 */ 642 if (isclr(inosused, ino)) 643 return (0); 644 freeinos++; 645 clrbit(inosused, ino); 646 if (ino < cgp->cg_irotor) 647 cgp->cg_irotor = ino; 648 cgp->cg_cs.cs_nifree++; 649 if ((mode & IFMT) == IFDIR) { 650 freedir++; 651 cgp->cg_cs.cs_ndir--; 652 } 653 sc->sc_dirty = 1; 654 655 return (1); 656 } 657 658 /* 659 * Free 'frags' frags starting at filesystem block 'bno' skipping any frags 660 * set in the mask. 661 */ 662 static void 663 blk_free(ufs2_daddr_t bno, int mask, int frags) 664 { 665 ufs1_daddr_t fragno, cgbno; 666 struct suj_cg *sc; 667 struct cg *cgp; 668 int i, cg; 669 uint8_t *blksfree; 670 671 if (debug) 672 printf("Freeing %d frags at blk %jd mask 0x%x\n", 673 frags, bno, mask); 674 cg = dtog(fs, bno); 675 sc = cg_lookup(cg); 676 cgp = sc->sc_cgp; 677 cgbno = dtogd(fs, bno); 678 blksfree = cg_blksfree(cgp); 679 680 /* 681 * If it's not allocated we only wrote the journal entry 682 * and never the bitmaps. Here we unconditionally clear and 683 * resolve the cg summary later. 684 */ 685 if (frags == fs->fs_frag && mask == 0) { 686 fragno = fragstoblks(fs, cgbno); 687 ffs_setblock(fs, blksfree, fragno); 688 freeblocks++; 689 } else { 690 /* 691 * deallocate the fragment 692 */ 693 for (i = 0; i < frags; i++) 694 if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { 695 freefrags++; 696 setbit(blksfree, cgbno + i); 697 } 698 } 699 sc->sc_dirty = 1; 700 } 701 702 /* 703 * Returns 1 if the whole block starting at 'bno' is marked free and 0 704 * otherwise. 705 */ 706 static int 707 blk_isfree(ufs2_daddr_t bno) 708 { 709 struct suj_cg *sc; 710 711 sc = cg_lookup(dtog(fs, bno)); 712 return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno)); 713 } 714 715 /* 716 * Fetch an indirect block to find the block at a given lbn. The lbn 717 * may be negative to fetch a specific indirect block pointer or positive 718 * to fetch a specific block. 719 */ 720 static ufs2_daddr_t 721 indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn) 722 { 723 ufs2_daddr_t *bap2; 724 ufs2_daddr_t *bap1; 725 ufs_lbn_t lbnadd; 726 ufs_lbn_t base; 727 int level; 728 int i; 729 730 if (blk == 0) 731 return (0); 732 level = lbn_level(cur); 733 if (level == -1) 734 err_suj("Invalid indir lbn %jd\n", lbn); 735 if (level == 0 && lbn < 0) 736 err_suj("Invalid lbn %jd\n", lbn); 737 bap2 = (void *)dblk_read(blk, fs->fs_bsize); 738 bap1 = (void *)bap2; 739 lbnadd = 1; 740 base = -(cur + level); 741 for (i = level; i > 0; i--) 742 lbnadd *= NINDIR(fs); 743 if (lbn > 0) 744 i = (lbn - base) / lbnadd; 745 else 746 i = (-lbn - base) / lbnadd; 747 if (i < 0 || i >= NINDIR(fs)) 748 err_suj("Invalid indirect index %d produced by lbn %jd\n", 749 i, lbn); 750 if (level == 0) 751 cur = base + (i * lbnadd); 752 else 753 cur = -(base + (i * lbnadd)) - (level - 1); 754 if (fs->fs_magic == FS_UFS1_MAGIC) 755 blk = bap1[i]; 756 else 757 blk = bap2[i]; 758 if (cur == lbn) 759 return (blk); 760 if (level == 0) 761 err_suj("Invalid lbn %jd at level 0\n", lbn); 762 return indir_blkatoff(blk, ino, cur, lbn); 763 } 764 765 /* 766 * Finds the disk block address at the specified lbn within the inode 767 * specified by ip. This follows the whole tree and honors di_size and 768 * di_extsize so it is a true test of reachability. The lbn may be 769 * negative if an extattr or indirect block is requested. 770 */ 771 static ufs2_daddr_t 772 ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags) 773 { 774 ufs_lbn_t tmpval; 775 ufs_lbn_t cur; 776 ufs_lbn_t next; 777 int i; 778 779 /* 780 * Handle extattr blocks first. 781 */ 782 if (lbn < 0 && lbn >= -UFS_NXADDR) { 783 lbn = -1 - lbn; 784 if (lbn > lblkno(fs, ip->dp2.di_extsize - 1)) 785 return (0); 786 *frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn)); 787 return (ip->dp2.di_extb[lbn]); 788 } 789 /* 790 * Now direct and indirect. 791 */ 792 if (DIP(ip, di_mode) == IFLNK && 793 DIP(ip, di_size) < fs->fs_maxsymlinklen) 794 return (0); 795 if (lbn >= 0 && lbn < UFS_NDADDR) { 796 *frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn)); 797 return (DIP(ip, di_db[lbn])); 798 } 799 *frags = fs->fs_frag; 800 801 for (i = 0, tmpval = NINDIR(fs), cur = UFS_NDADDR; i < UFS_NIADDR; i++, 802 tmpval *= NINDIR(fs), cur = next) { 803 next = cur + tmpval; 804 if (lbn == -cur - i) 805 return (DIP(ip, di_ib[i])); 806 /* 807 * Determine whether the lbn in question is within this tree. 808 */ 809 if (lbn < 0 && -lbn >= next) 810 continue; 811 if (lbn > 0 && lbn >= next) 812 continue; 813 return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn); 814 } 815 err_suj("lbn %jd not in ino\n", lbn); 816 /* NOTREACHED */ 817 } 818 819 /* 820 * Determine whether a block exists at a particular lbn in an inode. 821 * Returns 1 if found, 0 if not. lbn may be negative for indirects 822 * or ext blocks. 823 */ 824 static int 825 blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) 826 { 827 union dinode *ip; 828 ufs2_daddr_t nblk; 829 830 ip = ino_read(ino); 831 832 if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0) 833 return (0); 834 nblk = ino_blkatoff(ip, ino, lbn, frags); 835 836 return (nblk == blk); 837 } 838 839 /* 840 * Clear the directory entry at diroff that should point to child. Minimal 841 * checking is done and it is assumed that this path was verified with isat. 842 */ 843 static void 844 ino_clrat(ino_t parent, off_t diroff, ino_t child) 845 { 846 union dinode *dip; 847 struct direct *dp; 848 ufs2_daddr_t blk; 849 uint8_t *block; 850 ufs_lbn_t lbn; 851 int blksize; 852 int frags; 853 int doff; 854 855 if (debug) 856 printf("Clearing inode %ju from parent %ju at offset %jd\n", 857 (uintmax_t)child, (uintmax_t)parent, diroff); 858 859 lbn = lblkno(fs, diroff); 860 doff = blkoff(fs, diroff); 861 dip = ino_read(parent); 862 blk = ino_blkatoff(dip, parent, lbn, &frags); 863 blksize = sblksize(fs, DIP(dip, di_size), lbn); 864 block = dblk_read(blk, blksize); 865 dp = (struct direct *)&block[doff]; 866 if (dp->d_ino != child) 867 errx(1, "Inode %ju does not exist in %ju at %jd", 868 (uintmax_t)child, (uintmax_t)parent, diroff); 869 dp->d_ino = 0; 870 dblk_dirty(blk); 871 /* 872 * The actual .. reference count will already have been removed 873 * from the parent by the .. remref record. 874 */ 875 } 876 877 /* 878 * Determines whether a pointer to an inode exists within a directory 879 * at a specified offset. Returns the mode of the found entry. 880 */ 881 static int 882 ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) 883 { 884 union dinode *dip; 885 struct direct *dp; 886 ufs2_daddr_t blk; 887 uint8_t *block; 888 ufs_lbn_t lbn; 889 int blksize; 890 int frags; 891 int dpoff; 892 int doff; 893 894 *isdot = 0; 895 dip = ino_read(parent); 896 *mode = DIP(dip, di_mode); 897 if ((*mode & IFMT) != IFDIR) { 898 if (debug) { 899 /* 900 * This can happen if the parent inode 901 * was reallocated. 902 */ 903 if (*mode != 0) 904 printf("Directory %ju has bad mode %o\n", 905 (uintmax_t)parent, *mode); 906 else 907 printf("Directory %ju has zero mode\n", 908 (uintmax_t)parent); 909 } 910 return (0); 911 } 912 lbn = lblkno(fs, diroff); 913 doff = blkoff(fs, diroff); 914 blksize = sblksize(fs, DIP(dip, di_size), lbn); 915 if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { 916 if (debug) 917 printf("ino %ju absent from %ju due to offset %jd" 918 " exceeding size %jd\n", 919 (uintmax_t)child, (uintmax_t)parent, diroff, 920 DIP(dip, di_size)); 921 return (0); 922 } 923 blk = ino_blkatoff(dip, parent, lbn, &frags); 924 if (blk <= 0) { 925 if (debug) 926 printf("Sparse directory %ju", (uintmax_t)parent); 927 return (0); 928 } 929 block = dblk_read(blk, blksize); 930 /* 931 * Walk through the records from the start of the block to be 932 * certain we hit a valid record and not some junk in the middle 933 * of a file name. Stop when we reach or pass the expected offset. 934 */ 935 dpoff = rounddown(doff, DIRBLKSIZ); 936 do { 937 dp = (struct direct *)&block[dpoff]; 938 if (dpoff == doff) 939 break; 940 if (dp->d_reclen == 0) 941 break; 942 dpoff += dp->d_reclen; 943 } while (dpoff <= doff); 944 if (dpoff > fs->fs_bsize) 945 err_suj("Corrupt directory block in dir ino %ju\n", 946 (uintmax_t)parent); 947 /* Not found. */ 948 if (dpoff != doff) { 949 if (debug) 950 printf("ino %ju not found in %ju, lbn %jd, dpoff %d\n", 951 (uintmax_t)child, (uintmax_t)parent, lbn, dpoff); 952 return (0); 953 } 954 /* 955 * We found the item in question. Record the mode and whether it's 956 * a . or .. link for the caller. 957 */ 958 if (dp->d_ino == child) { 959 if (child == parent) 960 *isdot = 1; 961 else if (dp->d_namlen == 2 && 962 dp->d_name[0] == '.' && dp->d_name[1] == '.') 963 *isdot = 1; 964 *mode = DTTOIF(dp->d_type); 965 return (1); 966 } 967 if (debug) 968 printf("ino %ju doesn't match dirent ino %ju in parent %ju\n", 969 (uintmax_t)child, (uintmax_t)dp->d_ino, (uintmax_t)parent); 970 return (0); 971 } 972 973 #define VISIT_INDIR 0x0001 974 #define VISIT_EXT 0x0002 975 #define VISIT_ROOT 0x0004 /* Operation came via root & valid pointers. */ 976 977 /* 978 * Read an indirect level which may or may not be linked into an inode. 979 */ 980 static void 981 indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, 982 ino_visitor visitor, int flags) 983 { 984 ufs2_daddr_t *bap2; 985 ufs1_daddr_t *bap1; 986 ufs_lbn_t lbnadd; 987 ufs2_daddr_t nblk; 988 ufs_lbn_t nlbn; 989 int level; 990 int i; 991 992 /* 993 * Don't visit indirect blocks with contents we can't trust. This 994 * should only happen when indir_visit() is called to complete a 995 * truncate that never finished and not when a pointer is found via 996 * an inode. 997 */ 998 if (blk == 0) 999 return; 1000 level = lbn_level(lbn); 1001 if (level == -1) 1002 err_suj("Invalid level for lbn %jd\n", lbn); 1003 if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) { 1004 if (debug) 1005 printf("blk %jd ino %ju lbn %jd(%d) is not indir.\n", 1006 blk, (uintmax_t)ino, lbn, level); 1007 goto out; 1008 } 1009 lbnadd = 1; 1010 for (i = level; i > 0; i--) 1011 lbnadd *= NINDIR(fs); 1012 bap1 = (void *)dblk_read(blk, fs->fs_bsize); 1013 bap2 = (void *)bap1; 1014 for (i = 0; i < NINDIR(fs); i++) { 1015 if (fs->fs_magic == FS_UFS1_MAGIC) 1016 nblk = *bap1++; 1017 else 1018 nblk = *bap2++; 1019 if (nblk == 0) 1020 continue; 1021 if (level == 0) { 1022 nlbn = -lbn + i * lbnadd; 1023 (*frags) += fs->fs_frag; 1024 visitor(ino, nlbn, nblk, fs->fs_frag); 1025 } else { 1026 nlbn = (lbn + 1) - (i * lbnadd); 1027 indir_visit(ino, nlbn, nblk, frags, visitor, flags); 1028 } 1029 } 1030 out: 1031 if (flags & VISIT_INDIR) { 1032 (*frags) += fs->fs_frag; 1033 visitor(ino, lbn, blk, fs->fs_frag); 1034 } 1035 } 1036 1037 /* 1038 * Visit each block in an inode as specified by 'flags' and call a 1039 * callback function. The callback may inspect or free blocks. The 1040 * count of frags found according to the size in the file is returned. 1041 * This is not valid for sparse files but may be used to determine 1042 * the correct di_blocks for a file. 1043 */ 1044 static uint64_t 1045 ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags) 1046 { 1047 ufs_lbn_t nextlbn; 1048 ufs_lbn_t tmpval; 1049 ufs_lbn_t lbn; 1050 uint64_t size; 1051 uint64_t fragcnt; 1052 int mode; 1053 int frags; 1054 int i; 1055 1056 size = DIP(ip, di_size); 1057 mode = DIP(ip, di_mode) & IFMT; 1058 fragcnt = 0; 1059 if ((flags & VISIT_EXT) && 1060 fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) { 1061 for (i = 0; i < UFS_NXADDR; i++) { 1062 if (ip->dp2.di_extb[i] == 0) 1063 continue; 1064 frags = sblksize(fs, ip->dp2.di_extsize, i); 1065 frags = numfrags(fs, frags); 1066 fragcnt += frags; 1067 visitor(ino, -1 - i, ip->dp2.di_extb[i], frags); 1068 } 1069 } 1070 /* Skip datablocks for short links and devices. */ 1071 if (mode == IFBLK || mode == IFCHR || 1072 (mode == IFLNK && size < fs->fs_maxsymlinklen)) 1073 return (fragcnt); 1074 for (i = 0; i < UFS_NDADDR; i++) { 1075 if (DIP(ip, di_db[i]) == 0) 1076 continue; 1077 frags = sblksize(fs, size, i); 1078 frags = numfrags(fs, frags); 1079 fragcnt += frags; 1080 visitor(ino, i, DIP(ip, di_db[i]), frags); 1081 } 1082 /* 1083 * We know the following indirects are real as we're following 1084 * real pointers to them. 1085 */ 1086 flags |= VISIT_ROOT; 1087 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, 1088 lbn = nextlbn) { 1089 nextlbn = lbn + tmpval; 1090 tmpval *= NINDIR(fs); 1091 if (DIP(ip, di_ib[i]) == 0) 1092 continue; 1093 indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor, 1094 flags); 1095 } 1096 return (fragcnt); 1097 } 1098 1099 /* 1100 * Null visitor function used when we just want to count blocks and 1101 * record the lbn. 1102 */ 1103 ufs_lbn_t visitlbn; 1104 static void 1105 null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) 1106 { 1107 if (lbn > 0) 1108 visitlbn = lbn; 1109 } 1110 1111 /* 1112 * Recalculate di_blocks when we discover that a block allocation or 1113 * free was not successfully completed. The kernel does not roll this back 1114 * because it would be too expensive to compute which indirects were 1115 * reachable at the time the inode was written. 1116 */ 1117 static void 1118 ino_adjblks(struct suj_ino *sino) 1119 { 1120 union dinode *ip; 1121 uint64_t blocks; 1122 uint64_t frags; 1123 off_t isize; 1124 off_t size; 1125 ino_t ino; 1126 1127 ino = sino->si_ino; 1128 ip = ino_read(ino); 1129 /* No need to adjust zero'd inodes. */ 1130 if (DIP(ip, di_mode) == 0) 1131 return; 1132 /* 1133 * Visit all blocks and count them as well as recording the last 1134 * valid lbn in the file. If the file size doesn't agree with the 1135 * last lbn we need to truncate to fix it. Otherwise just adjust 1136 * the blocks count. 1137 */ 1138 visitlbn = 0; 1139 frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); 1140 blocks = fsbtodb(fs, frags); 1141 /* 1142 * We assume the size and direct block list is kept coherent by 1143 * softdep. For files that have extended into indirects we truncate 1144 * to the size in the inode or the maximum size permitted by 1145 * populated indirects. 1146 */ 1147 if (visitlbn >= UFS_NDADDR) { 1148 isize = DIP(ip, di_size); 1149 size = lblktosize(fs, visitlbn + 1); 1150 if (isize > size) 1151 isize = size; 1152 /* Always truncate to free any unpopulated indirects. */ 1153 ino_trunc(sino->si_ino, isize); 1154 return; 1155 } 1156 if (blocks == DIP(ip, di_blocks)) 1157 return; 1158 if (debug) 1159 printf("ino %ju adjusting block count from %jd to %jd\n", 1160 (uintmax_t)ino, DIP(ip, di_blocks), blocks); 1161 DIP_SET(ip, di_blocks, blocks); 1162 ino_dirty(ino); 1163 } 1164 1165 static void 1166 blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) 1167 { 1168 1169 blk_free(blk, blk_freemask(blk, ino, lbn, frags), frags); 1170 } 1171 1172 /* 1173 * Free a block or tree of blocks that was previously rooted in ino at 1174 * the given lbn. If the lbn is an indirect all children are freed 1175 * recursively. 1176 */ 1177 static void 1178 blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) 1179 { 1180 uint64_t resid; 1181 int mask; 1182 1183 mask = blk_freemask(blk, ino, lbn, frags); 1184 resid = 0; 1185 if (lbn <= -UFS_NDADDR && follow && mask == 0) 1186 indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); 1187 else 1188 blk_free(blk, mask, frags); 1189 } 1190 1191 static void 1192 ino_setskip(struct suj_ino *sino, ino_t parent) 1193 { 1194 int isdot; 1195 int mode; 1196 1197 if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot)) 1198 sino->si_skipparent = 1; 1199 } 1200 1201 static void 1202 ino_remref(ino_t parent, ino_t child, uint64_t diroff, int isdotdot) 1203 { 1204 struct suj_ino *sino; 1205 struct suj_rec *srec; 1206 struct jrefrec *rrec; 1207 1208 /* 1209 * Lookup this inode to see if we have a record for it. 1210 */ 1211 sino = ino_lookup(child, 0); 1212 /* 1213 * Tell any child directories we've already removed their 1214 * parent link cnt. Don't try to adjust our link down again. 1215 */ 1216 if (sino != NULL && isdotdot == 0) 1217 ino_setskip(sino, parent); 1218 /* 1219 * No valid record for this inode. Just drop the on-disk 1220 * link by one. 1221 */ 1222 if (sino == NULL || sino->si_hasrecs == 0) { 1223 ino_decr(child); 1224 return; 1225 } 1226 /* 1227 * Use ino_adjust() if ino_check() has already processed this 1228 * child. If we lose the last non-dot reference to a 1229 * directory it will be discarded. 1230 */ 1231 if (sino->si_linkadj) { 1232 sino->si_nlink--; 1233 if (isdotdot) 1234 sino->si_dotlinks--; 1235 ino_adjust(sino); 1236 return; 1237 } 1238 /* 1239 * If we haven't yet processed this inode we need to make 1240 * sure we will successfully discover the lost path. If not 1241 * use nlinkadj to remember. 1242 */ 1243 TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { 1244 rrec = (struct jrefrec *)srec->sr_rec; 1245 if (rrec->jr_parent == parent && 1246 rrec->jr_diroff == diroff) 1247 return; 1248 } 1249 sino->si_nlinkadj++; 1250 } 1251 1252 /* 1253 * Free the children of a directory when the directory is discarded. 1254 */ 1255 static void 1256 ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) 1257 { 1258 struct suj_ino *sino; 1259 struct direct *dp; 1260 off_t diroff; 1261 uint8_t *block; 1262 int skipparent; 1263 int isdotdot; 1264 int dpoff; 1265 int size; 1266 1267 sino = ino_lookup(ino, 0); 1268 if (sino) 1269 skipparent = sino->si_skipparent; 1270 else 1271 skipparent = 0; 1272 size = lfragtosize(fs, frags); 1273 block = dblk_read(blk, size); 1274 dp = (struct direct *)&block[0]; 1275 for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { 1276 dp = (struct direct *)&block[dpoff]; 1277 if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) 1278 continue; 1279 if (dp->d_namlen == 1 && dp->d_name[0] == '.') 1280 continue; 1281 isdotdot = dp->d_namlen == 2 && dp->d_name[0] == '.' && 1282 dp->d_name[1] == '.'; 1283 if (isdotdot && skipparent == 1) 1284 continue; 1285 if (debug) 1286 printf("Directory %ju removing ino %ju name %s\n", 1287 (uintmax_t)ino, (uintmax_t)dp->d_ino, dp->d_name); 1288 diroff = lblktosize(fs, lbn) + dpoff; 1289 ino_remref(ino, dp->d_ino, diroff, isdotdot); 1290 } 1291 } 1292 1293 /* 1294 * Reclaim an inode, freeing all blocks and decrementing all children's 1295 * link counts. Free the inode back to the cg. 1296 */ 1297 static void 1298 ino_reclaim(union dinode *ip, ino_t ino, int mode) 1299 { 1300 uint32_t gen; 1301 1302 if (ino == UFS_ROOTINO) 1303 err_suj("Attempting to free UFS_ROOTINO\n"); 1304 if (debug) 1305 printf("Truncating and freeing ino %ju, nlink %d, mode %o\n", 1306 (uintmax_t)ino, DIP(ip, di_nlink), DIP(ip, di_mode)); 1307 1308 /* We are freeing an inode or directory. */ 1309 if ((DIP(ip, di_mode) & IFMT) == IFDIR) 1310 ino_visit(ip, ino, ino_free_children, 0); 1311 DIP_SET(ip, di_nlink, 0); 1312 ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); 1313 /* Here we have to clear the inode and release any blocks it holds. */ 1314 gen = DIP(ip, di_gen); 1315 if (fs->fs_magic == FS_UFS1_MAGIC) 1316 bzero(ip, sizeof(struct ufs1_dinode)); 1317 else 1318 bzero(ip, sizeof(struct ufs2_dinode)); 1319 DIP_SET(ip, di_gen, gen); 1320 ino_dirty(ino); 1321 ino_free(ino, mode); 1322 return; 1323 } 1324 1325 /* 1326 * Adjust an inode's link count down by one when a directory goes away. 1327 */ 1328 static void 1329 ino_decr(ino_t ino) 1330 { 1331 union dinode *ip; 1332 int reqlink; 1333 int nlink; 1334 int mode; 1335 1336 ip = ino_read(ino); 1337 nlink = DIP(ip, di_nlink); 1338 mode = DIP(ip, di_mode); 1339 if (nlink < 1) 1340 err_suj("Inode %d link count %d invalid\n", ino, nlink); 1341 if (mode == 0) 1342 err_suj("Inode %d has a link of %d with 0 mode\n", ino, nlink); 1343 nlink--; 1344 if ((mode & IFMT) == IFDIR) 1345 reqlink = 2; 1346 else 1347 reqlink = 1; 1348 if (nlink < reqlink) { 1349 if (debug) 1350 printf("ino %ju not enough links to live %d < %d\n", 1351 (uintmax_t)ino, nlink, reqlink); 1352 ino_reclaim(ip, ino, mode); 1353 return; 1354 } 1355 DIP_SET(ip, di_nlink, nlink); 1356 ino_dirty(ino); 1357 } 1358 1359 /* 1360 * Adjust the inode link count to 'nlink'. If the count reaches zero 1361 * free it. 1362 */ 1363 static void 1364 ino_adjust(struct suj_ino *sino) 1365 { 1366 struct jrefrec *rrec; 1367 struct suj_rec *srec; 1368 struct suj_ino *stmp; 1369 union dinode *ip; 1370 nlink_t nlink; 1371 nlink_t reqlink; 1372 int recmode; 1373 int isdot; 1374 int mode; 1375 ino_t ino; 1376 1377 nlink = sino->si_nlink; 1378 ino = sino->si_ino; 1379 mode = sino->si_mode & IFMT; 1380 /* 1381 * If it's a directory with no dot links, it was truncated before 1382 * the name was cleared. We need to clear the dirent that 1383 * points at it. 1384 */ 1385 if (mode == IFDIR && nlink == 1 && sino->si_dotlinks == 0) { 1386 sino->si_nlink = nlink = 0; 1387 TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { 1388 rrec = (struct jrefrec *)srec->sr_rec; 1389 if (ino_isat(rrec->jr_parent, rrec->jr_diroff, ino, 1390 &recmode, &isdot) == 0) 1391 continue; 1392 ino_clrat(rrec->jr_parent, rrec->jr_diroff, ino); 1393 break; 1394 } 1395 if (srec == NULL) 1396 errx(1, "Directory %ju name not found", (uintmax_t)ino); 1397 } 1398 /* 1399 * If it's a directory with no real names pointing to it go ahead 1400 * and truncate it. This will free any children. 1401 */ 1402 if (mode == IFDIR && nlink - sino->si_dotlinks == 0) { 1403 sino->si_nlink = nlink = 0; 1404 /* 1405 * Mark any .. links so they know not to free this inode 1406 * when they are removed. 1407 */ 1408 TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { 1409 rrec = (struct jrefrec *)srec->sr_rec; 1410 if (rrec->jr_diroff == DOTDOT_OFFSET) { 1411 stmp = ino_lookup(rrec->jr_parent, 0); 1412 if (stmp) 1413 ino_setskip(stmp, ino); 1414 } 1415 } 1416 } 1417 ip = ino_read(ino); 1418 mode = DIP(ip, di_mode) & IFMT; 1419 if (nlink > UFS_LINK_MAX) 1420 err_suj("ino %ju nlink manipulation error, new %ju, old %d\n", 1421 (uintmax_t)ino, (uintmax_t)nlink, DIP(ip, di_nlink)); 1422 if (debug) 1423 printf("Adjusting ino %ju, nlink %ju, old link %d lastmode %o\n", 1424 (uintmax_t)ino, (uintmax_t)nlink, DIP(ip, di_nlink), 1425 sino->si_mode); 1426 if (mode == 0) { 1427 if (debug) 1428 printf("ino %ju, zero inode freeing bitmap\n", 1429 (uintmax_t)ino); 1430 ino_free(ino, sino->si_mode); 1431 return; 1432 } 1433 /* XXX Should be an assert? */ 1434 if (mode != sino->si_mode && debug) 1435 printf("ino %ju, mode %o != %o\n", 1436 (uintmax_t)ino, mode, sino->si_mode); 1437 if ((mode & IFMT) == IFDIR) 1438 reqlink = 2; 1439 else 1440 reqlink = 1; 1441 /* If the inode doesn't have enough links to live, free it. */ 1442 if (nlink < reqlink) { 1443 if (debug) 1444 printf("ino %ju not enough links to live %ju < %ju\n", 1445 (uintmax_t)ino, (uintmax_t)nlink, 1446 (uintmax_t)reqlink); 1447 ino_reclaim(ip, ino, mode); 1448 return; 1449 } 1450 /* If required write the updated link count. */ 1451 if (DIP(ip, di_nlink) == nlink) { 1452 if (debug) 1453 printf("ino %ju, link matches, skipping.\n", 1454 (uintmax_t)ino); 1455 return; 1456 } 1457 DIP_SET(ip, di_nlink, nlink); 1458 ino_dirty(ino); 1459 } 1460 1461 /* 1462 * Truncate some or all blocks in an indirect, freeing any that are required 1463 * and zeroing the indirect. 1464 */ 1465 static void 1466 indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn) 1467 { 1468 ufs2_daddr_t *bap2; 1469 ufs1_daddr_t *bap1; 1470 ufs_lbn_t lbnadd; 1471 ufs2_daddr_t nblk; 1472 ufs_lbn_t next; 1473 ufs_lbn_t nlbn; 1474 int dirty; 1475 int level; 1476 int i; 1477 1478 if (blk == 0) 1479 return; 1480 dirty = 0; 1481 level = lbn_level(lbn); 1482 if (level == -1) 1483 err_suj("Invalid level for lbn %jd\n", lbn); 1484 lbnadd = 1; 1485 for (i = level; i > 0; i--) 1486 lbnadd *= NINDIR(fs); 1487 bap1 = (void *)dblk_read(blk, fs->fs_bsize); 1488 bap2 = (void *)bap1; 1489 for (i = 0; i < NINDIR(fs); i++) { 1490 if (fs->fs_magic == FS_UFS1_MAGIC) 1491 nblk = *bap1++; 1492 else 1493 nblk = *bap2++; 1494 if (nblk == 0) 1495 continue; 1496 if (level != 0) { 1497 nlbn = (lbn + 1) - (i * lbnadd); 1498 /* 1499 * Calculate the lbn of the next indirect to 1500 * determine if any of this indirect must be 1501 * reclaimed. 1502 */ 1503 next = -(lbn + level) + ((i+1) * lbnadd); 1504 if (next <= lastlbn) 1505 continue; 1506 indir_trunc(ino, nlbn, nblk, lastlbn); 1507 /* If all of this indirect was reclaimed, free it. */ 1508 nlbn = next - lbnadd; 1509 if (nlbn < lastlbn) 1510 continue; 1511 } else { 1512 nlbn = -lbn + i * lbnadd; 1513 if (nlbn < lastlbn) 1514 continue; 1515 } 1516 dirty = 1; 1517 blk_free(nblk, 0, fs->fs_frag); 1518 if (fs->fs_magic == FS_UFS1_MAGIC) 1519 *(bap1 - 1) = 0; 1520 else 1521 *(bap2 - 1) = 0; 1522 } 1523 if (dirty) 1524 dblk_dirty(blk); 1525 } 1526 1527 /* 1528 * Truncate an inode to the minimum of the given size or the last populated 1529 * block after any over size have been discarded. The kernel would allocate 1530 * the last block in the file but fsck does not and neither do we. This 1531 * code never extends files, only shrinks them. 1532 */ 1533 static void 1534 ino_trunc(ino_t ino, off_t size) 1535 { 1536 union dinode *ip; 1537 ufs2_daddr_t bn; 1538 uint64_t totalfrags; 1539 ufs_lbn_t nextlbn; 1540 ufs_lbn_t lastlbn; 1541 ufs_lbn_t tmpval; 1542 ufs_lbn_t lbn; 1543 ufs_lbn_t i; 1544 int frags; 1545 off_t cursize; 1546 off_t off; 1547 int mode; 1548 1549 ip = ino_read(ino); 1550 mode = DIP(ip, di_mode) & IFMT; 1551 cursize = DIP(ip, di_size); 1552 if (debug) 1553 printf("Truncating ino %ju, mode %o to size %jd from size %jd\n", 1554 (uintmax_t)ino, mode, size, cursize); 1555 1556 /* Skip datablocks for short links and devices. */ 1557 if (mode == 0 || mode == IFBLK || mode == IFCHR || 1558 (mode == IFLNK && cursize < fs->fs_maxsymlinklen)) 1559 return; 1560 /* Don't extend. */ 1561 if (size > cursize) 1562 size = cursize; 1563 lastlbn = lblkno(fs, blkroundup(fs, size)); 1564 for (i = lastlbn; i < UFS_NDADDR; i++) { 1565 if (DIP(ip, di_db[i]) == 0) 1566 continue; 1567 frags = sblksize(fs, cursize, i); 1568 frags = numfrags(fs, frags); 1569 blk_free(DIP(ip, di_db[i]), 0, frags); 1570 DIP_SET(ip, di_db[i], 0); 1571 } 1572 /* 1573 * Follow indirect blocks, freeing anything required. 1574 */ 1575 for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, 1576 lbn = nextlbn) { 1577 nextlbn = lbn + tmpval; 1578 tmpval *= NINDIR(fs); 1579 /* If we're not freeing any in this indirect range skip it. */ 1580 if (lastlbn >= nextlbn) 1581 continue; 1582 if (DIP(ip, di_ib[i]) == 0) 1583 continue; 1584 indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn); 1585 /* If we freed everything in this indirect free the indir. */ 1586 if (lastlbn > lbn) 1587 continue; 1588 blk_free(DIP(ip, di_ib[i]), 0, fs->fs_frag); 1589 DIP_SET(ip, di_ib[i], 0); 1590 } 1591 ino_dirty(ino); 1592 /* 1593 * Now that we've freed any whole blocks that exceed the desired 1594 * truncation size, figure out how many blocks remain and what the 1595 * last populated lbn is. We will set the size to this last lbn 1596 * rather than worrying about allocating the final lbn as the kernel 1597 * would've done. This is consistent with normal fsck behavior. 1598 */ 1599 visitlbn = 0; 1600 totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); 1601 if (size > lblktosize(fs, visitlbn + 1)) 1602 size = lblktosize(fs, visitlbn + 1); 1603 /* 1604 * If we're truncating direct blocks we have to adjust frags 1605 * accordingly. 1606 */ 1607 if (visitlbn < UFS_NDADDR && totalfrags) { 1608 long oldspace, newspace; 1609 1610 bn = DIP(ip, di_db[visitlbn]); 1611 if (bn == 0) 1612 err_suj("Bad blk at ino %ju lbn %jd\n", 1613 (uintmax_t)ino, visitlbn); 1614 oldspace = sblksize(fs, cursize, visitlbn); 1615 newspace = sblksize(fs, size, visitlbn); 1616 if (oldspace != newspace) { 1617 bn += numfrags(fs, newspace); 1618 frags = numfrags(fs, oldspace - newspace); 1619 blk_free(bn, 0, frags); 1620 totalfrags -= frags; 1621 } 1622 } 1623 DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags)); 1624 DIP_SET(ip, di_size, size); 1625 ino_dirty(ino); 1626 /* 1627 * If we've truncated into the middle of a block or frag we have 1628 * to zero it here. Otherwise the file could extend into 1629 * uninitialized space later. 1630 */ 1631 off = blkoff(fs, size); 1632 if (off && DIP(ip, di_mode) != IFDIR) { 1633 uint8_t *buf; 1634 long clrsize; 1635 1636 bn = ino_blkatoff(ip, ino, visitlbn, &frags); 1637 if (bn == 0) 1638 err_suj("Block missing from ino %ju at lbn %jd\n", 1639 (uintmax_t)ino, visitlbn); 1640 clrsize = frags * fs->fs_fsize; 1641 buf = dblk_read(bn, clrsize); 1642 clrsize -= off; 1643 buf += off; 1644 bzero(buf, clrsize); 1645 dblk_dirty(bn); 1646 } 1647 return; 1648 } 1649 1650 /* 1651 * Process records available for one inode and determine whether the 1652 * link count is correct or needs adjusting. 1653 */ 1654 static void 1655 ino_check(struct suj_ino *sino) 1656 { 1657 struct suj_rec *srec; 1658 struct jrefrec *rrec; 1659 nlink_t dotlinks; 1660 nlink_t newlinks; 1661 nlink_t removes; 1662 nlink_t nlink; 1663 ino_t ino; 1664 int isdot; 1665 int isat; 1666 int mode; 1667 1668 if (sino->si_hasrecs == 0) 1669 return; 1670 ino = sino->si_ino; 1671 rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec; 1672 nlink = rrec->jr_nlink; 1673 newlinks = 0; 1674 dotlinks = 0; 1675 removes = sino->si_nlinkadj; 1676 TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { 1677 rrec = (struct jrefrec *)srec->sr_rec; 1678 isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, 1679 rrec->jr_ino, &mode, &isdot); 1680 if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT)) 1681 err_suj("Inode mode/directory type mismatch %o != %o\n", 1682 mode, rrec->jr_mode); 1683 if (debug) 1684 printf("jrefrec: op %d ino %ju, nlink %ju, parent %ju, " 1685 "diroff %jd, mode %o, isat %d, isdot %d\n", 1686 rrec->jr_op, (uintmax_t)rrec->jr_ino, 1687 (uintmax_t)rrec->jr_nlink, 1688 (uintmax_t)rrec->jr_parent, 1689 (uintmax_t)rrec->jr_diroff, 1690 rrec->jr_mode, isat, isdot); 1691 mode = rrec->jr_mode & IFMT; 1692 if (rrec->jr_op == JOP_REMREF) 1693 removes++; 1694 newlinks += isat; 1695 if (isdot) 1696 dotlinks += isat; 1697 } 1698 /* 1699 * The number of links that remain are the starting link count 1700 * subtracted by the total number of removes with the total 1701 * links discovered back in. An incomplete remove thus 1702 * makes no change to the link count but an add increases 1703 * by one. 1704 */ 1705 if (debug) 1706 printf( 1707 "ino %ju nlink %ju newlinks %ju removes %ju dotlinks %ju\n", 1708 (uintmax_t)ino, (uintmax_t)nlink, (uintmax_t)newlinks, 1709 (uintmax_t)removes, (uintmax_t)dotlinks); 1710 nlink += newlinks; 1711 nlink -= removes; 1712 sino->si_linkadj = 1; 1713 sino->si_nlink = nlink; 1714 sino->si_dotlinks = dotlinks; 1715 sino->si_mode = mode; 1716 ino_adjust(sino); 1717 } 1718 1719 /* 1720 * Process records available for one block and determine whether it is 1721 * still allocated and whether the owning inode needs to be updated or 1722 * a free completed. 1723 */ 1724 static void 1725 blk_check(struct suj_blk *sblk) 1726 { 1727 struct suj_rec *srec; 1728 struct jblkrec *brec; 1729 struct suj_ino *sino; 1730 ufs2_daddr_t blk; 1731 int mask; 1732 int frags; 1733 int isat; 1734 1735 /* 1736 * Each suj_blk actually contains records for any fragments in that 1737 * block. As a result we must evaluate each record individually. 1738 */ 1739 sino = NULL; 1740 TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { 1741 brec = (struct jblkrec *)srec->sr_rec; 1742 frags = brec->jb_frags; 1743 blk = brec->jb_blkno + brec->jb_oldfrags; 1744 isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags); 1745 if (sino == NULL || sino->si_ino != brec->jb_ino) { 1746 sino = ino_lookup(brec->jb_ino, 1); 1747 sino->si_blkadj = 1; 1748 } 1749 if (debug) 1750 printf("op %d blk %jd ino %ju lbn %jd frags %d isat %d (%d)\n", 1751 brec->jb_op, blk, (uintmax_t)brec->jb_ino, 1752 brec->jb_lbn, brec->jb_frags, isat, frags); 1753 /* 1754 * If we found the block at this address we still have to 1755 * determine if we need to free the tail end that was 1756 * added by adding contiguous fragments from the same block. 1757 */ 1758 if (isat == 1) { 1759 if (frags == brec->jb_frags) 1760 continue; 1761 mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn, 1762 brec->jb_frags); 1763 mask >>= frags; 1764 blk += frags; 1765 frags = brec->jb_frags - frags; 1766 blk_free(blk, mask, frags); 1767 continue; 1768 } 1769 /* 1770 * The block wasn't found, attempt to free it. It won't be 1771 * freed if it was actually reallocated. If this was an 1772 * allocation we don't want to follow indirects as they 1773 * may not be written yet. Any children of the indirect will 1774 * have their own records. If it's a free we need to 1775 * recursively free children. 1776 */ 1777 blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags, 1778 brec->jb_op == JOP_FREEBLK); 1779 } 1780 } 1781 1782 /* 1783 * Walk the list of inode records for this cg and resolve moved and duplicate 1784 * inode references now that we have a complete picture. 1785 */ 1786 static void 1787 cg_build(struct suj_cg *sc) 1788 { 1789 struct suj_ino *sino; 1790 int i; 1791 1792 for (i = 0; i < SUJ_HASHSIZE; i++) 1793 LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) 1794 ino_build(sino); 1795 } 1796 1797 /* 1798 * Handle inodes requiring truncation. This must be done prior to 1799 * looking up any inodes in directories. 1800 */ 1801 static void 1802 cg_trunc(struct suj_cg *sc) 1803 { 1804 struct suj_ino *sino; 1805 int i; 1806 1807 for (i = 0; i < SUJ_HASHSIZE; i++) { 1808 LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { 1809 if (sino->si_trunc) { 1810 ino_trunc(sino->si_ino, 1811 sino->si_trunc->jt_size); 1812 sino->si_blkadj = 0; 1813 sino->si_trunc = NULL; 1814 } 1815 if (sino->si_blkadj) 1816 ino_adjblks(sino); 1817 } 1818 } 1819 } 1820 1821 static void 1822 cg_adj_blk(struct suj_cg *sc) 1823 { 1824 struct suj_ino *sino; 1825 int i; 1826 1827 for (i = 0; i < SUJ_HASHSIZE; i++) { 1828 LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { 1829 if (sino->si_blkadj) 1830 ino_adjblks(sino); 1831 } 1832 } 1833 } 1834 1835 /* 1836 * Free any partially allocated blocks and then resolve inode block 1837 * counts. 1838 */ 1839 static void 1840 cg_check_blk(struct suj_cg *sc) 1841 { 1842 struct suj_blk *sblk; 1843 int i; 1844 1845 1846 for (i = 0; i < SUJ_HASHSIZE; i++) 1847 LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next) 1848 blk_check(sblk); 1849 } 1850 1851 /* 1852 * Walk the list of inode records for this cg, recovering any 1853 * changes which were not complete at the time of crash. 1854 */ 1855 static void 1856 cg_check_ino(struct suj_cg *sc) 1857 { 1858 struct suj_ino *sino; 1859 int i; 1860 1861 for (i = 0; i < SUJ_HASHSIZE; i++) 1862 LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) 1863 ino_check(sino); 1864 } 1865 1866 /* 1867 * Write a potentially dirty cg. Recalculate the summary information and 1868 * update the superblock summary. 1869 */ 1870 static void 1871 cg_write(struct suj_cg *sc) 1872 { 1873 ufs1_daddr_t fragno, cgbno, maxbno; 1874 u_int8_t *blksfree; 1875 struct cg *cgp; 1876 int blk; 1877 int i; 1878 1879 if (sc->sc_dirty == 0) 1880 return; 1881 /* 1882 * Fix the frag and cluster summary. 1883 */ 1884 cgp = sc->sc_cgp; 1885 cgp->cg_cs.cs_nbfree = 0; 1886 cgp->cg_cs.cs_nffree = 0; 1887 bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum)); 1888 maxbno = fragstoblks(fs, fs->fs_fpg); 1889 if (fs->fs_contigsumsize > 0) { 1890 for (i = 1; i <= fs->fs_contigsumsize; i++) 1891 cg_clustersum(cgp)[i] = 0; 1892 bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT)); 1893 } 1894 blksfree = cg_blksfree(cgp); 1895 for (cgbno = 0; cgbno < maxbno; cgbno++) { 1896 if (ffs_isfreeblock(fs, blksfree, cgbno)) 1897 continue; 1898 if (ffs_isblock(fs, blksfree, cgbno)) { 1899 ffs_clusteracct(fs, cgp, cgbno, 1); 1900 cgp->cg_cs.cs_nbfree++; 1901 continue; 1902 } 1903 fragno = blkstofrags(fs, cgbno); 1904 blk = blkmap(fs, blksfree, fragno); 1905 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 1906 for (i = 0; i < fs->fs_frag; i++) 1907 if (isset(blksfree, fragno + i)) 1908 cgp->cg_cs.cs_nffree++; 1909 } 1910 /* 1911 * Update the superblock cg summary from our now correct values 1912 * before writing the block. 1913 */ 1914 fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs; 1915 if (cgput(&disk, cgp) == -1) 1916 err_suj("Unable to write cylinder group %d\n", sc->sc_cgx); 1917 } 1918 1919 /* 1920 * Write out any modified inodes. 1921 */ 1922 static void 1923 cg_write_inos(struct suj_cg *sc) 1924 { 1925 struct ino_blk *iblk; 1926 int i; 1927 1928 for (i = 0; i < SUJ_HASHSIZE; i++) 1929 LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next) 1930 if (iblk->ib_dirty) 1931 iblk_write(iblk); 1932 } 1933 1934 static void 1935 cg_apply(void (*apply)(struct suj_cg *)) 1936 { 1937 struct suj_cg *scg; 1938 int i; 1939 1940 for (i = 0; i < SUJ_HASHSIZE; i++) 1941 LIST_FOREACH(scg, &cghash[i], sc_next) 1942 apply(scg); 1943 } 1944 1945 /* 1946 * Process the unlinked but referenced file list. Freeing all inodes. 1947 */ 1948 static void 1949 ino_unlinked(void) 1950 { 1951 union dinode *ip; 1952 uint16_t mode; 1953 ino_t inon; 1954 ino_t ino; 1955 1956 ino = fs->fs_sujfree; 1957 fs->fs_sujfree = 0; 1958 while (ino != 0) { 1959 ip = ino_read(ino); 1960 mode = DIP(ip, di_mode) & IFMT; 1961 inon = DIP(ip, di_freelink); 1962 DIP_SET(ip, di_freelink, 0); 1963 ino_dirty(ino); 1964 /* 1965 * XXX Should this be an errx? 1966 */ 1967 if (DIP(ip, di_nlink) == 0) { 1968 if (debug) 1969 printf("Freeing unlinked ino %ju mode %o\n", 1970 (uintmax_t)ino, mode); 1971 ino_reclaim(ip, ino, mode); 1972 } else if (debug) 1973 printf("Skipping ino %ju mode %o with link %d\n", 1974 (uintmax_t)ino, mode, DIP(ip, di_nlink)); 1975 ino = inon; 1976 } 1977 } 1978 1979 /* 1980 * Append a new record to the list of records requiring processing. 1981 */ 1982 static void 1983 ino_append(union jrec *rec) 1984 { 1985 struct jrefrec *refrec; 1986 struct jmvrec *mvrec; 1987 struct suj_ino *sino; 1988 struct suj_rec *srec; 1989 1990 mvrec = &rec->rec_jmvrec; 1991 refrec = &rec->rec_jrefrec; 1992 if (debug && mvrec->jm_op == JOP_MVREF) 1993 printf("ino move: ino %ju, parent %ju, " 1994 "diroff %jd, oldoff %jd\n", 1995 (uintmax_t)mvrec->jm_ino, (uintmax_t)mvrec->jm_parent, 1996 (uintmax_t)mvrec->jm_newoff, (uintmax_t)mvrec->jm_oldoff); 1997 else if (debug && 1998 (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF)) 1999 printf("ino ref: op %d, ino %ju, nlink %ju, " 2000 "parent %ju, diroff %jd\n", 2001 refrec->jr_op, (uintmax_t)refrec->jr_ino, 2002 (uintmax_t)refrec->jr_nlink, 2003 (uintmax_t)refrec->jr_parent, (uintmax_t)refrec->jr_diroff); 2004 sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1); 2005 sino->si_hasrecs = 1; 2006 srec = errmalloc(sizeof(*srec)); 2007 srec->sr_rec = rec; 2008 TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next); 2009 } 2010 2011 /* 2012 * Add a reference adjustment to the sino list and eliminate dups. The 2013 * primary loop in ino_build_ref() checks for dups but new ones may be 2014 * created as a result of offset adjustments. 2015 */ 2016 static void 2017 ino_add_ref(struct suj_ino *sino, struct suj_rec *srec) 2018 { 2019 struct jrefrec *refrec; 2020 struct suj_rec *srn; 2021 struct jrefrec *rrn; 2022 2023 refrec = (struct jrefrec *)srec->sr_rec; 2024 /* 2025 * We walk backwards so that the oldest link count is preserved. If 2026 * an add record conflicts with a remove keep the remove. Redundant 2027 * removes are eliminated in ino_build_ref. Otherwise we keep the 2028 * oldest record at a given location. 2029 */ 2030 for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; 2031 srn = TAILQ_PREV(srn, srechd, sr_next)) { 2032 rrn = (struct jrefrec *)srn->sr_rec; 2033 if (rrn->jr_parent != refrec->jr_parent || 2034 rrn->jr_diroff != refrec->jr_diroff) 2035 continue; 2036 if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) { 2037 rrn->jr_mode = refrec->jr_mode; 2038 return; 2039 } 2040 /* 2041 * Adding a remove. 2042 * 2043 * Replace the record in place with the old nlink in case 2044 * we replace the head of the list. Abandon srec as a dup. 2045 */ 2046 refrec->jr_nlink = rrn->jr_nlink; 2047 srn->sr_rec = srec->sr_rec; 2048 return; 2049 } 2050 TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); 2051 } 2052 2053 /* 2054 * Create a duplicate of a reference at a previous location. 2055 */ 2056 static void 2057 ino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff) 2058 { 2059 struct jrefrec *rrn; 2060 struct suj_rec *srn; 2061 2062 rrn = errmalloc(sizeof(*refrec)); 2063 *rrn = *refrec; 2064 rrn->jr_op = JOP_ADDREF; 2065 rrn->jr_diroff = diroff; 2066 srn = errmalloc(sizeof(*srn)); 2067 srn->sr_rec = (union jrec *)rrn; 2068 ino_add_ref(sino, srn); 2069 } 2070 2071 /* 2072 * Add a reference to the list at all known locations. We follow the offset 2073 * changes for a single instance and create duplicate add refs at each so 2074 * that we can tolerate any version of the directory block. Eliminate 2075 * removes which collide with adds that are seen in the journal. They should 2076 * not adjust the link count down. 2077 */ 2078 static void 2079 ino_build_ref(struct suj_ino *sino, struct suj_rec *srec) 2080 { 2081 struct jrefrec *refrec; 2082 struct jmvrec *mvrec; 2083 struct suj_rec *srp; 2084 struct suj_rec *srn; 2085 struct jrefrec *rrn; 2086 off_t diroff; 2087 2088 refrec = (struct jrefrec *)srec->sr_rec; 2089 /* 2090 * Search for a mvrec that matches this offset. Whether it's an add 2091 * or a remove we can delete the mvref after creating a dup record in 2092 * the old location. 2093 */ 2094 if (!TAILQ_EMPTY(&sino->si_movs)) { 2095 diroff = refrec->jr_diroff; 2096 for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) { 2097 srp = TAILQ_PREV(srn, srechd, sr_next); 2098 mvrec = (struct jmvrec *)srn->sr_rec; 2099 if (mvrec->jm_parent != refrec->jr_parent || 2100 mvrec->jm_newoff != diroff) 2101 continue; 2102 diroff = mvrec->jm_oldoff; 2103 TAILQ_REMOVE(&sino->si_movs, srn, sr_next); 2104 free(srn); 2105 ino_dup_ref(sino, refrec, diroff); 2106 } 2107 } 2108 /* 2109 * If a remove wasn't eliminated by an earlier add just append it to 2110 * the list. 2111 */ 2112 if (refrec->jr_op == JOP_REMREF) { 2113 ino_add_ref(sino, srec); 2114 return; 2115 } 2116 /* 2117 * Walk the list of records waiting to be added to the list. We 2118 * must check for moves that apply to our current offset and remove 2119 * them from the list. Remove any duplicates to eliminate removes 2120 * with corresponding adds. 2121 */ 2122 TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) { 2123 switch (srn->sr_rec->rec_jrefrec.jr_op) { 2124 case JOP_ADDREF: 2125 /* 2126 * This should actually be an error we should 2127 * have a remove for every add journaled. 2128 */ 2129 rrn = (struct jrefrec *)srn->sr_rec; 2130 if (rrn->jr_parent != refrec->jr_parent || 2131 rrn->jr_diroff != refrec->jr_diroff) 2132 break; 2133 TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); 2134 break; 2135 case JOP_REMREF: 2136 /* 2137 * Once we remove the current iteration of the 2138 * record at this address we're done. 2139 */ 2140 rrn = (struct jrefrec *)srn->sr_rec; 2141 if (rrn->jr_parent != refrec->jr_parent || 2142 rrn->jr_diroff != refrec->jr_diroff) 2143 break; 2144 TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); 2145 ino_add_ref(sino, srec); 2146 return; 2147 case JOP_MVREF: 2148 /* 2149 * Update our diroff based on any moves that match 2150 * and remove the move. 2151 */ 2152 mvrec = (struct jmvrec *)srn->sr_rec; 2153 if (mvrec->jm_parent != refrec->jr_parent || 2154 mvrec->jm_oldoff != refrec->jr_diroff) 2155 break; 2156 ino_dup_ref(sino, refrec, mvrec->jm_oldoff); 2157 refrec->jr_diroff = mvrec->jm_newoff; 2158 TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); 2159 break; 2160 default: 2161 err_suj("ino_build_ref: Unknown op %d\n", 2162 srn->sr_rec->rec_jrefrec.jr_op); 2163 } 2164 } 2165 ino_add_ref(sino, srec); 2166 } 2167 2168 /* 2169 * Walk the list of new records and add them in-order resolving any 2170 * dups and adjusted offsets. 2171 */ 2172 static void 2173 ino_build(struct suj_ino *sino) 2174 { 2175 struct suj_rec *srec; 2176 2177 while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) { 2178 TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next); 2179 switch (srec->sr_rec->rec_jrefrec.jr_op) { 2180 case JOP_ADDREF: 2181 case JOP_REMREF: 2182 ino_build_ref(sino, srec); 2183 break; 2184 case JOP_MVREF: 2185 /* 2186 * Add this mvrec to the queue of pending mvs. 2187 */ 2188 TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); 2189 break; 2190 default: 2191 err_suj("ino_build: Unknown op %d\n", 2192 srec->sr_rec->rec_jrefrec.jr_op); 2193 } 2194 } 2195 if (TAILQ_EMPTY(&sino->si_recs)) 2196 sino->si_hasrecs = 0; 2197 } 2198 2199 /* 2200 * Modify journal records so they refer to the base block number 2201 * and a start and end frag range. This is to facilitate the discovery 2202 * of overlapping fragment allocations. 2203 */ 2204 static void 2205 blk_build(struct jblkrec *blkrec) 2206 { 2207 struct suj_rec *srec; 2208 struct suj_blk *sblk; 2209 struct jblkrec *blkrn; 2210 ufs2_daddr_t blk; 2211 int frag; 2212 2213 if (debug) 2214 printf("blk_build: op %d blkno %jd frags %d oldfrags %d " 2215 "ino %ju lbn %jd\n", 2216 blkrec->jb_op, (uintmax_t)blkrec->jb_blkno, 2217 blkrec->jb_frags, blkrec->jb_oldfrags, 2218 (uintmax_t)blkrec->jb_ino, (uintmax_t)blkrec->jb_lbn); 2219 2220 blk = blknum(fs, blkrec->jb_blkno); 2221 frag = fragnum(fs, blkrec->jb_blkno); 2222 sblk = blk_lookup(blk, 1); 2223 /* 2224 * Rewrite the record using oldfrags to indicate the offset into 2225 * the block. Leave jb_frags as the actual allocated count. 2226 */ 2227 blkrec->jb_blkno -= frag; 2228 blkrec->jb_oldfrags = frag; 2229 if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) 2230 err_suj("Invalid fragment count %d oldfrags %d\n", 2231 blkrec->jb_frags, frag); 2232 /* 2233 * Detect dups. If we detect a dup we always discard the oldest 2234 * record as it is superseded by the new record. This speeds up 2235 * later stages but also eliminates free records which are used 2236 * to indicate that the contents of indirects can be trusted. 2237 */ 2238 TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { 2239 blkrn = (struct jblkrec *)srec->sr_rec; 2240 if (blkrn->jb_ino != blkrec->jb_ino || 2241 blkrn->jb_lbn != blkrec->jb_lbn || 2242 blkrn->jb_blkno != blkrec->jb_blkno || 2243 blkrn->jb_frags != blkrec->jb_frags || 2244 blkrn->jb_oldfrags != blkrec->jb_oldfrags) 2245 continue; 2246 if (debug) 2247 printf("Removed dup.\n"); 2248 /* Discard the free which is a dup with an alloc. */ 2249 if (blkrec->jb_op == JOP_FREEBLK) 2250 return; 2251 TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); 2252 free(srec); 2253 break; 2254 } 2255 srec = errmalloc(sizeof(*srec)); 2256 srec->sr_rec = (union jrec *)blkrec; 2257 TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); 2258 } 2259 2260 static void 2261 ino_build_trunc(struct jtrncrec *rec) 2262 { 2263 struct suj_ino *sino; 2264 2265 if (debug) 2266 printf("ino_build_trunc: op %d ino %ju, size %jd\n", 2267 rec->jt_op, (uintmax_t)rec->jt_ino, 2268 (uintmax_t)rec->jt_size); 2269 sino = ino_lookup(rec->jt_ino, 1); 2270 if (rec->jt_op == JOP_SYNC) { 2271 sino->si_trunc = NULL; 2272 return; 2273 } 2274 if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size) 2275 sino->si_trunc = rec; 2276 } 2277 2278 /* 2279 * Build up tables of the operations we need to recover. 2280 */ 2281 static void 2282 suj_build(void) 2283 { 2284 struct suj_seg *seg; 2285 union jrec *rec; 2286 int off; 2287 int i; 2288 2289 TAILQ_FOREACH(seg, &allsegs, ss_next) { 2290 if (debug) 2291 printf("seg %jd has %d records, oldseq %jd.\n", 2292 seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, 2293 seg->ss_rec.jsr_oldest); 2294 off = 0; 2295 rec = (union jrec *)seg->ss_blk; 2296 for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) { 2297 /* skip the segrec. */ 2298 if ((off % real_dev_bsize) == 0) 2299 continue; 2300 switch (rec->rec_jrefrec.jr_op) { 2301 case JOP_ADDREF: 2302 case JOP_REMREF: 2303 case JOP_MVREF: 2304 ino_append(rec); 2305 break; 2306 case JOP_NEWBLK: 2307 case JOP_FREEBLK: 2308 blk_build((struct jblkrec *)rec); 2309 break; 2310 case JOP_TRUNC: 2311 case JOP_SYNC: 2312 ino_build_trunc((struct jtrncrec *)rec); 2313 break; 2314 default: 2315 err_suj("Unknown journal operation %d (%d)\n", 2316 rec->rec_jrefrec.jr_op, off); 2317 } 2318 i++; 2319 } 2320 } 2321 } 2322 2323 /* 2324 * Prune the journal segments to those we care about based on the 2325 * oldest sequence in the newest segment. Order the segment list 2326 * based on sequence number. 2327 */ 2328 static void 2329 suj_prune(void) 2330 { 2331 struct suj_seg *seg; 2332 struct suj_seg *segn; 2333 uint64_t newseq; 2334 int discard; 2335 2336 if (debug) 2337 printf("Pruning up to %jd\n", oldseq); 2338 /* First free the expired segments. */ 2339 TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { 2340 if (seg->ss_rec.jsr_seq >= oldseq) 2341 continue; 2342 TAILQ_REMOVE(&allsegs, seg, ss_next); 2343 free(seg->ss_blk); 2344 free(seg); 2345 } 2346 /* Next ensure that segments are ordered properly. */ 2347 seg = TAILQ_FIRST(&allsegs); 2348 if (seg == NULL) { 2349 if (debug) 2350 printf("Empty journal\n"); 2351 return; 2352 } 2353 newseq = seg->ss_rec.jsr_seq; 2354 for (;;) { 2355 seg = TAILQ_LAST(&allsegs, seghd); 2356 if (seg->ss_rec.jsr_seq >= newseq) 2357 break; 2358 TAILQ_REMOVE(&allsegs, seg, ss_next); 2359 TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); 2360 newseq = seg->ss_rec.jsr_seq; 2361 2362 } 2363 if (newseq != oldseq) { 2364 TAILQ_FOREACH(seg, &allsegs, ss_next) { 2365 printf("%jd, ", seg->ss_rec.jsr_seq); 2366 } 2367 printf("\n"); 2368 err_suj("Journal file sequence mismatch %jd != %jd\n", 2369 newseq, oldseq); 2370 } 2371 /* 2372 * The kernel may asynchronously write segments which can create 2373 * gaps in the sequence space. Throw away any segments after the 2374 * gap as the kernel guarantees only those that are contiguously 2375 * reachable are marked as completed. 2376 */ 2377 discard = 0; 2378 TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { 2379 if (!discard && newseq++ == seg->ss_rec.jsr_seq) { 2380 jrecs += seg->ss_rec.jsr_cnt; 2381 jbytes += seg->ss_rec.jsr_blocks * real_dev_bsize; 2382 continue; 2383 } 2384 discard = 1; 2385 if (debug) 2386 printf("Journal order mismatch %jd != %jd pruning\n", 2387 newseq-1, seg->ss_rec.jsr_seq); 2388 TAILQ_REMOVE(&allsegs, seg, ss_next); 2389 free(seg->ss_blk); 2390 free(seg); 2391 } 2392 if (debug) 2393 printf("Processing journal segments from %jd to %jd\n", 2394 oldseq, newseq-1); 2395 } 2396 2397 /* 2398 * Verify the journal inode before attempting to read records. 2399 */ 2400 static int 2401 suj_verifyino(union dinode *ip) 2402 { 2403 2404 if (DIP(ip, di_nlink) != 1) { 2405 printf("Invalid link count %d for journal inode %ju\n", 2406 DIP(ip, di_nlink), (uintmax_t)sujino); 2407 return (-1); 2408 } 2409 2410 if ((DIP(ip, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) != 2411 (SF_IMMUTABLE | SF_NOUNLINK)) { 2412 printf("Invalid flags 0x%X for journal inode %ju\n", 2413 DIP(ip, di_flags), (uintmax_t)sujino); 2414 return (-1); 2415 } 2416 2417 if (DIP(ip, di_mode) != (IFREG | IREAD)) { 2418 printf("Invalid mode %o for journal inode %ju\n", 2419 DIP(ip, di_mode), (uintmax_t)sujino); 2420 return (-1); 2421 } 2422 2423 if (DIP(ip, di_size) < SUJ_MIN) { 2424 printf("Invalid size %jd for journal inode %ju\n", 2425 DIP(ip, di_size), (uintmax_t)sujino); 2426 return (-1); 2427 } 2428 2429 if (DIP(ip, di_modrev) != fs->fs_mtime) { 2430 printf("Journal timestamp does not match fs mount time\n"); 2431 return (-1); 2432 } 2433 2434 return (0); 2435 } 2436 2437 struct jblocks { 2438 struct jextent *jb_extent; /* Extent array. */ 2439 int jb_avail; /* Available extents. */ 2440 int jb_used; /* Last used extent. */ 2441 int jb_head; /* Allocator head. */ 2442 int jb_off; /* Allocator extent offset. */ 2443 }; 2444 struct jextent { 2445 ufs2_daddr_t je_daddr; /* Disk block address. */ 2446 int je_blocks; /* Disk block count. */ 2447 }; 2448 2449 static struct jblocks *suj_jblocks; 2450 2451 static struct jblocks * 2452 jblocks_create(void) 2453 { 2454 struct jblocks *jblocks; 2455 int size; 2456 2457 jblocks = errmalloc(sizeof(*jblocks)); 2458 jblocks->jb_avail = 10; 2459 jblocks->jb_used = 0; 2460 jblocks->jb_head = 0; 2461 jblocks->jb_off = 0; 2462 size = sizeof(struct jextent) * jblocks->jb_avail; 2463 jblocks->jb_extent = errmalloc(size); 2464 bzero(jblocks->jb_extent, size); 2465 2466 return (jblocks); 2467 } 2468 2469 /* 2470 * Return the next available disk block and the amount of contiguous 2471 * free space it contains. 2472 */ 2473 static ufs2_daddr_t 2474 jblocks_next(struct jblocks *jblocks, int bytes, int *actual) 2475 { 2476 struct jextent *jext; 2477 ufs2_daddr_t daddr; 2478 int freecnt; 2479 int blocks; 2480 2481 blocks = bytes / disk.d_bsize; 2482 jext = &jblocks->jb_extent[jblocks->jb_head]; 2483 freecnt = jext->je_blocks - jblocks->jb_off; 2484 if (freecnt == 0) { 2485 jblocks->jb_off = 0; 2486 if (++jblocks->jb_head > jblocks->jb_used) 2487 return (0); 2488 jext = &jblocks->jb_extent[jblocks->jb_head]; 2489 freecnt = jext->je_blocks; 2490 } 2491 if (freecnt > blocks) 2492 freecnt = blocks; 2493 *actual = freecnt * disk.d_bsize; 2494 daddr = jext->je_daddr + jblocks->jb_off; 2495 2496 return (daddr); 2497 } 2498 2499 /* 2500 * Advance the allocation head by a specified number of bytes, consuming 2501 * one journal segment. 2502 */ 2503 static void 2504 jblocks_advance(struct jblocks *jblocks, int bytes) 2505 { 2506 2507 jblocks->jb_off += bytes / disk.d_bsize; 2508 } 2509 2510 static void 2511 jblocks_destroy(struct jblocks *jblocks) 2512 { 2513 2514 free(jblocks->jb_extent); 2515 free(jblocks); 2516 } 2517 2518 static void 2519 jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) 2520 { 2521 struct jextent *jext; 2522 int size; 2523 2524 jext = &jblocks->jb_extent[jblocks->jb_used]; 2525 /* Adding the first block. */ 2526 if (jext->je_daddr == 0) { 2527 jext->je_daddr = daddr; 2528 jext->je_blocks = blocks; 2529 return; 2530 } 2531 /* Extending the last extent. */ 2532 if (jext->je_daddr + jext->je_blocks == daddr) { 2533 jext->je_blocks += blocks; 2534 return; 2535 } 2536 /* Adding a new extent. */ 2537 if (++jblocks->jb_used == jblocks->jb_avail) { 2538 jblocks->jb_avail *= 2; 2539 size = sizeof(struct jextent) * jblocks->jb_avail; 2540 jext = errmalloc(size); 2541 bzero(jext, size); 2542 bcopy(jblocks->jb_extent, jext, 2543 sizeof(struct jextent) * jblocks->jb_used); 2544 free(jblocks->jb_extent); 2545 jblocks->jb_extent = jext; 2546 } 2547 jext = &jblocks->jb_extent[jblocks->jb_used]; 2548 jext->je_daddr = daddr; 2549 jext->je_blocks = blocks; 2550 2551 return; 2552 } 2553 2554 /* 2555 * Add a file block from the journal to the extent map. We can't read 2556 * each file block individually because the kernel treats it as a circular 2557 * buffer and segments may span mutliple contiguous blocks. 2558 */ 2559 static void 2560 suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) 2561 { 2562 2563 jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags)); 2564 } 2565 2566 static void 2567 suj_read(void) 2568 { 2569 uint8_t block[1 * 1024 * 1024]; 2570 struct suj_seg *seg; 2571 struct jsegrec *recn; 2572 struct jsegrec *rec; 2573 ufs2_daddr_t blk; 2574 int readsize; 2575 int blocks; 2576 int recsize; 2577 int size; 2578 int i; 2579 2580 /* 2581 * Read records until we exhaust the journal space. If we find 2582 * an invalid record we start searching for a valid segment header 2583 * at the next block. This is because we don't have a head/tail 2584 * pointer and must recover the information indirectly. At the gap 2585 * between the head and tail we won't necessarily have a valid 2586 * segment. 2587 */ 2588 restart: 2589 for (;;) { 2590 size = sizeof(block); 2591 blk = jblocks_next(suj_jblocks, size, &readsize); 2592 if (blk == 0) 2593 return; 2594 size = readsize; 2595 /* 2596 * Read 1MB at a time and scan for records within this block. 2597 */ 2598 if (bread(&disk, blk, &block, size) == -1) { 2599 err_suj("Error reading journal block %jd\n", 2600 (intmax_t)blk); 2601 } 2602 for (rec = (void *)block; size; size -= recsize, 2603 rec = (struct jsegrec *)((uintptr_t)rec + recsize)) { 2604 recsize = real_dev_bsize; 2605 if (rec->jsr_time != fs->fs_mtime) { 2606 if (debug) 2607 printf("Rec time %jd != fs mtime %jd\n", 2608 rec->jsr_time, fs->fs_mtime); 2609 jblocks_advance(suj_jblocks, recsize); 2610 continue; 2611 } 2612 if (rec->jsr_cnt == 0) { 2613 if (debug) 2614 printf("Found illegal count %d\n", 2615 rec->jsr_cnt); 2616 jblocks_advance(suj_jblocks, recsize); 2617 continue; 2618 } 2619 blocks = rec->jsr_blocks; 2620 recsize = blocks * real_dev_bsize; 2621 if (recsize > size) { 2622 /* 2623 * We may just have run out of buffer, restart 2624 * the loop to re-read from this spot. 2625 */ 2626 if (size < fs->fs_bsize && 2627 size != readsize && 2628 recsize <= fs->fs_bsize) 2629 goto restart; 2630 if (debug) 2631 printf("Found invalid segsize %d > %d\n", 2632 recsize, size); 2633 recsize = real_dev_bsize; 2634 jblocks_advance(suj_jblocks, recsize); 2635 continue; 2636 } 2637 /* 2638 * Verify that all blocks in the segment are present. 2639 */ 2640 for (i = 1; i < blocks; i++) { 2641 recn = (void *)((uintptr_t)rec) + i * 2642 real_dev_bsize; 2643 if (recn->jsr_seq == rec->jsr_seq && 2644 recn->jsr_time == rec->jsr_time) 2645 continue; 2646 if (debug) 2647 printf("Incomplete record %jd (%d)\n", 2648 rec->jsr_seq, i); 2649 recsize = i * real_dev_bsize; 2650 jblocks_advance(suj_jblocks, recsize); 2651 goto restart; 2652 } 2653 seg = errmalloc(sizeof(*seg)); 2654 seg->ss_blk = errmalloc(recsize); 2655 seg->ss_rec = *rec; 2656 bcopy((void *)rec, seg->ss_blk, recsize); 2657 if (rec->jsr_oldest > oldseq) 2658 oldseq = rec->jsr_oldest; 2659 TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); 2660 jblocks_advance(suj_jblocks, recsize); 2661 } 2662 } 2663 } 2664 2665 /* 2666 * Search a directory block for the SUJ_FILE. 2667 */ 2668 static void 2669 suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) 2670 { 2671 char block[MAXBSIZE]; 2672 struct direct *dp; 2673 int bytes; 2674 int off; 2675 2676 if (sujino) 2677 return; 2678 bytes = lfragtosize(fs, frags); 2679 if (bread(&disk, fsbtodb(fs, blk), block, bytes) <= 0) 2680 err_suj("Failed to read UFS_ROOTINO directory block %jd\n", 2681 blk); 2682 for (off = 0; off < bytes; off += dp->d_reclen) { 2683 dp = (struct direct *)&block[off]; 2684 if (dp->d_reclen == 0) 2685 break; 2686 if (dp->d_ino == 0) 2687 continue; 2688 if (dp->d_namlen != strlen(SUJ_FILE)) 2689 continue; 2690 if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0) 2691 continue; 2692 sujino = dp->d_ino; 2693 return; 2694 } 2695 } 2696 2697 /* 2698 * Orchestrate the verification of a filesystem via the softupdates journal. 2699 */ 2700 int 2701 suj_check(const char *filesys) 2702 { 2703 union dinode *jip; 2704 union dinode *ip; 2705 uint64_t blocks; 2706 int retval; 2707 struct suj_seg *seg; 2708 struct suj_seg *segn; 2709 2710 initsuj(); 2711 fs = &sblock; 2712 if (real_dev_bsize == 0 && ioctl(disk.d_fd, DIOCGSECTORSIZE, 2713 &real_dev_bsize) == -1) 2714 real_dev_bsize = secsize; 2715 if (debug) 2716 printf("dev_bsize %u\n", real_dev_bsize); 2717 2718 /* 2719 * Set an exit point when SUJ check failed 2720 */ 2721 retval = setjmp(jmpbuf); 2722 if (retval != 0) { 2723 pwarn("UNEXPECTED SU+J INCONSISTENCY\n"); 2724 TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { 2725 TAILQ_REMOVE(&allsegs, seg, ss_next); 2726 free(seg->ss_blk); 2727 free(seg); 2728 } 2729 if (reply("FALLBACK TO FULL FSCK") == 0) { 2730 ckfini(0); 2731 exit(EEXIT); 2732 } else 2733 return (-1); 2734 } 2735 2736 /* 2737 * Find the journal inode. 2738 */ 2739 ip = ino_read(UFS_ROOTINO); 2740 sujino = 0; 2741 ino_visit(ip, UFS_ROOTINO, suj_find, 0); 2742 if (sujino == 0) { 2743 printf("Journal inode removed. Use tunefs to re-create.\n"); 2744 sblock.fs_flags &= ~FS_SUJ; 2745 sblock.fs_sujfree = 0; 2746 return (-1); 2747 } 2748 /* 2749 * Fetch the journal inode and verify it. 2750 */ 2751 jip = ino_read(sujino); 2752 printf("** SU+J Recovering %s\n", filesys); 2753 if (suj_verifyino(jip) != 0) 2754 return (-1); 2755 if (!preen && !reply("USE JOURNAL")) 2756 return (-1); 2757 /* 2758 * Build a list of journal blocks in jblocks before parsing the 2759 * available journal blocks in with suj_read(). 2760 */ 2761 printf("** Reading %jd byte journal from inode %ju.\n", 2762 DIP(jip, di_size), (uintmax_t)sujino); 2763 suj_jblocks = jblocks_create(); 2764 blocks = ino_visit(jip, sujino, suj_add_block, 0); 2765 if (blocks != numfrags(fs, DIP(jip, di_size))) { 2766 printf("Sparse journal inode %ju.\n", (uintmax_t)sujino); 2767 return (-1); 2768 } 2769 suj_read(); 2770 jblocks_destroy(suj_jblocks); 2771 suj_jblocks = NULL; 2772 if (preen || reply("RECOVER")) { 2773 printf("** Building recovery table.\n"); 2774 suj_prune(); 2775 suj_build(); 2776 cg_apply(cg_build); 2777 printf("** Resolving unreferenced inode list.\n"); 2778 ino_unlinked(); 2779 printf("** Processing journal entries.\n"); 2780 cg_apply(cg_trunc); 2781 cg_apply(cg_check_blk); 2782 cg_apply(cg_adj_blk); 2783 cg_apply(cg_check_ino); 2784 } 2785 if (preen == 0 && (jrecs > 0 || jbytes > 0) && reply("WRITE CHANGES") == 0) 2786 return (0); 2787 /* 2788 * To remain idempotent with partial truncations the free bitmaps 2789 * must be written followed by indirect blocks and lastly inode 2790 * blocks. This preserves access to the modified pointers until 2791 * they are freed. 2792 */ 2793 cg_apply(cg_write); 2794 dblk_write(); 2795 cg_apply(cg_write_inos); 2796 /* Write back superblock. */ 2797 closedisk(filesys); 2798 if (jrecs > 0 || jbytes > 0) { 2799 printf("** %jd journal records in %jd bytes for %.2f%% utilization\n", 2800 jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); 2801 printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n", 2802 freeinos, freedir, freeblocks, freefrags); 2803 } 2804 2805 return (0); 2806 } 2807 2808 static void 2809 initsuj(void) 2810 { 2811 int i; 2812 2813 for (i = 0; i < SUJ_HASHSIZE; i++) { 2814 LIST_INIT(&cghash[i]); 2815 LIST_INIT(&dbhash[i]); 2816 } 2817 lastcg = NULL; 2818 lastblk = NULL; 2819 TAILQ_INIT(&allsegs); 2820 oldseq = 0; 2821 fs = NULL; 2822 sujino = 0; 2823 freefrags = 0; 2824 freeblocks = 0; 2825 freeinos = 0; 2826 freedir = 0; 2827 jbytes = 0; 2828 jrecs = 0; 2829 suj_jblocks = NULL; 2830 } 2831