1 /*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42 #include <sys/cdefs.h> 43 __FBSDID("$FreeBSD$"); 44 45 #include "opt_ffs.h" 46 #include "opt_quota.h" 47 #include "opt_ddb.h" 48 49 /* 50 * For now we want the safety net that the DEBUG flag provides. 
51 */ 52 #ifndef DEBUG 53 #define DEBUG 54 #endif 55 56 #include <sys/param.h> 57 #include <sys/kernel.h> 58 #include <sys/systm.h> 59 #include <sys/bio.h> 60 #include <sys/buf.h> 61 #include <sys/kdb.h> 62 #include <sys/kthread.h> 63 #include <sys/limits.h> 64 #include <sys/lock.h> 65 #include <sys/malloc.h> 66 #include <sys/mount.h> 67 #include <sys/mutex.h> 68 #include <sys/namei.h> 69 #include <sys/priv.h> 70 #include <sys/proc.h> 71 #include <sys/stat.h> 72 #include <sys/sysctl.h> 73 #include <sys/syslog.h> 74 #include <sys/vnode.h> 75 #include <sys/conf.h> 76 77 #include <ufs/ufs/dir.h> 78 #include <ufs/ufs/extattr.h> 79 #include <ufs/ufs/quota.h> 80 #include <ufs/ufs/inode.h> 81 #include <ufs/ufs/ufsmount.h> 82 #include <ufs/ffs/fs.h> 83 #include <ufs/ffs/softdep.h> 84 #include <ufs/ffs/ffs_extern.h> 85 #include <ufs/ufs/ufs_extern.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_extern.h> 89 #include <vm/vm_object.h> 90 91 #include <ddb/ddb.h> 92 93 #ifndef SOFTUPDATES 94 95 int 96 softdep_flushfiles(oldmnt, flags, td) 97 struct mount *oldmnt; 98 int flags; 99 struct thread *td; 100 { 101 102 panic("softdep_flushfiles called"); 103 } 104 105 int 106 softdep_mount(devvp, mp, fs, cred) 107 struct vnode *devvp; 108 struct mount *mp; 109 struct fs *fs; 110 struct ucred *cred; 111 { 112 113 return (0); 114 } 115 116 void 117 softdep_initialize() 118 { 119 120 return; 121 } 122 123 void 124 softdep_uninitialize() 125 { 126 127 return; 128 } 129 130 void 131 softdep_unmount(mp) 132 struct mount *mp; 133 { 134 135 } 136 137 void 138 softdep_setup_sbupdate(ump, fs, bp) 139 struct ufsmount *ump; 140 struct fs *fs; 141 struct buf *bp; 142 { 143 } 144 145 void 146 softdep_setup_inomapdep(bp, ip, newinum, mode) 147 struct buf *bp; 148 struct inode *ip; 149 ino_t newinum; 150 int mode; 151 { 152 153 panic("softdep_setup_inomapdep called"); 154 } 155 156 void 157 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 158 struct buf *bp; 159 struct mount *mp; 160 ufs2_daddr_t newblkno; 161 int frags; 162 int oldfrags; 163 { 164 165 panic("softdep_setup_blkmapdep called"); 166 } 167 168 void 169 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 170 struct inode *ip; 171 ufs_lbn_t lbn; 172 ufs2_daddr_t newblkno; 173 ufs2_daddr_t oldblkno; 174 long newsize; 175 long oldsize; 176 struct buf *bp; 177 { 178 179 panic("softdep_setup_allocdirect called"); 180 } 181 182 void 183 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 184 struct inode *ip; 185 ufs_lbn_t lbn; 186 ufs2_daddr_t newblkno; 187 ufs2_daddr_t oldblkno; 188 long newsize; 189 long oldsize; 190 struct buf *bp; 191 { 192 193 panic("softdep_setup_allocext called"); 194 } 195 196 void 197 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 198 struct inode *ip; 199 ufs_lbn_t lbn; 200 struct buf *bp; 201 int ptrno; 202 ufs2_daddr_t newblkno; 203 ufs2_daddr_t oldblkno; 204 struct buf *nbp; 205 { 206 207 panic("softdep_setup_allocindir_page called"); 208 } 209 210 void 211 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 212 struct buf *nbp; 213 struct inode *ip; 214 struct buf *bp; 215 int ptrno; 216 ufs2_daddr_t newblkno; 217 { 218 219 panic("softdep_setup_allocindir_meta called"); 220 } 221 222 void 223 softdep_journal_freeblocks(ip, cred, length, flags) 224 struct inode *ip; 225 struct ucred *cred; 226 off_t length; 227 int flags; 228 { 229 230 panic("softdep_journal_freeblocks called"); 231 } 232 233 void 234 softdep_journal_fsync(ip) 235 struct inode *ip; 
236 { 237 238 panic("softdep_journal_fsync called"); 239 } 240 241 void 242 softdep_setup_freeblocks(ip, length, flags) 243 struct inode *ip; 244 off_t length; 245 int flags; 246 { 247 248 panic("softdep_setup_freeblocks called"); 249 } 250 251 void 252 softdep_freefile(pvp, ino, mode) 253 struct vnode *pvp; 254 ino_t ino; 255 int mode; 256 { 257 258 panic("softdep_freefile called"); 259 } 260 261 int 262 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 263 struct buf *bp; 264 struct inode *dp; 265 off_t diroffset; 266 ino_t newinum; 267 struct buf *newdirbp; 268 int isnewblk; 269 { 270 271 panic("softdep_setup_directory_add called"); 272 } 273 274 void 275 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 276 struct buf *bp; 277 struct inode *dp; 278 caddr_t base; 279 caddr_t oldloc; 280 caddr_t newloc; 281 int entrysize; 282 { 283 284 panic("softdep_change_directoryentry_offset called"); 285 } 286 287 void 288 softdep_setup_remove(bp, dp, ip, isrmdir) 289 struct buf *bp; 290 struct inode *dp; 291 struct inode *ip; 292 int isrmdir; 293 { 294 295 panic("softdep_setup_remove called"); 296 } 297 298 void 299 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 300 struct buf *bp; 301 struct inode *dp; 302 struct inode *ip; 303 ino_t newinum; 304 int isrmdir; 305 { 306 307 panic("softdep_setup_directory_change called"); 308 } 309 310 void 311 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 312 struct mount *mp; 313 struct buf *bp; 314 ufs2_daddr_t blkno; 315 int frags; 316 struct workhead *wkhd; 317 { 318 319 panic("%s called", __FUNCTION__); 320 } 321 322 void 323 softdep_setup_inofree(mp, bp, ino, wkhd) 324 struct mount *mp; 325 struct buf *bp; 326 ino_t ino; 327 struct workhead *wkhd; 328 { 329 330 panic("%s called", __FUNCTION__); 331 } 332 333 void 334 softdep_setup_unlink(dp, ip) 335 struct inode *dp; 336 struct inode *ip; 337 { 338 339 panic("%s called", __FUNCTION__); 340 } 341 342 void 343 softdep_setup_link(dp, ip) 344 struct inode *dp; 345 struct inode *ip; 346 { 347 348 panic("%s called", __FUNCTION__); 349 } 350 351 void 352 softdep_revert_link(dp, ip) 353 struct inode *dp; 354 struct inode *ip; 355 { 356 357 panic("%s called", __FUNCTION__); 358 } 359 360 void 361 softdep_setup_rmdir(dp, ip) 362 struct inode *dp; 363 struct inode *ip; 364 { 365 366 panic("%s called", __FUNCTION__); 367 } 368 369 void 370 softdep_revert_rmdir(dp, ip) 371 struct inode *dp; 372 struct inode *ip; 373 { 374 375 panic("%s called", __FUNCTION__); 376 } 377 378 void 379 softdep_setup_create(dp, ip) 380 struct inode *dp; 381 struct inode *ip; 382 { 383 384 panic("%s called", __FUNCTION__); 385 } 386 387 void 388 softdep_revert_create(dp, ip) 389 struct inode *dp; 390 struct inode *ip; 391 { 392 393 panic("%s called", __FUNCTION__); 394 } 395 396 void 397 softdep_setup_mkdir(dp, ip) 398 struct inode *dp; 399 struct inode *ip; 400 { 401 402 panic("%s called", __FUNCTION__); 403 } 404 405 void 406 softdep_revert_mkdir(dp, ip) 407 struct inode *dp; 408 struct inode *ip; 409 { 410 411 panic("%s called", __FUNCTION__); 412 } 413 414 void 415 softdep_setup_dotdot_link(dp, ip) 416 struct inode *dp; 417 struct inode *ip; 418 { 419 420 panic("%s called", __FUNCTION__); 421 } 422 423 int 424 softdep_prealloc(vp, waitok) 425 struct vnode *vp; 426 int waitok; 427 { 428 429 panic("%s called", __FUNCTION__); 430 431 return (0); 432 } 433 434 int 435 softdep_journal_lookup(mp, vpp) 436 struct mount *mp; 437 struct vnode **vpp; 438 { 439 440 return 
(ENOENT); 441 } 442 443 void 444 softdep_change_linkcnt(ip) 445 struct inode *ip; 446 { 447 448 panic("softdep_change_linkcnt called"); 449 } 450 451 void 452 softdep_load_inodeblock(ip) 453 struct inode *ip; 454 { 455 456 panic("softdep_load_inodeblock called"); 457 } 458 459 void 460 softdep_update_inodeblock(ip, bp, waitfor) 461 struct inode *ip; 462 struct buf *bp; 463 int waitfor; 464 { 465 466 panic("softdep_update_inodeblock called"); 467 } 468 469 int 470 softdep_fsync(vp) 471 struct vnode *vp; /* the "in_core" copy of the inode */ 472 { 473 474 return (0); 475 } 476 477 void 478 softdep_fsync_mountdev(vp) 479 struct vnode *vp; 480 { 481 482 return; 483 } 484 485 int 486 softdep_flushworklist(oldmnt, countp, td) 487 struct mount *oldmnt; 488 int *countp; 489 struct thread *td; 490 { 491 492 *countp = 0; 493 return (0); 494 } 495 496 int 497 softdep_sync_metadata(struct vnode *vp) 498 { 499 500 return (0); 501 } 502 503 int 504 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 505 { 506 507 return (0); 508 } 509 510 int 511 softdep_slowdown(vp) 512 struct vnode *vp; 513 { 514 515 panic("softdep_slowdown called"); 516 } 517 518 void 519 softdep_releasefile(ip) 520 struct inode *ip; /* inode with the zero effective link count */ 521 { 522 523 panic("softdep_releasefile called"); 524 } 525 526 int 527 softdep_request_cleanup(fs, vp, cred, resource) 528 struct fs *fs; 529 struct vnode *vp; 530 struct ucred *cred; 531 int resource; 532 { 533 534 return (0); 535 } 536 537 int 538 softdep_check_suspend(struct mount *mp, 539 struct vnode *devvp, 540 int softdep_deps, 541 int softdep_accdeps, 542 int secondary_writes, 543 int secondary_accwrites) 544 { 545 struct bufobj *bo; 546 int error; 547 548 (void) softdep_deps, 549 (void) softdep_accdeps; 550 551 bo = &devvp->v_bufobj; 552 ASSERT_BO_LOCKED(bo); 553 554 MNT_ILOCK(mp); 555 while (mp->mnt_secondary_writes != 0) { 556 BO_UNLOCK(bo); 557 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 558 (PUSER - 1) | PDROP, "secwr", 0); 559 BO_LOCK(bo); 560 MNT_ILOCK(mp); 561 } 562 563 /* 564 * Reasons for needing more work before suspend: 565 * - Dirty buffers on devvp. 566 * - Secondary writes occurred after start of vnode sync loop 567 */ 568 error = 0; 569 if (bo->bo_numoutput > 0 || 570 bo->bo_dirty.bv_cnt > 0 || 571 secondary_writes != 0 || 572 mp->mnt_secondary_writes != 0 || 573 secondary_accwrites != mp->mnt_secondary_accwrites) 574 error = EAGAIN; 575 BO_UNLOCK(bo); 576 return (error); 577 } 578 579 void 580 softdep_get_depcounts(struct mount *mp, 581 int *softdepactivep, 582 int *softdepactiveaccp) 583 { 584 (void) mp; 585 *softdepactivep = 0; 586 *softdepactiveaccp = 0; 587 } 588 589 void 590 softdep_buf_append(bp, wkhd) 591 struct buf *bp; 592 struct workhead *wkhd; 593 { 594 595 panic("softdep_buf_appendwork called"); 596 } 597 598 void 599 softdep_inode_append(ip, cred, wkhd) 600 struct inode *ip; 601 struct ucred *cred; 602 struct workhead *wkhd; 603 { 604 605 panic("softdep_inode_appendwork called"); 606 } 607 608 void 609 softdep_freework(wkhd) 610 struct workhead *wkhd; 611 { 612 613 panic("softdep_freework called"); 614 } 615 616 #else 617 618 FEATURE(softupdates, "FFS soft-updates support"); 619 620 /* 621 * These definitions need to be adapted to the system to which 622 * this file is being ported. 
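 *
 * The workitem type constants (D_*), the malloc types (M_*), and the
 * statistics arrays below must stay in lock step; in particular the
 * memtype[] table further down requires memtype[D_XXX] == M_XXX.  As an
 * illustration (not compiled here), the SOFTDEP_TYPE(PAGEDEP, ...) entry
 * expands roughly to:
 */
#if 0
static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_total[D_PAGEDEP], 0, "");
SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_current[D_PAGEDEP], 0, "");
SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, pagedep, CTLFLAG_RD,
    &dep_write[D_PAGEDEP], 0, "");
/* ...after which memtype[D_PAGEDEP] must be M_PAGEDEP for DtoM() to work. */
#endif
/*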
623 */ 624 625 #define M_SOFTDEP_FLAGS (M_WAITOK) 626 627 #define D_PAGEDEP 0 628 #define D_INODEDEP 1 629 #define D_BMSAFEMAP 2 630 #define D_NEWBLK 3 631 #define D_ALLOCDIRECT 4 632 #define D_INDIRDEP 5 633 #define D_ALLOCINDIR 6 634 #define D_FREEFRAG 7 635 #define D_FREEBLKS 8 636 #define D_FREEFILE 9 637 #define D_DIRADD 10 638 #define D_MKDIR 11 639 #define D_DIRREM 12 640 #define D_NEWDIRBLK 13 641 #define D_FREEWORK 14 642 #define D_FREEDEP 15 643 #define D_JADDREF 16 644 #define D_JREMREF 17 645 #define D_JMVREF 18 646 #define D_JNEWBLK 19 647 #define D_JFREEBLK 20 648 #define D_JFREEFRAG 21 649 #define D_JSEG 22 650 #define D_JSEGDEP 23 651 #define D_SBDEP 24 652 #define D_JTRUNC 25 653 #define D_JFSYNC 26 654 #define D_SENTINAL 27 655 #define D_LAST D_SENTINAL 656 657 unsigned long dep_current[D_LAST + 1]; 658 unsigned long dep_total[D_LAST + 1]; 659 unsigned long dep_write[D_LAST + 1]; 660 661 662 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, 663 "soft updates stats"); 664 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 665 "total dependencies allocated"); 666 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 667 "current dependencies allocated"); 668 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, 669 "current dependencies written"); 670 671 #define SOFTDEP_TYPE(type, str, long) \ 672 static MALLOC_DEFINE(M_ ## type, #str, long); \ 673 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 674 &dep_total[D_ ## type], 0, ""); \ 675 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 676 &dep_current[D_ ## type], 0, ""); \ 677 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 678 &dep_write[D_ ## type], 0, ""); 679 680 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 681 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 682 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 683 "Block or frag allocated from cyl group map"); 684 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 685 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 686 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 687 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 688 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 689 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 690 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 691 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 692 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 693 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 694 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 695 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 696 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 697 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 698 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 699 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 700 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 701 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 702 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 703 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 704 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 705 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 706 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 707 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 708 709 static MALLOC_DEFINE(M_SAVEDINO, 
"savedino", "Saved inodes"); 710 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 711 712 /* 713 * translate from workitem type to memory type 714 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 715 */ 716 static struct malloc_type *memtype[] = { 717 M_PAGEDEP, 718 M_INODEDEP, 719 M_BMSAFEMAP, 720 M_NEWBLK, 721 M_ALLOCDIRECT, 722 M_INDIRDEP, 723 M_ALLOCINDIR, 724 M_FREEFRAG, 725 M_FREEBLKS, 726 M_FREEFILE, 727 M_DIRADD, 728 M_MKDIR, 729 M_DIRREM, 730 M_NEWDIRBLK, 731 M_FREEWORK, 732 M_FREEDEP, 733 M_JADDREF, 734 M_JREMREF, 735 M_JMVREF, 736 M_JNEWBLK, 737 M_JFREEBLK, 738 M_JFREEFRAG, 739 M_JSEG, 740 M_JSEGDEP, 741 M_SBDEP, 742 M_JTRUNC, 743 M_JFSYNC 744 }; 745 746 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 747 748 #define DtoM(type) (memtype[type]) 749 750 /* 751 * Names of malloc types. 752 */ 753 #define TYPENAME(type) \ 754 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 755 /* 756 * End system adaptation definitions. 757 */ 758 759 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 760 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 761 762 /* 763 * Forward declarations. 764 */ 765 struct inodedep_hashhead; 766 struct newblk_hashhead; 767 struct pagedep_hashhead; 768 struct bmsafemap_hashhead; 769 770 /* 771 * Internal function prototypes. 772 */ 773 static void softdep_error(char *, int); 774 static void drain_output(struct vnode *); 775 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 776 static void clear_remove(struct thread *); 777 static void clear_inodedeps(struct thread *); 778 static void unlinked_inodedep(struct mount *, struct inodedep *); 779 static void clear_unlinked_inodedep(struct inodedep *); 780 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 781 static int flush_pagedep_deps(struct vnode *, struct mount *, 782 struct diraddhd *); 783 static int free_pagedep(struct pagedep *); 784 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 785 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 786 static int flush_deplist(struct allocdirectlst *, int, int *); 787 static int sync_cgs(struct mount *, int); 788 static int handle_written_filepage(struct pagedep *, struct buf *); 789 static int handle_written_sbdep(struct sbdep *, struct buf *); 790 static void initiate_write_sbdep(struct sbdep *); 791 static void diradd_inode_written(struct diradd *, struct inodedep *); 792 static int handle_written_indirdep(struct indirdep *, struct buf *, 793 struct buf**); 794 static int handle_written_inodeblock(struct inodedep *, struct buf *); 795 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 796 uint8_t *); 797 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 798 static void handle_written_jaddref(struct jaddref *); 799 static void handle_written_jremref(struct jremref *); 800 static void handle_written_jseg(struct jseg *, struct buf *); 801 static void handle_written_jnewblk(struct jnewblk *); 802 static void handle_written_jblkdep(struct jblkdep *); 803 static void handle_written_jfreefrag(struct jfreefrag *); 804 static void complete_jseg(struct jseg *); 805 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 806 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 807 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 808 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 809 static void 
jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 810 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 811 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 812 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 813 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 814 static inline void inoref_write(struct inoref *, struct jseg *, 815 struct jrefrec *); 816 static void handle_allocdirect_partdone(struct allocdirect *, 817 struct workhead *); 818 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 819 struct workhead *); 820 static void indirdep_complete(struct indirdep *); 821 static int indirblk_lookup(struct mount *, ufs2_daddr_t); 822 static void indirblk_insert(struct freework *); 823 static void indirblk_remove(struct freework *); 824 static void handle_allocindir_partdone(struct allocindir *); 825 static void initiate_write_filepage(struct pagedep *, struct buf *); 826 static void initiate_write_indirdep(struct indirdep*, struct buf *); 827 static void handle_written_mkdir(struct mkdir *, int); 828 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 829 uint8_t *); 830 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 831 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 832 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 833 static void handle_workitem_freefile(struct freefile *); 834 static int handle_workitem_remove(struct dirrem *, int); 835 static struct dirrem *newdirrem(struct buf *, struct inode *, 836 struct inode *, int, struct dirrem **); 837 static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 838 struct buf *); 839 static void cancel_indirdep(struct indirdep *, struct buf *, 840 struct freeblks *); 841 static void free_indirdep(struct indirdep *); 842 static void free_diradd(struct diradd *, struct workhead *); 843 static void merge_diradd(struct inodedep *, struct diradd *); 844 static void complete_diradd(struct diradd *); 845 static struct diradd *diradd_lookup(struct pagedep *, int); 846 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 847 struct jremref *); 848 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 849 struct jremref *); 850 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 851 struct jremref *, struct jremref *); 852 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 853 struct jremref *); 854 static void cancel_allocindir(struct allocindir *, struct buf *bp, 855 struct freeblks *, int); 856 static int setup_trunc_indir(struct freeblks *, struct inode *, 857 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 858 static void complete_trunc_indir(struct freework *); 859 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 860 int); 861 static void complete_mkdir(struct mkdir *); 862 static void free_newdirblk(struct newdirblk *); 863 static void free_jremref(struct jremref *); 864 static void free_jaddref(struct jaddref *); 865 static void free_jsegdep(struct jsegdep *); 866 static void free_jsegs(struct jblocks *); 867 static void rele_jseg(struct jseg *); 868 static void free_jseg(struct jseg *, struct jblocks *); 869 static void free_jnewblk(struct jnewblk *); 870 static void free_jblkdep(struct jblkdep *); 871 static void free_jfreefrag(struct jfreefrag *); 872 static void free_freedep(struct 
freedep *); 873 static void journal_jremref(struct dirrem *, struct jremref *, 874 struct inodedep *); 875 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 876 static int cancel_jaddref(struct jaddref *, struct inodedep *, 877 struct workhead *); 878 static void cancel_jfreefrag(struct jfreefrag *); 879 static inline void setup_freedirect(struct freeblks *, struct inode *, 880 int, int); 881 static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 882 static inline void setup_freeindir(struct freeblks *, struct inode *, int, 883 ufs_lbn_t, int); 884 static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 885 static void freeblks_free(struct ufsmount *, struct freeblks *, int); 886 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 887 ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 888 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 889 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 890 int, int); 891 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 892 static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 893 static int deallocate_dependencies(struct buf *, struct freeblks *, int); 894 static void newblk_freefrag(struct newblk*); 895 static void free_newblk(struct newblk *); 896 static void cancel_allocdirect(struct allocdirectlst *, 897 struct allocdirect *, struct freeblks *); 898 static int check_inode_unwritten(struct inodedep *); 899 static int free_inodedep(struct inodedep *); 900 static void freework_freeblock(struct freework *); 901 static void freework_enqueue(struct freework *); 902 static int handle_workitem_freeblocks(struct freeblks *, int); 903 static int handle_complete_freeblocks(struct freeblks *, int); 904 static void handle_workitem_indirblk(struct freework *); 905 static void handle_written_freework(struct freework *); 906 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 907 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 908 struct workhead *); 909 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 910 struct inodedep *, struct allocindir *, ufs_lbn_t); 911 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 912 ufs2_daddr_t, ufs_lbn_t); 913 static void handle_workitem_freefrag(struct freefrag *); 914 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 915 ufs_lbn_t); 916 static void allocdirect_merge(struct allocdirectlst *, 917 struct allocdirect *, struct allocdirect *); 918 static struct freefrag *allocindir_merge(struct allocindir *, 919 struct allocindir *); 920 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 921 struct bmsafemap **); 922 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 923 int cg); 924 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 925 int, struct newblk **); 926 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 927 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 928 struct inodedep **); 929 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 930 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 931 int, struct pagedep **); 932 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 933 struct mount *mp, int, struct pagedep **); 934 static void 
pause_timer(void *); 935 static int request_cleanup(struct mount *, int); 936 static int process_worklist_item(struct mount *, int, int); 937 static void process_removes(struct vnode *); 938 static void process_truncates(struct vnode *); 939 static void jwork_move(struct workhead *, struct workhead *); 940 static void jwork_insert(struct workhead *, struct jsegdep *); 941 static void add_to_worklist(struct worklist *, int); 942 static void wake_worklist(struct worklist *); 943 static void wait_worklist(struct worklist *, char *); 944 static void remove_from_worklist(struct worklist *); 945 static void softdep_flush(void); 946 static void softdep_flushjournal(struct mount *); 947 static int softdep_speedup(void); 948 static void worklist_speedup(void); 949 static int journal_mount(struct mount *, struct fs *, struct ucred *); 950 static void journal_unmount(struct mount *); 951 static int journal_space(struct ufsmount *, int); 952 static void journal_suspend(struct ufsmount *); 953 static int journal_unsuspend(struct ufsmount *ump); 954 static void softdep_prelink(struct vnode *, struct vnode *); 955 static void add_to_journal(struct worklist *); 956 static void remove_from_journal(struct worklist *); 957 static void softdep_process_journal(struct mount *, struct worklist *, int); 958 static struct jremref *newjremref(struct dirrem *, struct inode *, 959 struct inode *ip, off_t, nlink_t); 960 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 961 uint16_t); 962 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 963 uint16_t); 964 static inline struct jsegdep *inoref_jseg(struct inoref *); 965 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 966 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 967 ufs2_daddr_t, int); 968 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 969 static void move_newblock_dep(struct jaddref *, struct inodedep *); 970 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); 971 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 972 ufs2_daddr_t, long, ufs_lbn_t); 973 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 974 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 975 static int jwait(struct worklist *, int); 976 static struct inodedep *inodedep_lookup_ip(struct inode *); 977 static int bmsafemap_rollbacks(struct bmsafemap *); 978 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 979 static void handle_jwork(struct workhead *); 980 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 981 struct mkdir **); 982 static struct jblocks *jblocks_create(void); 983 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 984 static void jblocks_free(struct jblocks *, struct mount *, int); 985 static void jblocks_destroy(struct jblocks *); 986 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 987 988 /* 989 * Exported softdep operations. 
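 *
 * These four routines are not called directly by the rest of FFS; elsewhere
 * in this file softdep_initialize() hooks them into the buffer cache through
 * the global bioops vector.  A sketch of that wiring, shown here only for
 * orientation (field names are those of struct bioops in sys/buf.h):
 */
#if 0
	bioops.io_start = softdep_disk_io_initiation;
	bioops.io_complete = softdep_disk_write_complete;
	bioops.io_deallocate = softdep_deallocate_dependencies;
	bioops.io_countdeps = softdep_count_dependencies;
#endif
/*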
990 */ 991 static void softdep_disk_io_initiation(struct buf *); 992 static void softdep_disk_write_complete(struct buf *); 993 static void softdep_deallocate_dependencies(struct buf *); 994 static int softdep_count_dependencies(struct buf *bp, int); 995 996 static struct mtx lk; 997 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 998 999 #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 1000 #define ACQUIRE_LOCK(lk) mtx_lock(lk) 1001 #define FREE_LOCK(lk) mtx_unlock(lk) 1002 1003 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 1004 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 1005 1006 /* 1007 * Worklist queue management. 1008 * These routines require that the lock be held. 1009 */ 1010 #ifndef /* NOT */ DEBUG 1011 #define WORKLIST_INSERT(head, item) do { \ 1012 (item)->wk_state |= ONWORKLIST; \ 1013 LIST_INSERT_HEAD(head, item, wk_list); \ 1014 } while (0) 1015 #define WORKLIST_REMOVE(item) do { \ 1016 (item)->wk_state &= ~ONWORKLIST; \ 1017 LIST_REMOVE(item, wk_list); \ 1018 } while (0) 1019 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 1020 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 1021 1022 #else /* DEBUG */ 1023 static void worklist_insert(struct workhead *, struct worklist *, int); 1024 static void worklist_remove(struct worklist *, int); 1025 1026 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) 1027 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) 1028 #define WORKLIST_REMOVE(item) worklist_remove(item, 1) 1029 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) 1030 1031 static void 1032 worklist_insert(head, item, locked) 1033 struct workhead *head; 1034 struct worklist *item; 1035 int locked; 1036 { 1037 1038 if (locked) 1039 mtx_assert(&lk, MA_OWNED); 1040 if (item->wk_state & ONWORKLIST) 1041 panic("worklist_insert: %p %s(0x%X) already on list", 1042 item, TYPENAME(item->wk_type), item->wk_state); 1043 item->wk_state |= ONWORKLIST; 1044 LIST_INSERT_HEAD(head, item, wk_list); 1045 } 1046 1047 static void 1048 worklist_remove(item, locked) 1049 struct worklist *item; 1050 int locked; 1051 { 1052 1053 if (locked) 1054 mtx_assert(&lk, MA_OWNED); 1055 if ((item->wk_state & ONWORKLIST) == 0) 1056 panic("worklist_remove: %p %s(0x%X) not on list", 1057 item, TYPENAME(item->wk_type), item->wk_state); 1058 item->wk_state &= ~ONWORKLIST; 1059 LIST_REMOVE(item, wk_list); 1060 } 1061 #endif /* DEBUG */ 1062 1063 /* 1064 * Merge two jsegdeps keeping only the oldest one as newer references 1065 * can't be discarded until after older references. 1066 */ 1067 static inline struct jsegdep * 1068 jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 1069 { 1070 struct jsegdep *swp; 1071 1072 if (two == NULL) 1073 return (one); 1074 1075 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 1076 swp = one; 1077 one = two; 1078 two = swp; 1079 } 1080 WORKLIST_REMOVE(&two->jd_list); 1081 free_jsegdep(two); 1082 1083 return (one); 1084 } 1085 1086 /* 1087 * If two freedeps are compatible free one to reduce list size. 1088 */ 1089 static inline struct freedep * 1090 freedep_merge(struct freedep *one, struct freedep *two) 1091 { 1092 if (two == NULL) 1093 return (one); 1094 1095 if (one->fd_freework == two->fd_freework) { 1096 WORKLIST_REMOVE(&two->fd_list); 1097 free_freedep(two); 1098 } 1099 return (one); 1100 } 1101 1102 /* 1103 * Move journal work from one list to another. Duplicate freedeps and 1104 * jsegdeps are coalesced to keep the lists as small as possible. 
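 *
 * A minimal usage sketch of jwork_move() (the two work lists below are
 * invented for the example; the softdep lock must be held across the call,
 * as the body asserts):
 */
#if 0
	struct workhead dst_jwork, src_jwork;

	LIST_INIT(&dst_jwork);
	LIST_INIT(&src_jwork);
	/* ... jsegdeps and freedeps accumulate on src_jwork ... */
	ACQUIRE_LOCK(&lk);
	jwork_move(&dst_jwork, &src_jwork);	/* src drained, duplicates merged */
	FREE_LOCK(&lk);
#endif
/*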
1105 */ 1106 static void 1107 jwork_move(dst, src) 1108 struct workhead *dst; 1109 struct workhead *src; 1110 { 1111 struct freedep *freedep; 1112 struct jsegdep *jsegdep; 1113 struct worklist *wkn; 1114 struct worklist *wk; 1115 1116 KASSERT(dst != src, 1117 ("jwork_move: dst == src")); 1118 freedep = NULL; 1119 jsegdep = NULL; 1120 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1121 if (wk->wk_type == D_JSEGDEP) 1122 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1123 if (wk->wk_type == D_FREEDEP) 1124 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1125 } 1126 1127 mtx_assert(&lk, MA_OWNED); 1128 while ((wk = LIST_FIRST(src)) != NULL) { 1129 WORKLIST_REMOVE(wk); 1130 WORKLIST_INSERT(dst, wk); 1131 if (wk->wk_type == D_JSEGDEP) { 1132 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1133 continue; 1134 } 1135 if (wk->wk_type == D_FREEDEP) 1136 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1137 } 1138 } 1139 1140 static void 1141 jwork_insert(dst, jsegdep) 1142 struct workhead *dst; 1143 struct jsegdep *jsegdep; 1144 { 1145 struct jsegdep *jsegdepn; 1146 struct worklist *wk; 1147 1148 LIST_FOREACH(wk, dst, wk_list) 1149 if (wk->wk_type == D_JSEGDEP) 1150 break; 1151 if (wk == NULL) { 1152 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1153 return; 1154 } 1155 jsegdepn = WK_JSEGDEP(wk); 1156 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { 1157 WORKLIST_REMOVE(wk); 1158 free_jsegdep(jsegdepn); 1159 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1160 } else 1161 free_jsegdep(jsegdep); 1162 } 1163 1164 /* 1165 * Routines for tracking and managing workitems. 1166 */ 1167 static void workitem_free(struct worklist *, int); 1168 static void workitem_alloc(struct worklist *, int, struct mount *); 1169 1170 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 1171 1172 static void 1173 workitem_free(item, type) 1174 struct worklist *item; 1175 int type; 1176 { 1177 struct ufsmount *ump; 1178 mtx_assert(&lk, MA_OWNED); 1179 1180 #ifdef DEBUG 1181 if (item->wk_state & ONWORKLIST) 1182 panic("workitem_free: %s(0x%X) still on list", 1183 TYPENAME(item->wk_type), item->wk_state); 1184 if (item->wk_type != type) 1185 panic("workitem_free: type mismatch %s != %s", 1186 TYPENAME(item->wk_type), TYPENAME(type)); 1187 #endif 1188 if (item->wk_state & IOWAITING) 1189 wakeup(item); 1190 ump = VFSTOUFS(item->wk_mp); 1191 if (--ump->softdep_deps == 0 && ump->softdep_req) 1192 wakeup(&ump->softdep_deps); 1193 dep_current[type]--; 1194 free(item, DtoM(type)); 1195 } 1196 1197 static void 1198 workitem_alloc(item, type, mp) 1199 struct worklist *item; 1200 int type; 1201 struct mount *mp; 1202 { 1203 struct ufsmount *ump; 1204 1205 item->wk_type = type; 1206 item->wk_mp = mp; 1207 item->wk_state = 0; 1208 1209 ump = VFSTOUFS(mp); 1210 ACQUIRE_LOCK(&lk); 1211 dep_current[type]++; 1212 dep_total[type]++; 1213 ump->softdep_deps++; 1214 ump->softdep_accdeps++; 1215 FREE_LOCK(&lk); 1216 } 1217 1218 /* 1219 * Workitem queue management 1220 */ 1221 static int max_softdeps; /* maximum number of structs before slowdown */ 1222 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1223 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1224 static int proc_waiting; /* tracks whether we have a timeout posted */ 1225 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1226 static struct callout softdep_callout; 1227 static int req_pending; 1228 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1229 static 
int req_clear_remove; /* syncer process flush some freeblks */ 1230 1231 /* 1232 * runtime statistics 1233 */ 1234 static int stat_worklist_push; /* number of worklist cleanups */ 1235 static int stat_blk_limit_push; /* number of times block limit neared */ 1236 static int stat_ino_limit_push; /* number of times inode limit neared */ 1237 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1238 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1239 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1240 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1241 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1242 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1243 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1244 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1245 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1246 static int stat_journal_min; /* Times hit journal min threshold */ 1247 static int stat_journal_low; /* Times hit journal low threshold */ 1248 static int stat_journal_wait; /* Times blocked in jwait(). */ 1249 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1250 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1251 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1252 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1253 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1254 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1255 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1256 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1257 static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1258 1259 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1260 &max_softdeps, 0, ""); 1261 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1262 &tickdelay, 0, ""); 1263 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1264 &maxindirdeps, 0, ""); 1265 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1266 &stat_worklist_push, 0,""); 1267 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1268 &stat_blk_limit_push, 0,""); 1269 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1270 &stat_ino_limit_push, 0,""); 1271 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1272 &stat_blk_limit_hit, 0, ""); 1273 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1274 &stat_ino_limit_hit, 0, ""); 1275 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1276 &stat_sync_limit_hit, 0, ""); 1277 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1278 &stat_indir_blk_ptrs, 0, ""); 1279 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1280 &stat_inode_bitmap, 0, ""); 1281 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1282 &stat_direct_blk_ptrs, 0, ""); 1283 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1284 &stat_dir_entry, 0, ""); 1285 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1286 &stat_jaddref, 0, ""); 1287 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1288 &stat_jnewblk, 0, ""); 1289 
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, 1290 &stat_journal_low, 0, ""); 1291 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 1292 &stat_journal_min, 0, ""); 1293 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 1294 &stat_journal_wait, 0, ""); 1295 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 1296 &stat_jwait_filepage, 0, ""); 1297 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1298 &stat_jwait_freeblks, 0, ""); 1299 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1300 &stat_jwait_inode, 0, ""); 1301 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1302 &stat_jwait_newblk, 0, ""); 1303 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, 1304 &stat_cleanup_blkrequests, 0, ""); 1305 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, 1306 &stat_cleanup_inorequests, 0, ""); 1307 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, 1308 &stat_cleanup_high_delay, 0, ""); 1309 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, 1310 &stat_cleanup_retries, 0, ""); 1311 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, 1312 &stat_cleanup_failures, 0, ""); 1313 1314 SYSCTL_DECL(_vfs_ffs); 1315 1316 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1317 static u_long bmsafemap_hash; /* size of hash table - 1 */ 1318 1319 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 1320 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1321 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1322 1323 static struct proc *softdepproc; 1324 static struct kproc_desc softdep_kp = { 1325 "softdepflush", 1326 softdep_flush, 1327 &softdepproc 1328 }; 1329 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1330 &softdep_kp); 1331 1332 static void 1333 softdep_flush(void) 1334 { 1335 struct mount *nmp; 1336 struct mount *mp; 1337 struct ufsmount *ump; 1338 struct thread *td; 1339 int remaining; 1340 int progress; 1341 int vfslocked; 1342 1343 td = curthread; 1344 td->td_pflags |= TDP_NORUNNINGBUF; 1345 1346 for (;;) { 1347 kproc_suspend_check(softdepproc); 1348 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1349 ACQUIRE_LOCK(&lk); 1350 /* 1351 * If requested, try removing inode or removal dependencies. 
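 *
 * The other half of this handshake lives in request_cleanup() later in the
 * file: a thread that has hit the dependency limits bumps req_clear_inodedeps
 * or req_clear_remove and sleeps on proc_waiting until the checks below run.
 * A simplified sketch of that requesting side (the real code also arms
 * pause_timer() through a callout, omitted here):
 */
#if 0
	/* Requester, softdep lock held. */
	req_clear_remove += 1;
	proc_waiting += 1;
	msleep(&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
#endif
/*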
1352 */ 1353 if (req_clear_inodedeps) { 1354 clear_inodedeps(td); 1355 req_clear_inodedeps -= 1; 1356 wakeup_one(&proc_waiting); 1357 } 1358 if (req_clear_remove) { 1359 clear_remove(td); 1360 req_clear_remove -= 1; 1361 wakeup_one(&proc_waiting); 1362 } 1363 FREE_LOCK(&lk); 1364 VFS_UNLOCK_GIANT(vfslocked); 1365 remaining = progress = 0; 1366 mtx_lock(&mountlist_mtx); 1367 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1368 nmp = TAILQ_NEXT(mp, mnt_list); 1369 if (MOUNTEDSOFTDEP(mp) == 0) 1370 continue; 1371 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1372 continue; 1373 vfslocked = VFS_LOCK_GIANT(mp); 1374 progress += softdep_process_worklist(mp, 0); 1375 ump = VFSTOUFS(mp); 1376 remaining += ump->softdep_on_worklist; 1377 VFS_UNLOCK_GIANT(vfslocked); 1378 mtx_lock(&mountlist_mtx); 1379 nmp = TAILQ_NEXT(mp, mnt_list); 1380 vfs_unbusy(mp); 1381 } 1382 mtx_unlock(&mountlist_mtx); 1383 if (remaining && progress) 1384 continue; 1385 ACQUIRE_LOCK(&lk); 1386 if (!req_pending) 1387 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1388 req_pending = 0; 1389 FREE_LOCK(&lk); 1390 } 1391 } 1392 1393 static void 1394 worklist_speedup(void) 1395 { 1396 mtx_assert(&lk, MA_OWNED); 1397 if (req_pending == 0) { 1398 req_pending = 1; 1399 wakeup(&req_pending); 1400 } 1401 } 1402 1403 static int 1404 softdep_speedup(void) 1405 { 1406 1407 worklist_speedup(); 1408 bd_speedup(); 1409 return speedup_syncer(); 1410 } 1411 1412 /* 1413 * Add an item to the end of the work queue. 1414 * This routine requires that the lock be held. 1415 * This is the only routine that adds items to the list. 1416 * The following routine is the only one that removes items 1417 * and does so in order from first to last. 1418 */ 1419 1420 #define WK_HEAD 0x0001 /* Add to HEAD. */ 1421 #define WK_NODELAY 0x0002 /* Process immediately. */ 1422 1423 static void 1424 add_to_worklist(wk, flags) 1425 struct worklist *wk; 1426 int flags; 1427 { 1428 struct ufsmount *ump; 1429 1430 mtx_assert(&lk, MA_OWNED); 1431 ump = VFSTOUFS(wk->wk_mp); 1432 if (wk->wk_state & ONWORKLIST) 1433 panic("add_to_worklist: %s(0x%X) already on list", 1434 TYPENAME(wk->wk_type), wk->wk_state); 1435 wk->wk_state |= ONWORKLIST; 1436 if (ump->softdep_on_worklist == 0) { 1437 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1438 ump->softdep_worklist_tail = wk; 1439 } else if (flags & WK_HEAD) { 1440 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1441 } else { 1442 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1443 ump->softdep_worklist_tail = wk; 1444 } 1445 ump->softdep_on_worklist += 1; 1446 if (flags & WK_NODELAY) 1447 worklist_speedup(); 1448 } 1449 1450 /* 1451 * Remove the item to be processed. If we are removing the last 1452 * item on the list, we need to recalculate the tail pointer. 
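 *
 * Items reach this queue through add_to_worklist() above, always with the
 * softdep lock held.  A minimal sketch of the enqueue side, using a dirrem
 * as the example work item (hypothetical call site):
 */
#if 0
	ACQUIRE_LOCK(&lk);
	add_to_worklist(&dirrem->dm_list, WK_NODELAY);	/* process promptly */
	FREE_LOCK(&lk);
#endif
/*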
1453 */ 1454 static void 1455 remove_from_worklist(wk) 1456 struct worklist *wk; 1457 { 1458 struct ufsmount *ump; 1459 1460 ump = VFSTOUFS(wk->wk_mp); 1461 WORKLIST_REMOVE(wk); 1462 if (ump->softdep_worklist_tail == wk) 1463 ump->softdep_worklist_tail = 1464 (struct worklist *)wk->wk_list.le_prev; 1465 ump->softdep_on_worklist -= 1; 1466 } 1467 1468 static void 1469 wake_worklist(wk) 1470 struct worklist *wk; 1471 { 1472 if (wk->wk_state & IOWAITING) { 1473 wk->wk_state &= ~IOWAITING; 1474 wakeup(wk); 1475 } 1476 } 1477 1478 static void 1479 wait_worklist(wk, wmesg) 1480 struct worklist *wk; 1481 char *wmesg; 1482 { 1483 1484 wk->wk_state |= IOWAITING; 1485 msleep(wk, &lk, PVM, wmesg, 0); 1486 } 1487 1488 /* 1489 * Process that runs once per second to handle items in the background queue. 1490 * 1491 * Note that we ensure that everything is done in the order in which they 1492 * appear in the queue. The code below depends on this property to ensure 1493 * that blocks of a file are freed before the inode itself is freed. This 1494 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1495 * until all the old ones have been purged from the dependency lists. 1496 */ 1497 int 1498 softdep_process_worklist(mp, full) 1499 struct mount *mp; 1500 int full; 1501 { 1502 struct thread *td = curthread; 1503 int cnt, matchcnt; 1504 struct ufsmount *ump; 1505 long starttime; 1506 1507 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1508 /* 1509 * Record the process identifier of our caller so that we can give 1510 * this process preferential treatment in request_cleanup below. 1511 */ 1512 matchcnt = 0; 1513 ump = VFSTOUFS(mp); 1514 ACQUIRE_LOCK(&lk); 1515 starttime = time_second; 1516 softdep_process_journal(mp, NULL, full?MNT_WAIT:0); 1517 while (ump->softdep_on_worklist > 0) { 1518 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) 1519 break; 1520 else 1521 matchcnt += cnt; 1522 /* 1523 * If requested, try removing inode or removal dependencies. 1524 */ 1525 if (req_clear_inodedeps) { 1526 clear_inodedeps(td); 1527 req_clear_inodedeps -= 1; 1528 wakeup_one(&proc_waiting); 1529 } 1530 if (req_clear_remove) { 1531 clear_remove(td); 1532 req_clear_remove -= 1; 1533 wakeup_one(&proc_waiting); 1534 } 1535 /* 1536 * We do not generally want to stop for buffer space, but if 1537 * we are really being a buffer hog, we will stop and wait. 1538 */ 1539 if (should_yield()) { 1540 FREE_LOCK(&lk); 1541 kern_yield(PRI_UNCHANGED); 1542 bwillwrite(); 1543 ACQUIRE_LOCK(&lk); 1544 } 1545 /* 1546 * Never allow processing to run for more than one 1547 * second. Otherwise the other mountpoints may get 1548 * excessively backlogged. 1549 */ 1550 if (!full && starttime != time_second) 1551 break; 1552 } 1553 if (full == 0) 1554 journal_unsuspend(ump); 1555 FREE_LOCK(&lk); 1556 return (matchcnt); 1557 } 1558 1559 /* 1560 * Process all removes associated with a vnode if we are running out of 1561 * journal space. Any other process which attempts to flush these will 1562 * be unable as we have the vnodes locked. 
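 *
 * For the caller's view of softdep_process_worklist() above: a mount is
 * normally drained by calling it in a loop and fsync'ing the device between
 * passes, as softdep_flushworklist() does further down.  A condensed sketch:
 */
#if 0
	while ((count = softdep_process_worklist(mp, 1)) > 0) {
		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
		VOP_UNLOCK(ump->um_devvp, 0);
		if (error)
			break;
	}
#endif
/*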
1563 */ 1564 static void 1565 process_removes(vp) 1566 struct vnode *vp; 1567 { 1568 struct inodedep *inodedep; 1569 struct dirrem *dirrem; 1570 struct mount *mp; 1571 ino_t inum; 1572 1573 mtx_assert(&lk, MA_OWNED); 1574 1575 mp = vp->v_mount; 1576 inum = VTOI(vp)->i_number; 1577 for (;;) { 1578 top: 1579 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1580 return; 1581 LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { 1582 /* 1583 * If another thread is trying to lock this vnode 1584 * it will fail but we must wait for it to do so 1585 * before we can proceed. 1586 */ 1587 if (dirrem->dm_state & INPROGRESS) { 1588 wait_worklist(&dirrem->dm_list, "pwrwait"); 1589 goto top; 1590 } 1591 if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == 1592 (COMPLETE | ONWORKLIST)) 1593 break; 1594 } 1595 if (dirrem == NULL) 1596 return; 1597 remove_from_worklist(&dirrem->dm_list); 1598 FREE_LOCK(&lk); 1599 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1600 panic("process_removes: suspended filesystem"); 1601 handle_workitem_remove(dirrem, 0); 1602 vn_finished_secondary_write(mp); 1603 ACQUIRE_LOCK(&lk); 1604 } 1605 } 1606 1607 /* 1608 * Process all truncations associated with a vnode if we are running out 1609 * of journal space. This is called when the vnode lock is already held 1610 * and no other process can clear the truncation. This function returns 1611 * a value greater than zero if it did any work. 1612 */ 1613 static void 1614 process_truncates(vp) 1615 struct vnode *vp; 1616 { 1617 struct inodedep *inodedep; 1618 struct freeblks *freeblks; 1619 struct mount *mp; 1620 ino_t inum; 1621 int cgwait; 1622 1623 mtx_assert(&lk, MA_OWNED); 1624 1625 mp = vp->v_mount; 1626 inum = VTOI(vp)->i_number; 1627 for (;;) { 1628 if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) 1629 return; 1630 cgwait = 0; 1631 TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { 1632 /* Journal entries not yet written. */ 1633 if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { 1634 jwait(&LIST_FIRST( 1635 &freeblks->fb_jblkdephd)->jb_list, 1636 MNT_WAIT); 1637 break; 1638 } 1639 /* Another thread is executing this item. */ 1640 if (freeblks->fb_state & INPROGRESS) { 1641 wait_worklist(&freeblks->fb_list, "ptrwait"); 1642 break; 1643 } 1644 /* Freeblks is waiting on a inode write. */ 1645 if ((freeblks->fb_state & COMPLETE) == 0) { 1646 FREE_LOCK(&lk); 1647 ffs_update(vp, 1); 1648 ACQUIRE_LOCK(&lk); 1649 break; 1650 } 1651 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == 1652 (ALLCOMPLETE | ONWORKLIST)) { 1653 remove_from_worklist(&freeblks->fb_list); 1654 freeblks->fb_state |= INPROGRESS; 1655 FREE_LOCK(&lk); 1656 if (vn_start_secondary_write(NULL, &mp, 1657 V_NOWAIT)) 1658 panic("process_truncates: " 1659 "suspended filesystem"); 1660 handle_workitem_freeblocks(freeblks, 0); 1661 vn_finished_secondary_write(mp); 1662 ACQUIRE_LOCK(&lk); 1663 break; 1664 } 1665 if (freeblks->fb_cgwait) 1666 cgwait++; 1667 } 1668 if (cgwait) { 1669 FREE_LOCK(&lk); 1670 sync_cgs(mp, MNT_WAIT); 1671 ffs_sync_snap(mp, MNT_WAIT); 1672 ACQUIRE_LOCK(&lk); 1673 continue; 1674 } 1675 if (freeblks == NULL) 1676 break; 1677 } 1678 return; 1679 } 1680 1681 /* 1682 * Process one item on the worklist. 
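 *
 * The routine below keeps its place in the per-mount queue with an on-stack
 * sentinel entry, so it can drop the softdep lock while each item is handled
 * and resume the scan afterwards (other scanners' sentinels are simply
 * hopped over; that detail is omitted from this sketch):
 */
#if 0
	struct worklist sentinel;		/* marks our position */

	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
	while ((wk = LIST_NEXT(&sentinel, wk_list)) != NULL) {
		remove_from_worklist(wk);	/* claim the item after us */
		FREE_LOCK(&lk);
		/* ... process wk with the lock dropped ... */
		ACQUIRE_LOCK(&lk);
	}
	LIST_REMOVE(&sentinel, wk_list);
#endif
/*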
1683 */ 1684 static int 1685 process_worklist_item(mp, target, flags) 1686 struct mount *mp; 1687 int target; 1688 int flags; 1689 { 1690 struct worklist sintenel; 1691 struct worklist *wk; 1692 struct ufsmount *ump; 1693 int matchcnt; 1694 int error; 1695 1696 mtx_assert(&lk, MA_OWNED); 1697 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1698 /* 1699 * If we are being called because of a process doing a 1700 * copy-on-write, then it is not safe to write as we may 1701 * recurse into the copy-on-write routine. 1702 */ 1703 if (curthread->td_pflags & TDP_COWINPROGRESS) 1704 return (-1); 1705 PHOLD(curproc); /* Don't let the stack go away. */ 1706 ump = VFSTOUFS(mp); 1707 matchcnt = 0; 1708 sintenel.wk_mp = NULL; 1709 sintenel.wk_type = D_SENTINAL; 1710 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sintenel, wk_list); 1711 for (wk = LIST_NEXT(&sintenel, wk_list); wk != NULL; 1712 wk = LIST_NEXT(&sintenel, wk_list)) { 1713 if (wk->wk_type == D_SENTINAL) { 1714 LIST_REMOVE(&sintenel, wk_list); 1715 LIST_INSERT_AFTER(wk, &sintenel, wk_list); 1716 continue; 1717 } 1718 if (wk->wk_state & INPROGRESS) 1719 panic("process_worklist_item: %p already in progress.", 1720 wk); 1721 wk->wk_state |= INPROGRESS; 1722 remove_from_worklist(wk); 1723 FREE_LOCK(&lk); 1724 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1725 panic("process_worklist_item: suspended filesystem"); 1726 switch (wk->wk_type) { 1727 case D_DIRREM: 1728 /* removal of a directory entry */ 1729 error = handle_workitem_remove(WK_DIRREM(wk), flags); 1730 break; 1731 1732 case D_FREEBLKS: 1733 /* releasing blocks and/or fragments from a file */ 1734 error = handle_workitem_freeblocks(WK_FREEBLKS(wk), 1735 flags); 1736 break; 1737 1738 case D_FREEFRAG: 1739 /* releasing a fragment when replaced as a file grows */ 1740 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1741 error = 0; 1742 break; 1743 1744 case D_FREEFILE: 1745 /* releasing an inode when its link count drops to 0 */ 1746 handle_workitem_freefile(WK_FREEFILE(wk)); 1747 error = 0; 1748 break; 1749 1750 default: 1751 panic("%s_process_worklist: Unknown type %s", 1752 "softdep", TYPENAME(wk->wk_type)); 1753 /* NOTREACHED */ 1754 } 1755 vn_finished_secondary_write(mp); 1756 ACQUIRE_LOCK(&lk); 1757 if (error == 0) { 1758 if (++matchcnt == target) 1759 break; 1760 continue; 1761 } 1762 /* 1763 * We have to retry the worklist item later. Wake up any 1764 * waiters who may be able to complete it immediately and 1765 * add the item back to the head so we don't try to execute 1766 * it again. 1767 */ 1768 wk->wk_state &= ~INPROGRESS; 1769 wake_worklist(wk); 1770 add_to_worklist(wk, WK_HEAD); 1771 } 1772 LIST_REMOVE(&sintenel, wk_list); 1773 /* Sentinal could've become the tail from remove_from_worklist. */ 1774 if (ump->softdep_worklist_tail == &sintenel) 1775 ump->softdep_worklist_tail = 1776 (struct worklist *)sintenel.wk_list.le_prev; 1777 PRELE(curproc); 1778 return (matchcnt); 1779 } 1780 1781 /* 1782 * Move dependencies from one buffer to another. 
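 *
 * A hypothetical caller that has copied oldbp's contents into a replacement
 * buffer: the dependency list must travel with the data, and a non-zero
 * return (bmsafemap rollbacks pending) means the destination cannot yet be
 * treated as clean.
 */
#if 0
	if (!LIST_EMPTY(&oldbp->b_dep) &&
	    softdep_move_dependencies(oldbp, newbp) != 0)
		bdirty(newbp);		/* rollback copies still outstanding */
#endif
/*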
1783 */ 1784 int 1785 softdep_move_dependencies(oldbp, newbp) 1786 struct buf *oldbp; 1787 struct buf *newbp; 1788 { 1789 struct worklist *wk, *wktail; 1790 int dirty; 1791 1792 dirty = 0; 1793 wktail = NULL; 1794 ACQUIRE_LOCK(&lk); 1795 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1796 LIST_REMOVE(wk, wk_list); 1797 if (wk->wk_type == D_BMSAFEMAP && 1798 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1799 dirty = 1; 1800 if (wktail == 0) 1801 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1802 else 1803 LIST_INSERT_AFTER(wktail, wk, wk_list); 1804 wktail = wk; 1805 } 1806 FREE_LOCK(&lk); 1807 1808 return (dirty); 1809 } 1810 1811 /* 1812 * Purge the work list of all items associated with a particular mount point. 1813 */ 1814 int 1815 softdep_flushworklist(oldmnt, countp, td) 1816 struct mount *oldmnt; 1817 int *countp; 1818 struct thread *td; 1819 { 1820 struct vnode *devvp; 1821 int count, error = 0; 1822 struct ufsmount *ump; 1823 1824 /* 1825 * Alternately flush the block device associated with the mount 1826 * point and process any dependencies that the flushing 1827 * creates. We continue until no more worklist dependencies 1828 * are found. 1829 */ 1830 *countp = 0; 1831 ump = VFSTOUFS(oldmnt); 1832 devvp = ump->um_devvp; 1833 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1834 *countp += count; 1835 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1836 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1837 VOP_UNLOCK(devvp, 0); 1838 if (error) 1839 break; 1840 } 1841 return (error); 1842 } 1843 1844 int 1845 softdep_waitidle(struct mount *mp) 1846 { 1847 struct ufsmount *ump; 1848 int error; 1849 int i; 1850 1851 ump = VFSTOUFS(mp); 1852 ACQUIRE_LOCK(&lk); 1853 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1854 ump->softdep_req = 1; 1855 if (ump->softdep_on_worklist) 1856 panic("softdep_waitidle: work added after flush."); 1857 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1858 } 1859 ump->softdep_req = 0; 1860 FREE_LOCK(&lk); 1861 error = 0; 1862 if (i == 10) { 1863 error = EBUSY; 1864 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1865 mp); 1866 } 1867 1868 return (error); 1869 } 1870 1871 /* 1872 * Flush all vnodes and worklist items associated with a specified mount point. 1873 */ 1874 int 1875 softdep_flushfiles(oldmnt, flags, td) 1876 struct mount *oldmnt; 1877 int flags; 1878 struct thread *td; 1879 { 1880 int error, depcount, loopcnt, retry_flush_count, retry; 1881 1882 loopcnt = 10; 1883 retry_flush_count = 3; 1884 retry_flush: 1885 error = 0; 1886 1887 /* 1888 * Alternately flush the vnodes associated with the mount 1889 * point and process any dependencies that the flushing 1890 * creates. In theory, this loop can happen at most twice, 1891 * but we give it a few extra just to be sure. 1892 */ 1893 for (; loopcnt > 0; loopcnt--) { 1894 /* 1895 * Do another flush in case any vnodes were brought in 1896 * as part of the cleanup operations. 1897 */ 1898 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1899 break; 1900 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1901 depcount == 0) 1902 break; 1903 } 1904 /* 1905 * If we are unmounting then it is an error to fail. If we 1906 * are simply trying to downgrade to read-only, then filesystem 1907 * activity can keep us busy forever, so we just fail with EBUSY. 
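 *
 * For context, a condensed sketch of the unmount-time caller (simplified
 * from ffs_unmount(); the exact flag handling there differs slightly):
 */
#if 0
	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
	if (MOUNTEDSOFTDEP(mp))
		error = softdep_flushfiles(mp, flags, td);
	else
		error = ffs_flushfiles(mp, flags, td);
#endif
/*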
1908 */ 1909 if (loopcnt == 0) { 1910 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1911 panic("softdep_flushfiles: looping"); 1912 error = EBUSY; 1913 } 1914 if (!error) 1915 error = softdep_waitidle(oldmnt); 1916 if (!error) { 1917 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1918 retry = 0; 1919 MNT_ILOCK(oldmnt); 1920 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1921 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1922 if (oldmnt->mnt_nvnodelistsize > 0) { 1923 if (--retry_flush_count > 0) { 1924 retry = 1; 1925 loopcnt = 3; 1926 } else 1927 error = EBUSY; 1928 } 1929 MNT_IUNLOCK(oldmnt); 1930 if (retry) 1931 goto retry_flush; 1932 } 1933 } 1934 return (error); 1935 } 1936 1937 /* 1938 * Structure hashing. 1939 * 1940 * There are three types of structures that can be looked up: 1941 * 1) pagedep structures identified by mount point, inode number, 1942 * and logical block. 1943 * 2) inodedep structures identified by mount point and inode number. 1944 * 3) newblk structures identified by mount point and 1945 * physical block number. 1946 * 1947 * The "pagedep" and "inodedep" dependency structures are hashed 1948 * separately from the file blocks and inodes to which they correspond. 1949 * This separation helps when the in-memory copy of an inode or 1950 * file block must be replaced. It also obviates the need to access 1951 * an inode or file page when simply updating (or de-allocating) 1952 * dependency structures. Lookup of newblk structures is needed to 1953 * find newly allocated blocks when trying to associate them with 1954 * their allocdirect or allocindir structure. 1955 * 1956 * The lookup routines optionally create and hash a new instance when 1957 * an existing entry is not found. 1958 */ 1959 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1960 #define NODELAY 0x0002 /* cannot do background work */ 1961 1962 /* 1963 * Structures and routines associated with pagedep caching. 1964 */ 1965 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 1966 u_long pagedep_hash; /* size of hash table - 1 */ 1967 #define PAGEDEP_HASH(mp, inum, lbn) \ 1968 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 1969 pagedep_hash]) 1970 1971 static int 1972 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) 1973 struct pagedep_hashhead *pagedephd; 1974 ino_t ino; 1975 ufs_lbn_t lbn; 1976 struct mount *mp; 1977 int flags; 1978 struct pagedep **pagedeppp; 1979 { 1980 struct pagedep *pagedep; 1981 1982 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 1983 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && 1984 mp == pagedep->pd_list.wk_mp) { 1985 *pagedeppp = pagedep; 1986 return (1); 1987 } 1988 } 1989 *pagedeppp = NULL; 1990 return (0); 1991 } 1992 /* 1993 * Look up a pagedep. Return 1 if found, 0 otherwise. 1994 * If not found, allocate if DEPALLOC flag is passed. 1995 * Found or allocated entry is returned in pagedeppp. 1996 * This routine must be called with splbio interrupts blocked. 
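 *
 * An illustrative call, using hypothetical caller names, looks like:
 *
 *	ACQUIRE_LOCK(&lk);
 *	(void) pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
 *
 * A return of 0 means the pagedep was created here and, when bp is given,
 * already linked onto bp->b_dep; a return of 1 means an existing entry was
 * found and returned.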
1997 */ 1998 static int 1999 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) 2000 struct mount *mp; 2001 struct buf *bp; 2002 ino_t ino; 2003 ufs_lbn_t lbn; 2004 int flags; 2005 struct pagedep **pagedeppp; 2006 { 2007 struct pagedep *pagedep; 2008 struct pagedep_hashhead *pagedephd; 2009 struct worklist *wk; 2010 int ret; 2011 int i; 2012 2013 mtx_assert(&lk, MA_OWNED); 2014 if (bp) { 2015 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2016 if (wk->wk_type == D_PAGEDEP) { 2017 *pagedeppp = WK_PAGEDEP(wk); 2018 return (1); 2019 } 2020 } 2021 } 2022 pagedephd = PAGEDEP_HASH(mp, ino, lbn); 2023 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2024 if (ret) { 2025 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) 2026 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); 2027 return (1); 2028 } 2029 if ((flags & DEPALLOC) == 0) 2030 return (0); 2031 FREE_LOCK(&lk); 2032 pagedep = malloc(sizeof(struct pagedep), 2033 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 2034 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 2035 ACQUIRE_LOCK(&lk); 2036 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2037 if (*pagedeppp) { 2038 /* 2039 * This should never happen since we only create pagedeps 2040 * with the vnode lock held. Could be an assert. 2041 */ 2042 WORKITEM_FREE(pagedep, D_PAGEDEP); 2043 return (ret); 2044 } 2045 pagedep->pd_ino = ino; 2046 pagedep->pd_lbn = lbn; 2047 LIST_INIT(&pagedep->pd_dirremhd); 2048 LIST_INIT(&pagedep->pd_pendinghd); 2049 for (i = 0; i < DAHASHSZ; i++) 2050 LIST_INIT(&pagedep->pd_diraddhd[i]); 2051 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 2052 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2053 *pagedeppp = pagedep; 2054 return (0); 2055 } 2056 2057 /* 2058 * Structures and routines associated with inodedep caching. 2059 */ 2060 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 2061 static u_long inodedep_hash; /* size of hash table - 1 */ 2062 #define INODEDEP_HASH(fs, inum) \ 2063 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 2064 2065 static int 2066 inodedep_find(inodedephd, fs, inum, inodedeppp) 2067 struct inodedep_hashhead *inodedephd; 2068 struct fs *fs; 2069 ino_t inum; 2070 struct inodedep **inodedeppp; 2071 { 2072 struct inodedep *inodedep; 2073 2074 LIST_FOREACH(inodedep, inodedephd, id_hash) 2075 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 2076 break; 2077 if (inodedep) { 2078 *inodedeppp = inodedep; 2079 return (1); 2080 } 2081 *inodedeppp = NULL; 2082 2083 return (0); 2084 } 2085 /* 2086 * Look up an inodedep. Return 1 if found, 0 if not found. 2087 * If not found, allocate if DEPALLOC flag is passed. 2088 * Found or allocated entry is returned in inodedeppp. 2089 * This routine must be called with splbio interrupts blocked. 2090 */ 2091 static int 2092 inodedep_lookup(mp, inum, flags, inodedeppp) 2093 struct mount *mp; 2094 ino_t inum; 2095 int flags; 2096 struct inodedep **inodedeppp; 2097 { 2098 struct inodedep *inodedep; 2099 struct inodedep_hashhead *inodedephd; 2100 struct fs *fs; 2101 2102 mtx_assert(&lk, MA_OWNED); 2103 fs = VFSTOUFS(mp)->um_fs; 2104 inodedephd = INODEDEP_HASH(fs, inum); 2105 2106 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 2107 return (1); 2108 if ((flags & DEPALLOC) == 0) 2109 return (0); 2110 /* 2111 * If we are over our limit, try to improve the situation. 
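 * request_cleanup() can sleep waiting for the inode flush to make progress,
 * so callers that cannot block here pass NODELAY and allocate past the
 * limit instead.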
2112 */ 2113 if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0) 2114 request_cleanup(mp, FLUSH_INODES); 2115 FREE_LOCK(&lk); 2116 inodedep = malloc(sizeof(struct inodedep), 2117 M_INODEDEP, M_SOFTDEP_FLAGS); 2118 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 2119 ACQUIRE_LOCK(&lk); 2120 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 2121 WORKITEM_FREE(inodedep, D_INODEDEP); 2122 return (1); 2123 } 2124 inodedep->id_fs = fs; 2125 inodedep->id_ino = inum; 2126 inodedep->id_state = ALLCOMPLETE; 2127 inodedep->id_nlinkdelta = 0; 2128 inodedep->id_savedino1 = NULL; 2129 inodedep->id_savedsize = -1; 2130 inodedep->id_savedextsize = -1; 2131 inodedep->id_savednlink = -1; 2132 inodedep->id_bmsafemap = NULL; 2133 inodedep->id_mkdiradd = NULL; 2134 LIST_INIT(&inodedep->id_dirremhd); 2135 LIST_INIT(&inodedep->id_pendinghd); 2136 LIST_INIT(&inodedep->id_inowait); 2137 LIST_INIT(&inodedep->id_bufwait); 2138 TAILQ_INIT(&inodedep->id_inoreflst); 2139 TAILQ_INIT(&inodedep->id_inoupdt); 2140 TAILQ_INIT(&inodedep->id_newinoupdt); 2141 TAILQ_INIT(&inodedep->id_extupdt); 2142 TAILQ_INIT(&inodedep->id_newextupdt); 2143 TAILQ_INIT(&inodedep->id_freeblklst); 2144 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 2145 *inodedeppp = inodedep; 2146 return (0); 2147 } 2148 2149 /* 2150 * Structures and routines associated with newblk caching. 2151 */ 2152 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 2153 u_long newblk_hash; /* size of hash table - 1 */ 2154 #define NEWBLK_HASH(fs, inum) \ 2155 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 2156 2157 static int 2158 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 2159 struct newblk_hashhead *newblkhd; 2160 struct mount *mp; 2161 ufs2_daddr_t newblkno; 2162 int flags; 2163 struct newblk **newblkpp; 2164 { 2165 struct newblk *newblk; 2166 2167 LIST_FOREACH(newblk, newblkhd, nb_hash) { 2168 if (newblkno != newblk->nb_newblkno) 2169 continue; 2170 if (mp != newblk->nb_list.wk_mp) 2171 continue; 2172 /* 2173 * If we're creating a new dependency don't match those that 2174 * have already been converted to allocdirects. This is for 2175 * a frag extend. 2176 */ 2177 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 2178 continue; 2179 break; 2180 } 2181 if (newblk) { 2182 *newblkpp = newblk; 2183 return (1); 2184 } 2185 *newblkpp = NULL; 2186 return (0); 2187 } 2188 2189 /* 2190 * Look up a newblk. Return 1 if found, 0 if not found. 2191 * If not found, allocate if DEPALLOC flag is passed. 2192 * Found or allocated entry is returned in newblkpp. 
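 * The entry is allocated as a union allblk so that it can later be converted
 * in place into an allocdirect or allocindir once the final use of the block
 * is known.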
2193 */ 2194 static int 2195 newblk_lookup(mp, newblkno, flags, newblkpp) 2196 struct mount *mp; 2197 ufs2_daddr_t newblkno; 2198 int flags; 2199 struct newblk **newblkpp; 2200 { 2201 struct newblk *newblk; 2202 struct newblk_hashhead *newblkhd; 2203 2204 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 2205 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 2206 return (1); 2207 if ((flags & DEPALLOC) == 0) 2208 return (0); 2209 FREE_LOCK(&lk); 2210 newblk = malloc(sizeof(union allblk), M_NEWBLK, 2211 M_SOFTDEP_FLAGS | M_ZERO); 2212 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 2213 ACQUIRE_LOCK(&lk); 2214 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 2215 WORKITEM_FREE(newblk, D_NEWBLK); 2216 return (1); 2217 } 2218 newblk->nb_freefrag = NULL; 2219 LIST_INIT(&newblk->nb_indirdeps); 2220 LIST_INIT(&newblk->nb_newdirblk); 2221 LIST_INIT(&newblk->nb_jwork); 2222 newblk->nb_state = ATTACHED; 2223 newblk->nb_newblkno = newblkno; 2224 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2225 *newblkpp = newblk; 2226 return (0); 2227 } 2228 2229 /* 2230 * Structures and routines associated with freed indirect block caching. 2231 */ 2232 struct freeworklst *indir_hashtbl; 2233 u_long indir_hash; /* size of hash table - 1 */ 2234 #define INDIR_HASH(mp, blkno) \ 2235 (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash]) 2236 2237 /* 2238 * Lookup an indirect block in the indir hash table. The freework is 2239 * removed and potentially freed. The caller must do a blocking journal 2240 * write before writing to the blkno. 2241 */ 2242 static int 2243 indirblk_lookup(mp, blkno) 2244 struct mount *mp; 2245 ufs2_daddr_t blkno; 2246 { 2247 struct freework *freework; 2248 struct freeworklst *wkhd; 2249 2250 wkhd = INDIR_HASH(mp, blkno); 2251 TAILQ_FOREACH(freework, wkhd, fw_next) { 2252 if (freework->fw_blkno != blkno) 2253 continue; 2254 if (freework->fw_list.wk_mp != mp) 2255 continue; 2256 indirblk_remove(freework); 2257 return (1); 2258 } 2259 return (0); 2260 } 2261 2262 /* 2263 * Insert an indirect block represented by freework into the indirblk 2264 * hash table so that it may prevent the block from being re-used prior 2265 * to the journal being written. 2266 */ 2267 static void 2268 indirblk_insert(freework) 2269 struct freework *freework; 2270 { 2271 struct freeblks *freeblks; 2272 struct jsegdep *jsegdep; 2273 struct worklist *wk; 2274 2275 freeblks = freework->fw_freeblks; 2276 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 2277 if (wk->wk_type == D_JSEGDEP) 2278 break; 2279 if (wk == NULL) 2280 return; 2281 2282 jsegdep = WK_JSEGDEP(wk); 2283 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs); 2284 TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp, 2285 freework->fw_blkno), freework, fw_next); 2286 freework->fw_state &= ~DEPCOMPLETE; 2287 } 2288 2289 static void 2290 indirblk_remove(freework) 2291 struct freework *freework; 2292 { 2293 2294 LIST_REMOVE(freework, fw_segs); 2295 TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp, 2296 freework->fw_blkno), freework, fw_next); 2297 freework->fw_state |= DEPCOMPLETE; 2298 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2299 WORKITEM_FREE(freework, D_FREEWORK); 2300 } 2301 2302 /* 2303 * Executed during filesystem system initialization before 2304 * mounting any filesystems. 
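 * The hash tables are sized from desiredvnodes; the indirect block table is
 * rounded down to a power of two so the INDIR_HASH mask works, and the
 * generic bioops hooks are pointed at the soft updates I/O handlers here.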
2305 */ 2306 void 2307 softdep_initialize() 2308 { 2309 int i; 2310 2311 LIST_INIT(&mkdirlisthd); 2312 max_softdeps = desiredvnodes * 4; 2313 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2314 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2315 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2316 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2317 i = 1 << (ffs(desiredvnodes / 10) - 1); 2318 indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK, 2319 M_WAITOK); 2320 indir_hash = i - 1; 2321 for (i = 0; i <= indir_hash; i++) 2322 TAILQ_INIT(&indir_hashtbl[i]); 2323 2324 /* initialise bioops hack */ 2325 bioops.io_start = softdep_disk_io_initiation; 2326 bioops.io_complete = softdep_disk_write_complete; 2327 bioops.io_deallocate = softdep_deallocate_dependencies; 2328 bioops.io_countdeps = softdep_count_dependencies; 2329 2330 /* Initialize the callout with an mtx. */ 2331 callout_init_mtx(&softdep_callout, &lk, 0); 2332 } 2333 2334 /* 2335 * Executed after all filesystems have been unmounted during 2336 * filesystem module unload. 2337 */ 2338 void 2339 softdep_uninitialize() 2340 { 2341 2342 callout_drain(&softdep_callout); 2343 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2344 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2345 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2346 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2347 free(indir_hashtbl, M_FREEWORK); 2348 } 2349 2350 /* 2351 * Called at mount time to notify the dependency code that a 2352 * filesystem wishes to use it. 2353 */ 2354 int 2355 softdep_mount(devvp, mp, fs, cred) 2356 struct vnode *devvp; 2357 struct mount *mp; 2358 struct fs *fs; 2359 struct ucred *cred; 2360 { 2361 struct csum_total cstotal; 2362 struct ufsmount *ump; 2363 struct cg *cgp; 2364 struct buf *bp; 2365 int error, cyl; 2366 2367 MNT_ILOCK(mp); 2368 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2369 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2370 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2371 MNTK_SOFTDEP | MNTK_NOASYNC; 2372 } 2373 MNT_IUNLOCK(mp); 2374 ump = VFSTOUFS(mp); 2375 LIST_INIT(&ump->softdep_workitem_pending); 2376 LIST_INIT(&ump->softdep_journal_pending); 2377 TAILQ_INIT(&ump->softdep_unlinked); 2378 LIST_INIT(&ump->softdep_dirtycg); 2379 ump->softdep_worklist_tail = NULL; 2380 ump->softdep_on_worklist = 0; 2381 ump->softdep_deps = 0; 2382 if ((fs->fs_flags & FS_SUJ) && 2383 (error = journal_mount(mp, fs, cred)) != 0) { 2384 printf("Failed to start journal: %d\n", error); 2385 return (error); 2386 } 2387 /* 2388 * When doing soft updates, the counters in the 2389 * superblock may have gotten out of sync. Recomputation 2390 * can take a long time and can be deferred for background 2391 * fsck. However, the old behavior of scanning the cylinder 2392 * groups and recalculating them at mount time is available 2393 * by setting vfs.ffs.compute_summary_at_mount to one. 
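 * The scan below therefore runs only when the tunable is set and the
 * filesystem is marked dirty; a clean filesystem's summary is trusted as is.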
2394 */ 2395 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2396 return (0); 2397 bzero(&cstotal, sizeof cstotal); 2398 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2399 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2400 fs->fs_cgsize, cred, &bp)) != 0) { 2401 brelse(bp); 2402 return (error); 2403 } 2404 cgp = (struct cg *)bp->b_data; 2405 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2406 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2407 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2408 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2409 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2410 brelse(bp); 2411 } 2412 #ifdef DEBUG 2413 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2414 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2415 #endif 2416 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2417 return (0); 2418 } 2419 2420 void 2421 softdep_unmount(mp) 2422 struct mount *mp; 2423 { 2424 2425 MNT_ILOCK(mp); 2426 mp->mnt_flag &= ~MNT_SOFTDEP; 2427 if (MOUNTEDSUJ(mp) == 0) { 2428 MNT_IUNLOCK(mp); 2429 return; 2430 } 2431 mp->mnt_flag &= ~MNT_SUJ; 2432 MNT_IUNLOCK(mp); 2433 journal_unmount(mp); 2434 } 2435 2436 struct jblocks { 2437 struct jseglst jb_segs; /* TAILQ of current segments. */ 2438 struct jseg *jb_writeseg; /* Next write to complete. */ 2439 struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ 2440 struct jextent *jb_extent; /* Extent array. */ 2441 uint64_t jb_nextseq; /* Next sequence number. */ 2442 uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ 2443 uint8_t jb_needseg; /* Need a forced segment. */ 2444 uint8_t jb_suspended; /* Did journal suspend writes? */ 2445 int jb_avail; /* Available extents. */ 2446 int jb_used; /* Last used extent. */ 2447 int jb_head; /* Allocator head. */ 2448 int jb_off; /* Allocator extent offset. */ 2449 int jb_blocks; /* Total disk blocks covered. */ 2450 int jb_free; /* Total disk blocks free. */ 2451 int jb_min; /* Minimum free space. */ 2452 int jb_low; /* Low on space. */ 2453 int jb_age; /* Insertion time of oldest rec. */ 2454 }; 2455 2456 struct jextent { 2457 ufs2_daddr_t je_daddr; /* Disk block address. */ 2458 int je_blocks; /* Disk block count. 
*/ 2459 }; 2460 2461 static struct jblocks * 2462 jblocks_create(void) 2463 { 2464 struct jblocks *jblocks; 2465 2466 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2467 TAILQ_INIT(&jblocks->jb_segs); 2468 jblocks->jb_avail = 10; 2469 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2470 M_JBLOCKS, M_WAITOK | M_ZERO); 2471 2472 return (jblocks); 2473 } 2474 2475 static ufs2_daddr_t 2476 jblocks_alloc(jblocks, bytes, actual) 2477 struct jblocks *jblocks; 2478 int bytes; 2479 int *actual; 2480 { 2481 ufs2_daddr_t daddr; 2482 struct jextent *jext; 2483 int freecnt; 2484 int blocks; 2485 2486 blocks = bytes / DEV_BSIZE; 2487 jext = &jblocks->jb_extent[jblocks->jb_head]; 2488 freecnt = jext->je_blocks - jblocks->jb_off; 2489 if (freecnt == 0) { 2490 jblocks->jb_off = 0; 2491 if (++jblocks->jb_head > jblocks->jb_used) 2492 jblocks->jb_head = 0; 2493 jext = &jblocks->jb_extent[jblocks->jb_head]; 2494 freecnt = jext->je_blocks; 2495 } 2496 if (freecnt > blocks) 2497 freecnt = blocks; 2498 *actual = freecnt * DEV_BSIZE; 2499 daddr = jext->je_daddr + jblocks->jb_off; 2500 jblocks->jb_off += freecnt; 2501 jblocks->jb_free -= freecnt; 2502 2503 return (daddr); 2504 } 2505 2506 static void 2507 jblocks_free(jblocks, mp, bytes) 2508 struct jblocks *jblocks; 2509 struct mount *mp; 2510 int bytes; 2511 { 2512 2513 jblocks->jb_free += bytes / DEV_BSIZE; 2514 if (jblocks->jb_suspended) 2515 worklist_speedup(); 2516 wakeup(jblocks); 2517 } 2518 2519 static void 2520 jblocks_destroy(jblocks) 2521 struct jblocks *jblocks; 2522 { 2523 2524 if (jblocks->jb_extent) 2525 free(jblocks->jb_extent, M_JBLOCKS); 2526 free(jblocks, M_JBLOCKS); 2527 } 2528 2529 static void 2530 jblocks_add(jblocks, daddr, blocks) 2531 struct jblocks *jblocks; 2532 ufs2_daddr_t daddr; 2533 int blocks; 2534 { 2535 struct jextent *jext; 2536 2537 jblocks->jb_blocks += blocks; 2538 jblocks->jb_free += blocks; 2539 jext = &jblocks->jb_extent[jblocks->jb_used]; 2540 /* Adding the first block. */ 2541 if (jext->je_daddr == 0) { 2542 jext->je_daddr = daddr; 2543 jext->je_blocks = blocks; 2544 return; 2545 } 2546 /* Extending the last extent. */ 2547 if (jext->je_daddr + jext->je_blocks == daddr) { 2548 jext->je_blocks += blocks; 2549 return; 2550 } 2551 /* Adding a new extent. 
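 * The extent array is grown by doubling when it fills, so the journal may be
 * described by any number of discontiguous runs.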
*/ 2552 if (++jblocks->jb_used == jblocks->jb_avail) { 2553 jblocks->jb_avail *= 2; 2554 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2555 M_JBLOCKS, M_WAITOK | M_ZERO); 2556 memcpy(jext, jblocks->jb_extent, 2557 sizeof(struct jextent) * jblocks->jb_used); 2558 free(jblocks->jb_extent, M_JBLOCKS); 2559 jblocks->jb_extent = jext; 2560 } 2561 jext = &jblocks->jb_extent[jblocks->jb_used]; 2562 jext->je_daddr = daddr; 2563 jext->je_blocks = blocks; 2564 return; 2565 } 2566 2567 int 2568 softdep_journal_lookup(mp, vpp) 2569 struct mount *mp; 2570 struct vnode **vpp; 2571 { 2572 struct componentname cnp; 2573 struct vnode *dvp; 2574 ino_t sujournal; 2575 int error; 2576 2577 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2578 if (error) 2579 return (error); 2580 bzero(&cnp, sizeof(cnp)); 2581 cnp.cn_nameiop = LOOKUP; 2582 cnp.cn_flags = ISLASTCN; 2583 cnp.cn_thread = curthread; 2584 cnp.cn_cred = curthread->td_ucred; 2585 cnp.cn_pnbuf = SUJ_FILE; 2586 cnp.cn_nameptr = SUJ_FILE; 2587 cnp.cn_namelen = strlen(SUJ_FILE); 2588 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2589 vput(dvp); 2590 if (error != 0) 2591 return (error); 2592 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2593 return (error); 2594 } 2595 2596 /* 2597 * Open and verify the journal file. 2598 */ 2599 static int 2600 journal_mount(mp, fs, cred) 2601 struct mount *mp; 2602 struct fs *fs; 2603 struct ucred *cred; 2604 { 2605 struct jblocks *jblocks; 2606 struct vnode *vp; 2607 struct inode *ip; 2608 ufs2_daddr_t blkno; 2609 int bcount; 2610 int error; 2611 int i; 2612 2613 error = softdep_journal_lookup(mp, &vp); 2614 if (error != 0) { 2615 printf("Failed to find journal. Use tunefs to create one\n"); 2616 return (error); 2617 } 2618 ip = VTOI(vp); 2619 if (ip->i_size < SUJ_MIN) { 2620 error = ENOSPC; 2621 goto out; 2622 } 2623 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2624 jblocks = jblocks_create(); 2625 for (i = 0; i < bcount; i++) { 2626 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2627 if (error) 2628 break; 2629 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2630 } 2631 if (error) { 2632 jblocks_destroy(jblocks); 2633 goto out; 2634 } 2635 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2636 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2637 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2638 out: 2639 if (error == 0) { 2640 MNT_ILOCK(mp); 2641 mp->mnt_flag |= MNT_SUJ; 2642 mp->mnt_flag &= ~MNT_SOFTDEP; 2643 MNT_IUNLOCK(mp); 2644 /* 2645 * Only validate the journal contents if the 2646 * filesystem is clean, otherwise we write the logs 2647 * but they'll never be used. If the filesystem was 2648 * still dirty when we mounted it the journal is 2649 * invalid and a new journal can only be valid if it 2650 * starts from a clean mount. 2651 */ 2652 if (fs->fs_clean) { 2653 DIP_SET(ip, i_modrev, fs->fs_mtime); 2654 ip->i_flags |= IN_MODIFIED; 2655 ffs_update(vp, 1); 2656 } 2657 } 2658 vput(vp); 2659 return (error); 2660 } 2661 2662 static void 2663 journal_unmount(mp) 2664 struct mount *mp; 2665 { 2666 struct ufsmount *ump; 2667 2668 ump = VFSTOUFS(mp); 2669 if (ump->softdep_jblocks) 2670 jblocks_destroy(ump->softdep_jblocks); 2671 ump->softdep_jblocks = NULL; 2672 } 2673 2674 /* 2675 * Called when a journal record is ready to be written. Space is allocated 2676 * and the journal entry is created when the journal is flushed to stable 2677 * store. 
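 * The pending list uses plain LIST linkage plus an explicit tail pointer
 * (softdep_journal_tail) so records can be appended in order; see
 * remove_from_journal() for the layout constraint this imposes.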
2678 */ 2679 static void 2680 add_to_journal(wk) 2681 struct worklist *wk; 2682 { 2683 struct ufsmount *ump; 2684 2685 mtx_assert(&lk, MA_OWNED); 2686 ump = VFSTOUFS(wk->wk_mp); 2687 if (wk->wk_state & ONWORKLIST) 2688 panic("add_to_journal: %s(0x%X) already on list", 2689 TYPENAME(wk->wk_type), wk->wk_state); 2690 wk->wk_state |= ONWORKLIST | DEPCOMPLETE; 2691 if (LIST_EMPTY(&ump->softdep_journal_pending)) { 2692 ump->softdep_jblocks->jb_age = ticks; 2693 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); 2694 } else 2695 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); 2696 ump->softdep_journal_tail = wk; 2697 ump->softdep_on_journal += 1; 2698 } 2699 2700 /* 2701 * Remove an arbitrary item from the journal worklist while maintaining 2702 * the tail pointer. This happens when a new operation obviates the need 2703 * to journal an old operation. 2704 */ 2705 static void 2706 remove_from_journal(wk) 2707 struct worklist *wk; 2708 { 2709 struct ufsmount *ump; 2710 2711 mtx_assert(&lk, MA_OWNED); 2712 ump = VFSTOUFS(wk->wk_mp); 2713 #ifdef SUJ_DEBUG 2714 { 2715 struct worklist *wkn; 2716 2717 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) 2718 if (wkn == wk) 2719 break; 2720 if (wkn == NULL) 2721 panic("remove_from_journal: %p is not in journal", wk); 2722 } 2723 #endif 2724 /* 2725 * We emulate a TAILQ to save space in most structures which do not 2726 * require TAILQ semantics. Here we must update the tail position 2727 * when removing the tail which is not the final entry. This works 2728 * only if the worklist linkage is at the beginning of the structure. 2729 */ 2730 if (ump->softdep_journal_tail == wk) 2731 ump->softdep_journal_tail = 2732 (struct worklist *)wk->wk_list.le_prev; 2733 2734 WORKLIST_REMOVE(wk); 2735 ump->softdep_on_journal -= 1; 2736 } 2737 2738 /* 2739 * Check for journal space as well as dependency limits so the prelink 2740 * code can throttle both journaled and non-journaled filesystems. 2741 * Threshold is 0 for low and 1 for min. 2742 */ 2743 static int 2744 journal_space(ump, thresh) 2745 struct ufsmount *ump; 2746 int thresh; 2747 { 2748 struct jblocks *jblocks; 2749 int avail; 2750 2751 jblocks = ump->softdep_jblocks; 2752 if (jblocks == NULL) 2753 return (1); 2754 /* 2755 * We use a tighter restriction here to prevent request_cleanup() 2756 * running in threads from running into locks we currently hold.
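 * Regardless of journal space, the check below reports the journal as full
 * once inodedep use reaches 90% of max_softdeps.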
2757 */ 2758 if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9) 2759 return (0); 2760 if (thresh) 2761 thresh = jblocks->jb_min; 2762 else 2763 thresh = jblocks->jb_low; 2764 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2765 avail = jblocks->jb_free - avail; 2766 2767 return (avail > thresh); 2768 } 2769 2770 static void 2771 journal_suspend(ump) 2772 struct ufsmount *ump; 2773 { 2774 struct jblocks *jblocks; 2775 struct mount *mp; 2776 2777 mp = UFSTOVFS(ump); 2778 jblocks = ump->softdep_jblocks; 2779 MNT_ILOCK(mp); 2780 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2781 stat_journal_min++; 2782 mp->mnt_kern_flag |= MNTK_SUSPEND; 2783 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2784 } 2785 jblocks->jb_suspended = 1; 2786 MNT_IUNLOCK(mp); 2787 } 2788 2789 static int 2790 journal_unsuspend(struct ufsmount *ump) 2791 { 2792 struct jblocks *jblocks; 2793 struct mount *mp; 2794 2795 mp = UFSTOVFS(ump); 2796 jblocks = ump->softdep_jblocks; 2797 2798 if (jblocks != NULL && jblocks->jb_suspended && 2799 journal_space(ump, jblocks->jb_min)) { 2800 jblocks->jb_suspended = 0; 2801 FREE_LOCK(&lk); 2802 mp->mnt_susp_owner = curthread; 2803 vfs_write_resume(mp); 2804 ACQUIRE_LOCK(&lk); 2805 return (1); 2806 } 2807 return (0); 2808 } 2809 2810 /* 2811 * Called before any allocation function to be certain that there is 2812 * sufficient space in the journal prior to creating any new records. 2813 * Since in the case of block allocation we may have multiple locked 2814 * buffers at the time of the actual allocation we can not block 2815 * when the journal records are created. Doing so would create a deadlock 2816 * if any of these buffers needed to be flushed to reclaim space. Instead 2817 * we require a sufficiently large amount of available space such that 2818 * each thread in the system could have passed this allocation check and 2819 * still have sufficient free space. With 20% of a minimum journal size 2820 * of 1MB we have 6553 records available. 2821 */ 2822 int 2823 softdep_prealloc(vp, waitok) 2824 struct vnode *vp; 2825 int waitok; 2826 { 2827 struct ufsmount *ump; 2828 2829 /* 2830 * Nothing to do if we are not running journaled soft updates. 2831 * If we currently hold the snapshot lock, we must avoid handling 2832 * other resources that could cause deadlock. 2833 */ 2834 if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp))) 2835 return (0); 2836 ump = VFSTOUFS(vp->v_mount); 2837 ACQUIRE_LOCK(&lk); 2838 if (journal_space(ump, 0)) { 2839 FREE_LOCK(&lk); 2840 return (0); 2841 } 2842 stat_journal_low++; 2843 FREE_LOCK(&lk); 2844 if (waitok == MNT_NOWAIT) 2845 return (ENOSPC); 2846 /* 2847 * Attempt to sync this vnode once to flush any journal 2848 * work attached to it. 2849 */ 2850 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2851 ffs_syncvnode(vp, waitok); 2852 ACQUIRE_LOCK(&lk); 2853 process_removes(vp); 2854 process_truncates(vp); 2855 if (journal_space(ump, 0) == 0) { 2856 softdep_speedup(); 2857 if (journal_space(ump, 1) == 0) 2858 journal_suspend(ump); 2859 } 2860 FREE_LOCK(&lk); 2861 2862 return (0); 2863 } 2864 2865 /* 2866 * Before adjusting a link count on a vnode verify that we have sufficient 2867 * journal space. If not, process operations that depend on the currently 2868 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2869 * and softdep flush threads can not acquire these locks to reclaim space. 
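 * lk is held on entry (asserted below) and the directory vnode and, when
 * supplied, the file vnode are locked by the caller; vp is processed first
 * because removing it may queue dotdot removals against dvp.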
2870 */ 2871 static void 2872 softdep_prelink(dvp, vp) 2873 struct vnode *dvp; 2874 struct vnode *vp; 2875 { 2876 struct ufsmount *ump; 2877 2878 ump = VFSTOUFS(dvp->v_mount); 2879 mtx_assert(&lk, MA_OWNED); 2880 /* 2881 * Nothing to do if we have sufficient journal space. 2882 * If we currently hold the snapshot lock, we must avoid 2883 * handling other resources that could cause deadlock. 2884 */ 2885 if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) 2886 return; 2887 stat_journal_low++; 2888 FREE_LOCK(&lk); 2889 if (vp) 2890 ffs_syncvnode(vp, MNT_NOWAIT); 2891 ffs_syncvnode(dvp, MNT_WAIT); 2892 ACQUIRE_LOCK(&lk); 2893 /* Process vp before dvp as it may create .. removes. */ 2894 if (vp) { 2895 process_removes(vp); 2896 process_truncates(vp); 2897 } 2898 process_removes(dvp); 2899 process_truncates(dvp); 2900 softdep_speedup(); 2901 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 2902 if (journal_space(ump, 0) == 0) { 2903 softdep_speedup(); 2904 if (journal_space(ump, 1) == 0) 2905 journal_suspend(ump); 2906 } 2907 } 2908 2909 static void 2910 jseg_write(ump, jseg, data) 2911 struct ufsmount *ump; 2912 struct jseg *jseg; 2913 uint8_t *data; 2914 { 2915 struct jsegrec *rec; 2916 2917 rec = (struct jsegrec *)data; 2918 rec->jsr_seq = jseg->js_seq; 2919 rec->jsr_oldest = jseg->js_oldseq; 2920 rec->jsr_cnt = jseg->js_cnt; 2921 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 2922 rec->jsr_crc = 0; 2923 rec->jsr_time = ump->um_fs->fs_mtime; 2924 } 2925 2926 static inline void 2927 inoref_write(inoref, jseg, rec) 2928 struct inoref *inoref; 2929 struct jseg *jseg; 2930 struct jrefrec *rec; 2931 { 2932 2933 inoref->if_jsegdep->jd_seg = jseg; 2934 rec->jr_ino = inoref->if_ino; 2935 rec->jr_parent = inoref->if_parent; 2936 rec->jr_nlink = inoref->if_nlink; 2937 rec->jr_mode = inoref->if_mode; 2938 rec->jr_diroff = inoref->if_diroff; 2939 } 2940 2941 static void 2942 jaddref_write(jaddref, jseg, data) 2943 struct jaddref *jaddref; 2944 struct jseg *jseg; 2945 uint8_t *data; 2946 { 2947 struct jrefrec *rec; 2948 2949 rec = (struct jrefrec *)data; 2950 rec->jr_op = JOP_ADDREF; 2951 inoref_write(&jaddref->ja_ref, jseg, rec); 2952 } 2953 2954 static void 2955 jremref_write(jremref, jseg, data) 2956 struct jremref *jremref; 2957 struct jseg *jseg; 2958 uint8_t *data; 2959 { 2960 struct jrefrec *rec; 2961 2962 rec = (struct jrefrec *)data; 2963 rec->jr_op = JOP_REMREF; 2964 inoref_write(&jremref->jr_ref, jseg, rec); 2965 } 2966 2967 static void 2968 jmvref_write(jmvref, jseg, data) 2969 struct jmvref *jmvref; 2970 struct jseg *jseg; 2971 uint8_t *data; 2972 { 2973 struct jmvrec *rec; 2974 2975 rec = (struct jmvrec *)data; 2976 rec->jm_op = JOP_MVREF; 2977 rec->jm_ino = jmvref->jm_ino; 2978 rec->jm_parent = jmvref->jm_parent; 2979 rec->jm_oldoff = jmvref->jm_oldoff; 2980 rec->jm_newoff = jmvref->jm_newoff; 2981 } 2982 2983 static void 2984 jnewblk_write(jnewblk, jseg, data) 2985 struct jnewblk *jnewblk; 2986 struct jseg *jseg; 2987 uint8_t *data; 2988 { 2989 struct jblkrec *rec; 2990 2991 jnewblk->jn_jsegdep->jd_seg = jseg; 2992 rec = (struct jblkrec *)data; 2993 rec->jb_op = JOP_NEWBLK; 2994 rec->jb_ino = jnewblk->jn_ino; 2995 rec->jb_blkno = jnewblk->jn_blkno; 2996 rec->jb_lbn = jnewblk->jn_lbn; 2997 rec->jb_frags = jnewblk->jn_frags; 2998 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2999 } 3000 3001 static void 3002 jfreeblk_write(jfreeblk, jseg, data) 3003 struct jfreeblk *jfreeblk; 3004 struct jseg *jseg; 3005 uint8_t *data; 3006 { 3007 struct jblkrec *rec; 3008 3009 
jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; 3010 rec = (struct jblkrec *)data; 3011 rec->jb_op = JOP_FREEBLK; 3012 rec->jb_ino = jfreeblk->jf_ino; 3013 rec->jb_blkno = jfreeblk->jf_blkno; 3014 rec->jb_lbn = jfreeblk->jf_lbn; 3015 rec->jb_frags = jfreeblk->jf_frags; 3016 rec->jb_oldfrags = 0; 3017 } 3018 3019 static void 3020 jfreefrag_write(jfreefrag, jseg, data) 3021 struct jfreefrag *jfreefrag; 3022 struct jseg *jseg; 3023 uint8_t *data; 3024 { 3025 struct jblkrec *rec; 3026 3027 jfreefrag->fr_jsegdep->jd_seg = jseg; 3028 rec = (struct jblkrec *)data; 3029 rec->jb_op = JOP_FREEBLK; 3030 rec->jb_ino = jfreefrag->fr_ino; 3031 rec->jb_blkno = jfreefrag->fr_blkno; 3032 rec->jb_lbn = jfreefrag->fr_lbn; 3033 rec->jb_frags = jfreefrag->fr_frags; 3034 rec->jb_oldfrags = 0; 3035 } 3036 3037 static void 3038 jtrunc_write(jtrunc, jseg, data) 3039 struct jtrunc *jtrunc; 3040 struct jseg *jseg; 3041 uint8_t *data; 3042 { 3043 struct jtrncrec *rec; 3044 3045 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; 3046 rec = (struct jtrncrec *)data; 3047 rec->jt_op = JOP_TRUNC; 3048 rec->jt_ino = jtrunc->jt_ino; 3049 rec->jt_size = jtrunc->jt_size; 3050 rec->jt_extsize = jtrunc->jt_extsize; 3051 } 3052 3053 static void 3054 jfsync_write(jfsync, jseg, data) 3055 struct jfsync *jfsync; 3056 struct jseg *jseg; 3057 uint8_t *data; 3058 { 3059 struct jtrncrec *rec; 3060 3061 rec = (struct jtrncrec *)data; 3062 rec->jt_op = JOP_SYNC; 3063 rec->jt_ino = jfsync->jfs_ino; 3064 rec->jt_size = jfsync->jfs_size; 3065 rec->jt_extsize = jfsync->jfs_extsize; 3066 } 3067 3068 static void 3069 softdep_flushjournal(mp) 3070 struct mount *mp; 3071 { 3072 struct jblocks *jblocks; 3073 struct ufsmount *ump; 3074 3075 if (MOUNTEDSUJ(mp) == 0) 3076 return; 3077 ump = VFSTOUFS(mp); 3078 jblocks = ump->softdep_jblocks; 3079 ACQUIRE_LOCK(&lk); 3080 while (ump->softdep_on_journal) { 3081 jblocks->jb_needseg = 1; 3082 softdep_process_journal(mp, NULL, MNT_WAIT); 3083 } 3084 FREE_LOCK(&lk); 3085 } 3086 3087 /* 3088 * Flush some journal records to disk. 3089 */ 3090 static void 3091 softdep_process_journal(mp, needwk, flags) 3092 struct mount *mp; 3093 struct worklist *needwk; 3094 int flags; 3095 { 3096 struct jblocks *jblocks; 3097 struct ufsmount *ump; 3098 struct worklist *wk; 3099 struct jseg *jseg; 3100 struct buf *bp; 3101 uint8_t *data; 3102 struct fs *fs; 3103 int segwritten; 3104 int jrecmin; /* Minimum records per block. */ 3105 int jrecmax; /* Maximum records per block. */ 3106 int size; 3107 int cnt; 3108 int off; 3109 int devbsize; 3110 3111 if (MOUNTEDSUJ(mp) == 0) 3112 return; 3113 ump = VFSTOUFS(mp); 3114 fs = ump->um_fs; 3115 jblocks = ump->softdep_jblocks; 3116 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 3117 /* 3118 * We write anywhere between a disk block and fs block. The upper 3119 * bound is picked to prevent buffer cache fragmentation and limit 3120 * processing time per I/O. 3121 */ 3122 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3123 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3124 segwritten = 0; 3125 for (;;) { 3126 cnt = ump->softdep_on_journal; 3127 /* 3128 * Criteria for writing a segment: 3129 * 1) We have a full block. 3130 * 2) We're called from jwait() and haven't found the 3131 * journal item yet. 3132 * 3) Always write if needseg is set. 3133 * 4) If we are called from process_worklist and have 3134 * not yet written anything we write a partial block 3135 * to enforce a 1 second maximum latency on journal 3136 * entries. 
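 * The test below is the negation of these criteria: stop once less than a
 * full block of records is pending, no specific record is being waited on,
 * no forced segment was requested, and either a segment was already written
 * this call or nothing is pending at all.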
3137 */ 3138 if (cnt < (jrecmax - 1) && needwk == NULL && 3139 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3140 break; 3141 cnt++; 3142 /* 3143 * Verify some free journal space. softdep_prealloc() should 3144 * guarantee that we don't run out so this is indicative of 3145 * a problem with the flow control. Try to recover 3146 * gracefully in any event. 3147 */ 3148 while (jblocks->jb_free == 0) { 3149 if (flags != MNT_WAIT) 3150 break; 3151 printf("softdep: Out of journal space!\n"); 3152 softdep_speedup(); 3153 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 3154 } 3155 FREE_LOCK(&lk); 3156 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3157 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3158 LIST_INIT(&jseg->js_entries); 3159 LIST_INIT(&jseg->js_indirs); 3160 jseg->js_state = ATTACHED; 3161 jseg->js_jblocks = jblocks; 3162 bp = geteblk(fs->fs_bsize, 0); 3163 ACQUIRE_LOCK(&lk); 3164 /* 3165 * If there was a race while we were allocating the block 3166 * and jseg the entry we care about was likely written. 3167 * We bail out in both the WAIT and NOWAIT case and assume 3168 * the caller will loop if the entry it cares about is 3169 * not written. 3170 */ 3171 cnt = ump->softdep_on_journal; 3172 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 3173 bp->b_flags |= B_INVAL | B_NOCACHE; 3174 WORKITEM_FREE(jseg, D_JSEG); 3175 FREE_LOCK(&lk); 3176 brelse(bp); 3177 ACQUIRE_LOCK(&lk); 3178 break; 3179 } 3180 /* 3181 * Calculate the disk block size required for the available 3182 * records rounded to the min size. 3183 */ 3184 if (cnt == 0) 3185 size = devbsize; 3186 else if (cnt < jrecmax) 3187 size = howmany(cnt, jrecmin) * devbsize; 3188 else 3189 size = fs->fs_bsize; 3190 /* 3191 * Allocate a disk block for this journal data and account 3192 * for truncation of the requested size if enough contiguous 3193 * space was not available. 3194 */ 3195 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 3196 bp->b_lblkno = bp->b_blkno; 3197 bp->b_offset = bp->b_blkno * DEV_BSIZE; 3198 bp->b_bcount = size; 3199 bp->b_bufobj = &ump->um_devvp->v_bufobj; 3200 bp->b_flags &= ~B_INVAL; 3201 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 3202 /* 3203 * Initialize our jseg with cnt records. Assign the next 3204 * sequence number to it and link it in-order. 3205 */ 3206 cnt = MIN(cnt, (size / devbsize) * jrecmin); 3207 jseg->js_buf = bp; 3208 jseg->js_cnt = cnt; 3209 jseg->js_refs = cnt + 1; /* Self ref. */ 3210 jseg->js_size = size; 3211 jseg->js_seq = jblocks->jb_nextseq++; 3212 if (jblocks->jb_oldestseg == NULL) 3213 jblocks->jb_oldestseg = jseg; 3214 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 3215 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 3216 if (jblocks->jb_writeseg == NULL) 3217 jblocks->jb_writeseg = jseg; 3218 /* 3219 * Start filling in records from the pending list. 3220 */ 3221 data = bp->b_data; 3222 off = 0; 3223 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 3224 != NULL) { 3225 if (cnt == 0) 3226 break; 3227 /* Place a segment header on every device block. 
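 * The header occupies one JREC_SIZE slot in each device block, which is why
 * jrecmin above was computed with one record held back.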
*/ 3228 if ((off % devbsize) == 0) { 3229 jseg_write(ump, jseg, data); 3230 off += JREC_SIZE; 3231 data = bp->b_data + off; 3232 } 3233 if (wk == needwk) 3234 needwk = NULL; 3235 remove_from_journal(wk); 3236 wk->wk_state |= INPROGRESS; 3237 WORKLIST_INSERT(&jseg->js_entries, wk); 3238 switch (wk->wk_type) { 3239 case D_JADDREF: 3240 jaddref_write(WK_JADDREF(wk), jseg, data); 3241 break; 3242 case D_JREMREF: 3243 jremref_write(WK_JREMREF(wk), jseg, data); 3244 break; 3245 case D_JMVREF: 3246 jmvref_write(WK_JMVREF(wk), jseg, data); 3247 break; 3248 case D_JNEWBLK: 3249 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3250 break; 3251 case D_JFREEBLK: 3252 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3253 break; 3254 case D_JFREEFRAG: 3255 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3256 break; 3257 case D_JTRUNC: 3258 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3259 break; 3260 case D_JFSYNC: 3261 jfsync_write(WK_JFSYNC(wk), jseg, data); 3262 break; 3263 default: 3264 panic("process_journal: Unknown type %s", 3265 TYPENAME(wk->wk_type)); 3266 /* NOTREACHED */ 3267 } 3268 off += JREC_SIZE; 3269 data = bp->b_data + off; 3270 cnt--; 3271 } 3272 /* 3273 * Write this one buffer and continue. 3274 */ 3275 segwritten = 1; 3276 jblocks->jb_needseg = 0; 3277 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3278 FREE_LOCK(&lk); 3279 BO_LOCK(bp->b_bufobj); 3280 bgetvp(ump->um_devvp, bp); 3281 BO_UNLOCK(bp->b_bufobj); 3282 /* 3283 * We only do the blocking wait once we find the journal 3284 * entry we're looking for. 3285 */ 3286 if (needwk == NULL && flags == MNT_WAIT) 3287 bwrite(bp); 3288 else 3289 bawrite(bp); 3290 ACQUIRE_LOCK(&lk); 3291 } 3292 /* 3293 * If we've suspended the filesystem because we ran out of journal 3294 * space either try to sync it here to make some progress or 3295 * unsuspend it if we already have. 3296 */ 3297 if (flags == 0 && jblocks->jb_suspended) { 3298 if (journal_unsuspend(ump)) 3299 return; 3300 FREE_LOCK(&lk); 3301 VFS_SYNC(mp, MNT_NOWAIT); 3302 ffs_sbupdate(ump, MNT_WAIT, 0); 3303 ACQUIRE_LOCK(&lk); 3304 } 3305 } 3306 3307 /* 3308 * Complete a jseg, allowing all dependencies awaiting journal writes 3309 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3310 * structures so that the journal segment can be freed to reclaim space. 3311 */ 3312 static void 3313 complete_jseg(jseg) 3314 struct jseg *jseg; 3315 { 3316 struct worklist *wk; 3317 struct jmvref *jmvref; 3318 int waiting; 3319 #ifdef INVARIANTS 3320 int i = 0; 3321 #endif 3322 3323 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 3324 WORKLIST_REMOVE(wk); 3325 waiting = wk->wk_state & IOWAITING; 3326 wk->wk_state &= ~(INPROGRESS | IOWAITING); 3327 wk->wk_state |= COMPLETE; 3328 KASSERT(i++ < jseg->js_cnt, 3329 ("handle_written_jseg: overflow %d >= %d", 3330 i - 1, jseg->js_cnt)); 3331 switch (wk->wk_type) { 3332 case D_JADDREF: 3333 handle_written_jaddref(WK_JADDREF(wk)); 3334 break; 3335 case D_JREMREF: 3336 handle_written_jremref(WK_JREMREF(wk)); 3337 break; 3338 case D_JMVREF: 3339 rele_jseg(jseg); /* No jsegdep. 
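 * A jmvref carries no jsegdep, so the per-record segment reference that a
 * jsegdep would otherwise release is dropped directly here.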
*/ 3340 jmvref = WK_JMVREF(wk); 3341 LIST_REMOVE(jmvref, jm_deps); 3342 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 3343 free_pagedep(jmvref->jm_pagedep); 3344 WORKITEM_FREE(jmvref, D_JMVREF); 3345 break; 3346 case D_JNEWBLK: 3347 handle_written_jnewblk(WK_JNEWBLK(wk)); 3348 break; 3349 case D_JFREEBLK: 3350 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 3351 break; 3352 case D_JTRUNC: 3353 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 3354 break; 3355 case D_JFSYNC: 3356 rele_jseg(jseg); /* No jsegdep. */ 3357 WORKITEM_FREE(wk, D_JFSYNC); 3358 break; 3359 case D_JFREEFRAG: 3360 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3361 break; 3362 default: 3363 panic("handle_written_jseg: Unknown type %s", 3364 TYPENAME(wk->wk_type)); 3365 /* NOTREACHED */ 3366 } 3367 if (waiting) 3368 wakeup(wk); 3369 } 3370 /* Release the self reference so the structure may be freed. */ 3371 rele_jseg(jseg); 3372 } 3373 3374 /* 3375 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3376 * completions in order only. 3377 */ 3378 static void 3379 handle_written_jseg(jseg, bp) 3380 struct jseg *jseg; 3381 struct buf *bp; 3382 { 3383 struct jblocks *jblocks; 3384 struct jseg *jsegn; 3385 3386 if (jseg->js_refs == 0) 3387 panic("handle_written_jseg: No self-reference on %p", jseg); 3388 jseg->js_state |= DEPCOMPLETE; 3389 /* 3390 * We'll never need this buffer again, set flags so it will be 3391 * discarded. 3392 */ 3393 bp->b_flags |= B_INVAL | B_NOCACHE; 3394 jblocks = jseg->js_jblocks; 3395 /* 3396 * Don't allow out of order completions. If this isn't the first 3397 * block wait for it to write before we're done. 3398 */ 3399 if (jseg != jblocks->jb_writeseg) 3400 return; 3401 /* Iterate through available jsegs processing their entries. */ 3402 do { 3403 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3404 jsegn = TAILQ_NEXT(jseg, js_next); 3405 complete_jseg(jseg); 3406 jseg = jsegn; 3407 } while (jseg && jseg->js_state & DEPCOMPLETE); 3408 jblocks->jb_writeseg = jseg; 3409 /* 3410 * Attempt to free jsegs now that oldestwrseq may have advanced. 3411 */ 3412 free_jsegs(jblocks); 3413 } 3414 3415 static inline struct jsegdep * 3416 inoref_jseg(inoref) 3417 struct inoref *inoref; 3418 { 3419 struct jsegdep *jsegdep; 3420 3421 jsegdep = inoref->if_jsegdep; 3422 inoref->if_jsegdep = NULL; 3423 3424 return (jsegdep); 3425 } 3426 3427 /* 3428 * Called once a jremref has made it to stable store. The jremref is marked 3429 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3430 * for the jremref to complete will be awoken by free_jremref. 3431 */ 3432 static void 3433 handle_written_jremref(jremref) 3434 struct jremref *jremref; 3435 { 3436 struct inodedep *inodedep; 3437 struct jsegdep *jsegdep; 3438 struct dirrem *dirrem; 3439 3440 /* Grab the jsegdep. */ 3441 jsegdep = inoref_jseg(&jremref->jr_ref); 3442 /* 3443 * Remove us from the inoref list. 3444 */ 3445 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3446 0, &inodedep) == 0) 3447 panic("handle_written_jremref: Lost inodedep"); 3448 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3449 /* 3450 * Complete the dirrem. 
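 * The jsegdep inherits MKDIR_PARENT from the jremref so handle_workitem_remove()
 * can assign it properly for a dotdot removal; the dirrem itself is queued
 * once no jremrefs remain and it is otherwise COMPLETE.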
3451 */ 3452 dirrem = jremref->jr_dirrem; 3453 jremref->jr_dirrem = NULL; 3454 LIST_REMOVE(jremref, jr_deps); 3455 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3456 jwork_insert(&dirrem->dm_jwork, jsegdep); 3457 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3458 (dirrem->dm_state & COMPLETE) != 0) 3459 add_to_worklist(&dirrem->dm_list, 0); 3460 free_jremref(jremref); 3461 } 3462 3463 /* 3464 * Called once a jaddref has made it to stable store. The dependency is 3465 * marked complete and any dependent structures are added to the inode 3466 * bufwait list to be completed as soon as it is written. If a bitmap write 3467 * depends on this entry we move the inode into the inodedephd of the 3468 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3469 */ 3470 static void 3471 handle_written_jaddref(jaddref) 3472 struct jaddref *jaddref; 3473 { 3474 struct jsegdep *jsegdep; 3475 struct inodedep *inodedep; 3476 struct diradd *diradd; 3477 struct mkdir *mkdir; 3478 3479 /* Grab the jsegdep. */ 3480 jsegdep = inoref_jseg(&jaddref->ja_ref); 3481 mkdir = NULL; 3482 diradd = NULL; 3483 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3484 0, &inodedep) == 0) 3485 panic("handle_written_jaddref: Lost inodedep."); 3486 if (jaddref->ja_diradd == NULL) 3487 panic("handle_written_jaddref: No dependency"); 3488 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3489 diradd = jaddref->ja_diradd; 3490 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3491 } else if (jaddref->ja_state & MKDIR_PARENT) { 3492 mkdir = jaddref->ja_mkdir; 3493 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3494 } else if (jaddref->ja_state & MKDIR_BODY) 3495 mkdir = jaddref->ja_mkdir; 3496 else 3497 panic("handle_written_jaddref: Unknown dependency %p", 3498 jaddref->ja_diradd); 3499 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3500 /* 3501 * Remove us from the inode list. 3502 */ 3503 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3504 /* 3505 * The mkdir may be waiting on the jaddref to clear before freeing. 3506 */ 3507 if (mkdir) { 3508 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3509 ("handle_written_jaddref: Incorrect type for mkdir %s", 3510 TYPENAME(mkdir->md_list.wk_type))); 3511 mkdir->md_jaddref = NULL; 3512 diradd = mkdir->md_diradd; 3513 mkdir->md_state |= DEPCOMPLETE; 3514 complete_mkdir(mkdir); 3515 } 3516 jwork_insert(&diradd->da_jwork, jsegdep); 3517 if (jaddref->ja_state & NEWBLOCK) { 3518 inodedep->id_state |= ONDEPLIST; 3519 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3520 inodedep, id_deps); 3521 } 3522 free_jaddref(jaddref); 3523 } 3524 3525 /* 3526 * Called once a jnewblk journal is written. The allocdirect or allocindir 3527 * is placed in the bmsafemap to await notification of a written bitmap. If 3528 * the operation was canceled we add the segdep to the appropriate 3529 * dependency to free the journal space once the canceling operation 3530 * completes. 3531 */ 3532 static void 3533 handle_written_jnewblk(jnewblk) 3534 struct jnewblk *jnewblk; 3535 { 3536 struct bmsafemap *bmsafemap; 3537 struct freefrag *freefrag; 3538 struct freework *freework; 3539 struct jsegdep *jsegdep; 3540 struct newblk *newblk; 3541 3542 /* Grab the jsegdep. 
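 * The jsegdep is handed to whichever structure now owns the block so the
 * journal space is not reclaimed until that operation completes as well.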
*/ 3543 jsegdep = jnewblk->jn_jsegdep; 3544 jnewblk->jn_jsegdep = NULL; 3545 if (jnewblk->jn_dep == NULL) 3546 panic("handle_written_jnewblk: No dependency for the segdep."); 3547 switch (jnewblk->jn_dep->wk_type) { 3548 case D_NEWBLK: 3549 case D_ALLOCDIRECT: 3550 case D_ALLOCINDIR: 3551 /* 3552 * Add the written block to the bmsafemap so it can 3553 * be notified when the bitmap is on disk. 3554 */ 3555 newblk = WK_NEWBLK(jnewblk->jn_dep); 3556 newblk->nb_jnewblk = NULL; 3557 if ((newblk->nb_state & GOINGAWAY) == 0) { 3558 bmsafemap = newblk->nb_bmsafemap; 3559 newblk->nb_state |= ONDEPLIST; 3560 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, 3561 nb_deps); 3562 } 3563 jwork_insert(&newblk->nb_jwork, jsegdep); 3564 break; 3565 case D_FREEFRAG: 3566 /* 3567 * A newblock being removed by a freefrag when replaced by 3568 * frag extension. 3569 */ 3570 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 3571 freefrag->ff_jdep = NULL; 3572 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3573 break; 3574 case D_FREEWORK: 3575 /* 3576 * A direct block was removed by truncate. 3577 */ 3578 freework = WK_FREEWORK(jnewblk->jn_dep); 3579 freework->fw_jnewblk = NULL; 3580 WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork, 3581 &jsegdep->jd_list); 3582 break; 3583 default: 3584 panic("handle_written_jnewblk: Unknown type %d.", 3585 jnewblk->jn_dep->wk_type); 3586 } 3587 jnewblk->jn_dep = NULL; 3588 free_jnewblk(jnewblk); 3589 } 3590 3591 /* 3592 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3593 * an in-flight allocation that has not yet been committed. Divorce us 3594 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3595 * to the worklist. 3596 */ 3597 static void 3598 cancel_jfreefrag(jfreefrag) 3599 struct jfreefrag *jfreefrag; 3600 { 3601 struct freefrag *freefrag; 3602 3603 if (jfreefrag->fr_jsegdep) { 3604 free_jsegdep(jfreefrag->fr_jsegdep); 3605 jfreefrag->fr_jsegdep = NULL; 3606 } 3607 freefrag = jfreefrag->fr_freefrag; 3608 jfreefrag->fr_freefrag = NULL; 3609 free_jfreefrag(jfreefrag); 3610 freefrag->ff_state |= DEPCOMPLETE; 3611 } 3612 3613 /* 3614 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3615 */ 3616 static void 3617 free_jfreefrag(jfreefrag) 3618 struct jfreefrag *jfreefrag; 3619 { 3620 3621 if (jfreefrag->fr_state & INPROGRESS) 3622 WORKLIST_REMOVE(&jfreefrag->fr_list); 3623 else if (jfreefrag->fr_state & ONWORKLIST) 3624 remove_from_journal(&jfreefrag->fr_list); 3625 if (jfreefrag->fr_freefrag != NULL) 3626 panic("free_jfreefrag: Still attached to a freefrag."); 3627 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3628 } 3629 3630 /* 3631 * Called when the journal write for a jfreefrag completes. The parent 3632 * freefrag is added to the worklist if this completes its dependencies. 3633 */ 3634 static void 3635 handle_written_jfreefrag(jfreefrag) 3636 struct jfreefrag *jfreefrag; 3637 { 3638 struct jsegdep *jsegdep; 3639 struct freefrag *freefrag; 3640 3641 /* Grab the jsegdep. 
*/ 3642 jsegdep = jfreefrag->fr_jsegdep; 3643 jfreefrag->fr_jsegdep = NULL; 3644 freefrag = jfreefrag->fr_freefrag; 3645 if (freefrag == NULL) 3646 panic("handle_written_jfreefrag: No freefrag."); 3647 freefrag->ff_state |= DEPCOMPLETE; 3648 freefrag->ff_jdep = NULL; 3649 jwork_insert(&freefrag->ff_jwork, jsegdep); 3650 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3651 add_to_worklist(&freefrag->ff_list, 0); 3652 jfreefrag->fr_freefrag = NULL; 3653 free_jfreefrag(jfreefrag); 3654 } 3655 3656 /* 3657 * Called when the journal write for a jfreeblk completes. The jfreeblk 3658 * is removed from the freeblks list of pending journal writes and the 3659 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3660 * have been reclaimed. 3661 */ 3662 static void 3663 handle_written_jblkdep(jblkdep) 3664 struct jblkdep *jblkdep; 3665 { 3666 struct freeblks *freeblks; 3667 struct jsegdep *jsegdep; 3668 3669 /* Grab the jsegdep. */ 3670 jsegdep = jblkdep->jb_jsegdep; 3671 jblkdep->jb_jsegdep = NULL; 3672 freeblks = jblkdep->jb_freeblks; 3673 LIST_REMOVE(jblkdep, jb_deps); 3674 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3675 /* 3676 * If the freeblks is all journaled, we can add it to the worklist. 3677 */ 3678 if (LIST_EMPTY(&freeblks->fb_jblkdephd) && 3679 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 3680 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 3681 3682 free_jblkdep(jblkdep); 3683 } 3684 3685 static struct jsegdep * 3686 newjsegdep(struct worklist *wk) 3687 { 3688 struct jsegdep *jsegdep; 3689 3690 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3691 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3692 jsegdep->jd_seg = NULL; 3693 3694 return (jsegdep); 3695 } 3696 3697 static struct jmvref * 3698 newjmvref(dp, ino, oldoff, newoff) 3699 struct inode *dp; 3700 ino_t ino; 3701 off_t oldoff; 3702 off_t newoff; 3703 { 3704 struct jmvref *jmvref; 3705 3706 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3707 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3708 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3709 jmvref->jm_parent = dp->i_number; 3710 jmvref->jm_ino = ino; 3711 jmvref->jm_oldoff = oldoff; 3712 jmvref->jm_newoff = newoff; 3713 3714 return (jmvref); 3715 } 3716 3717 /* 3718 * Allocate a new jremref that tracks the removal of ip from dp with the 3719 * directory entry offset of diroff. Mark the entry as ATTACHED and 3720 * DEPCOMPLETE as we have all the information required for the journal write 3721 * and the directory has already been removed from the buffer. The caller 3722 * is responsible for linking the jremref into the pagedep and adding it 3723 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3724 * a DOTDOT addition so handle_workitem_remove() can properly assign 3725 * the jsegdep when we're done. 
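 * The embedded inoref records the child and parent inode numbers, the
 * directory offset, the link count and the mode, which is everything
 * jremref_write() needs to build the journal record.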
3726 */ 3727 static struct jremref * 3728 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3729 off_t diroff, nlink_t nlink) 3730 { 3731 struct jremref *jremref; 3732 3733 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3734 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3735 jremref->jr_state = ATTACHED; 3736 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3737 nlink, ip->i_mode); 3738 jremref->jr_dirrem = dirrem; 3739 3740 return (jremref); 3741 } 3742 3743 static inline void 3744 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3745 nlink_t nlink, uint16_t mode) 3746 { 3747 3748 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3749 inoref->if_diroff = diroff; 3750 inoref->if_ino = ino; 3751 inoref->if_parent = parent; 3752 inoref->if_nlink = nlink; 3753 inoref->if_mode = mode; 3754 } 3755 3756 /* 3757 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3758 * directory offset may not be known until later. The caller is responsible 3759 * adding the entry to the journal when this information is available. nlink 3760 * should be the link count prior to the addition and mode is only required 3761 * to have the correct FMT. 3762 */ 3763 static struct jaddref * 3764 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3765 uint16_t mode) 3766 { 3767 struct jaddref *jaddref; 3768 3769 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3770 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3771 jaddref->ja_state = ATTACHED; 3772 jaddref->ja_mkdir = NULL; 3773 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3774 3775 return (jaddref); 3776 } 3777 3778 /* 3779 * Create a new free dependency for a freework. The caller is responsible 3780 * for adjusting the reference count when it has the lock held. The freedep 3781 * will track an outstanding bitmap write that will ultimately clear the 3782 * freework to continue. 3783 */ 3784 static struct freedep * 3785 newfreedep(struct freework *freework) 3786 { 3787 struct freedep *freedep; 3788 3789 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3790 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3791 freedep->fd_freework = freework; 3792 3793 return (freedep); 3794 } 3795 3796 /* 3797 * Free a freedep structure once the buffer it is linked to is written. If 3798 * this is the last reference to the freework schedule it for completion. 3799 */ 3800 static void 3801 free_freedep(freedep) 3802 struct freedep *freedep; 3803 { 3804 struct freework *freework; 3805 3806 freework = freedep->fd_freework; 3807 freework->fw_freeblks->fb_cgwait--; 3808 if (--freework->fw_ref == 0) 3809 freework_enqueue(freework); 3810 WORKITEM_FREE(freedep, D_FREEDEP); 3811 } 3812 3813 /* 3814 * Allocate a new freework structure that may be a level in an indirect 3815 * when parent is not NULL or a top level block when it is. The top level 3816 * freework structures are allocated without lk held and before the freeblks 3817 * is visible outside of softdep_setup_freeblocks(). 
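 * On journaled mounts a freework for an indirect block starts with
 * NINDIR() + 1 references, one per child pointer plus one for itself;
 * direct and ext block freeworks start with none.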
3818 */ 3819 static struct freework * 3820 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) 3821 struct ufsmount *ump; 3822 struct freeblks *freeblks; 3823 struct freework *parent; 3824 ufs_lbn_t lbn; 3825 ufs2_daddr_t nb; 3826 int frags; 3827 int off; 3828 int journal; 3829 { 3830 struct freework *freework; 3831 3832 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3833 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3834 freework->fw_state = ATTACHED; 3835 freework->fw_jnewblk = NULL; 3836 freework->fw_freeblks = freeblks; 3837 freework->fw_parent = parent; 3838 freework->fw_lbn = lbn; 3839 freework->fw_blkno = nb; 3840 freework->fw_frags = frags; 3841 freework->fw_indir = NULL; 3842 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) 3843 ? 0 : NINDIR(ump->um_fs) + 1; 3844 freework->fw_start = freework->fw_off = off; 3845 if (journal) 3846 newjfreeblk(freeblks, lbn, nb, frags); 3847 if (parent == NULL) { 3848 ACQUIRE_LOCK(&lk); 3849 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 3850 freeblks->fb_ref++; 3851 FREE_LOCK(&lk); 3852 } 3853 3854 return (freework); 3855 } 3856 3857 /* 3858 * Eliminate a jfreeblk for a block that does not need journaling. 3859 */ 3860 static void 3861 cancel_jfreeblk(freeblks, blkno) 3862 struct freeblks *freeblks; 3863 ufs2_daddr_t blkno; 3864 { 3865 struct jfreeblk *jfreeblk; 3866 struct jblkdep *jblkdep; 3867 3868 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { 3869 if (jblkdep->jb_list.wk_type != D_JFREEBLK) 3870 continue; 3871 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); 3872 if (jfreeblk->jf_blkno == blkno) 3873 break; 3874 } 3875 if (jblkdep == NULL) 3876 return; 3877 free_jsegdep(jblkdep->jb_jsegdep); 3878 LIST_REMOVE(jblkdep, jb_deps); 3879 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3880 } 3881 3882 /* 3883 * Allocate a new jfreeblk to journal top level block pointer when truncating 3884 * a file. The caller must add this to the worklist when lk is held. 3885 */ 3886 static struct jfreeblk * 3887 newjfreeblk(freeblks, lbn, blkno, frags) 3888 struct freeblks *freeblks; 3889 ufs_lbn_t lbn; 3890 ufs2_daddr_t blkno; 3891 int frags; 3892 { 3893 struct jfreeblk *jfreeblk; 3894 3895 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3896 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, 3897 freeblks->fb_list.wk_mp); 3898 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); 3899 jfreeblk->jf_dep.jb_freeblks = freeblks; 3900 jfreeblk->jf_ino = freeblks->fb_inum; 3901 jfreeblk->jf_lbn = lbn; 3902 jfreeblk->jf_blkno = blkno; 3903 jfreeblk->jf_frags = frags; 3904 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); 3905 3906 return (jfreeblk); 3907 } 3908 3909 /* 3910 * Allocate a new jtrunc to track a partial truncation. 
3911 */ 3912 static struct jtrunc * 3913 newjtrunc(freeblks, size, extsize) 3914 struct freeblks *freeblks; 3915 off_t size; 3916 int extsize; 3917 { 3918 struct jtrunc *jtrunc; 3919 3920 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3921 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, 3922 freeblks->fb_list.wk_mp); 3923 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); 3924 jtrunc->jt_dep.jb_freeblks = freeblks; 3925 jtrunc->jt_ino = freeblks->fb_inum; 3926 jtrunc->jt_size = size; 3927 jtrunc->jt_extsize = extsize; 3928 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); 3929 3930 return (jtrunc); 3931 } 3932 3933 /* 3934 * If we're canceling a new bitmap we have to search for another ref 3935 * to move into the bmsafemap dep. This might be better expressed 3936 * with another structure. 3937 */ 3938 static void 3939 move_newblock_dep(jaddref, inodedep) 3940 struct jaddref *jaddref; 3941 struct inodedep *inodedep; 3942 { 3943 struct inoref *inoref; 3944 struct jaddref *jaddrefn; 3945 3946 jaddrefn = NULL; 3947 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3948 inoref = TAILQ_NEXT(inoref, if_deps)) { 3949 if ((jaddref->ja_state & NEWBLOCK) && 3950 inoref->if_list.wk_type == D_JADDREF) { 3951 jaddrefn = (struct jaddref *)inoref; 3952 break; 3953 } 3954 } 3955 if (jaddrefn == NULL) 3956 return; 3957 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3958 jaddrefn->ja_state |= jaddref->ja_state & 3959 (ATTACHED | UNDONE | NEWBLOCK); 3960 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3961 jaddref->ja_state |= ATTACHED; 3962 LIST_REMOVE(jaddref, ja_bmdeps); 3963 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3964 ja_bmdeps); 3965 } 3966 3967 /* 3968 * Cancel a jaddref either before it has been written or while it is being 3969 * written. This happens when a link is removed before the add reaches 3970 * the disk. The jaddref dependency is kept linked into the bmsafemap 3971 * and inode to prevent the link count or bitmap from reaching the disk 3972 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3973 * required. 3974 * 3975 * Returns 1 if the canceled addref requires journaling of the remove and 3976 * 0 otherwise. 3977 */ 3978 static int 3979 cancel_jaddref(jaddref, inodedep, wkhd) 3980 struct jaddref *jaddref; 3981 struct inodedep *inodedep; 3982 struct workhead *wkhd; 3983 { 3984 struct inoref *inoref; 3985 struct jsegdep *jsegdep; 3986 int needsj; 3987 3988 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3989 ("cancel_jaddref: Canceling complete jaddref")); 3990 if (jaddref->ja_state & (INPROGRESS | COMPLETE)) 3991 needsj = 1; 3992 else 3993 needsj = 0; 3994 if (inodedep == NULL) 3995 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3996 0, &inodedep) == 0) 3997 panic("cancel_jaddref: Lost inodedep"); 3998 /* 3999 * We must adjust the nlink of any reference operation that follows 4000 * us so that it is consistent with the in-memory reference. This 4001 * ensures that inode nlink rollbacks always have the correct link. 
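 * As an illustration (the link counts are made up): if two links were
 * added with if_nlink recorded as 2 and then 3, and the first add is
 * canceled before its journal write has even begun, the later
 * record's if_nlink is dropped to 2 below so that any rollback based
 * on it matches the state that will actually reach the disk.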
4002 */ 4003 if (needsj == 0) { 4004 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 4005 inoref = TAILQ_NEXT(inoref, if_deps)) { 4006 if (inoref->if_state & GOINGAWAY) 4007 break; 4008 inoref->if_nlink--; 4009 } 4010 } 4011 jsegdep = inoref_jseg(&jaddref->ja_ref); 4012 if (jaddref->ja_state & NEWBLOCK) 4013 move_newblock_dep(jaddref, inodedep); 4014 wake_worklist(&jaddref->ja_list); 4015 jaddref->ja_mkdir = NULL; 4016 if (jaddref->ja_state & INPROGRESS) { 4017 jaddref->ja_state &= ~INPROGRESS; 4018 WORKLIST_REMOVE(&jaddref->ja_list); 4019 jwork_insert(wkhd, jsegdep); 4020 } else { 4021 free_jsegdep(jsegdep); 4022 if (jaddref->ja_state & DEPCOMPLETE) 4023 remove_from_journal(&jaddref->ja_list); 4024 } 4025 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); 4026 /* 4027 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove 4028 * can arrange for them to be freed with the bitmap. Otherwise we 4029 * no longer need this addref attached to the inoreflst and it 4030 * will incorrectly adjust nlink if we leave it. 4031 */ 4032 if ((jaddref->ja_state & NEWBLOCK) == 0) { 4033 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 4034 if_deps); 4035 jaddref->ja_state |= COMPLETE; 4036 free_jaddref(jaddref); 4037 return (needsj); 4038 } 4039 /* 4040 * Leave the head of the list for jsegdeps for fast merging. 4041 */ 4042 if (LIST_FIRST(wkhd) != NULL) { 4043 jaddref->ja_state |= ONWORKLIST; 4044 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); 4045 } else 4046 WORKLIST_INSERT(wkhd, &jaddref->ja_list); 4047 4048 return (needsj); 4049 } 4050 4051 /* 4052 * Attempt to free a jaddref structure when some work completes. This 4053 * should only succeed once the entry is written and all dependencies have 4054 * been notified. 4055 */ 4056 static void 4057 free_jaddref(jaddref) 4058 struct jaddref *jaddref; 4059 { 4060 4061 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) 4062 return; 4063 if (jaddref->ja_ref.if_jsegdep) 4064 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", 4065 jaddref, jaddref->ja_state); 4066 if (jaddref->ja_state & NEWBLOCK) 4067 LIST_REMOVE(jaddref, ja_bmdeps); 4068 if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) 4069 panic("free_jaddref: Bad state %p(0x%X)", 4070 jaddref, jaddref->ja_state); 4071 if (jaddref->ja_mkdir != NULL) 4072 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); 4073 WORKITEM_FREE(jaddref, D_JADDREF); 4074 } 4075 4076 /* 4077 * Free a jremref structure once it has been written or discarded. 4078 */ 4079 static void 4080 free_jremref(jremref) 4081 struct jremref *jremref; 4082 { 4083 4084 if (jremref->jr_ref.if_jsegdep) 4085 free_jsegdep(jremref->jr_ref.if_jsegdep); 4086 if (jremref->jr_state & INPROGRESS) 4087 panic("free_jremref: IO still pending"); 4088 WORKITEM_FREE(jremref, D_JREMREF); 4089 } 4090 4091 /* 4092 * Free a jnewblk structure. 4093 */ 4094 static void 4095 free_jnewblk(jnewblk) 4096 struct jnewblk *jnewblk; 4097 { 4098 4099 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) 4100 return; 4101 LIST_REMOVE(jnewblk, jn_deps); 4102 if (jnewblk->jn_dep != NULL) 4103 panic("free_jnewblk: Dependency still attached."); 4104 WORKITEM_FREE(jnewblk, D_JNEWBLK); 4105 } 4106 4107 /* 4108 * Cancel a jnewblk which has been been made redundant by frag extension. 
4109 */ 4110 static void 4111 cancel_jnewblk(jnewblk, wkhd) 4112 struct jnewblk *jnewblk; 4113 struct workhead *wkhd; 4114 { 4115 struct jsegdep *jsegdep; 4116 4117 jsegdep = jnewblk->jn_jsegdep; 4118 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4119 panic("cancel_jnewblk: Invalid state"); 4120 jnewblk->jn_jsegdep = NULL; 4121 jnewblk->jn_dep = NULL; 4122 jnewblk->jn_state |= GOINGAWAY; 4123 if (jnewblk->jn_state & INPROGRESS) { 4124 jnewblk->jn_state &= ~INPROGRESS; 4125 WORKLIST_REMOVE(&jnewblk->jn_list); 4126 jwork_insert(wkhd, jsegdep); 4127 } else { 4128 free_jsegdep(jsegdep); 4129 remove_from_journal(&jnewblk->jn_list); 4130 } 4131 wake_worklist(&jnewblk->jn_list); 4132 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4133 } 4134 4135 static void 4136 free_jblkdep(jblkdep) 4137 struct jblkdep *jblkdep; 4138 { 4139 4140 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4141 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4142 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4143 WORKITEM_FREE(jblkdep, D_JTRUNC); 4144 else 4145 panic("free_jblkdep: Unexpected type %s", 4146 TYPENAME(jblkdep->jb_list.wk_type)); 4147 } 4148 4149 /* 4150 * Free a single jseg once it is no longer referenced in memory or on 4151 * disk. Reclaim journal blocks and dependencies waiting for the segment 4152 * to disappear. 4153 */ 4154 static void 4155 free_jseg(jseg, jblocks) 4156 struct jseg *jseg; 4157 struct jblocks *jblocks; 4158 { 4159 struct freework *freework; 4160 4161 /* 4162 * Free freework structures that were lingering to indicate freed 4163 * indirect blocks that forced journal write ordering on reallocate. 4164 */ 4165 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) 4166 indirblk_remove(freework); 4167 if (jblocks->jb_oldestseg == jseg) 4168 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 4169 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 4170 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 4171 KASSERT(LIST_EMPTY(&jseg->js_entries), 4172 ("free_jseg: Freed jseg has valid entries.")); 4173 WORKITEM_FREE(jseg, D_JSEG); 4174 } 4175 4176 /* 4177 * Free all jsegs that meet the criteria for being reclaimed and update 4178 * oldestseg. 4179 */ 4180 static void 4181 free_jsegs(jblocks) 4182 struct jblocks *jblocks; 4183 { 4184 struct jseg *jseg; 4185 4186 /* 4187 * Free only those jsegs which have none allocated before them to 4188 * preserve the journal space ordering. 4189 */ 4190 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 4191 /* 4192 * Only reclaim space when nothing depends on this journal 4193 * set and another set has written that it is no longer 4194 * valid. 4195 */ 4196 if (jseg->js_refs != 0) { 4197 jblocks->jb_oldestseg = jseg; 4198 return; 4199 } 4200 if (!LIST_EMPTY(&jseg->js_indirs) && 4201 jseg->js_seq >= jblocks->jb_oldestwrseq) 4202 break; 4203 free_jseg(jseg, jblocks); 4204 } 4205 /* 4206 * If we exited the loop above we still must discover the 4207 * oldest valid segment. 4208 */ 4209 if (jseg) 4210 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 4211 jseg = TAILQ_NEXT(jseg, js_next)) 4212 if (jseg->js_refs != 0) 4213 break; 4214 jblocks->jb_oldestseg = jseg; 4215 /* 4216 * The journal has no valid records but some jsegs may still be 4217 * waiting on oldestwrseq to advance. We force a small record 4218 * out to permit these lingering records to be reclaimed. 
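 * Setting jb_needseg below records that request; the next journal
 * write then emits a small, possibly otherwise empty, segment, which
 * advances the on-disk write sequence far enough for the jsegs still
 * waiting on jb_oldestwrseq to be reclaimed on a later pass through
 * this routine.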
4219 */ 4220 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 4221 jblocks->jb_needseg = 1; 4222 } 4223 4224 /* 4225 * Release one reference to a jseg and free it if the count reaches 0. This 4226 * should eventually reclaim journal space as well. 4227 */ 4228 static void 4229 rele_jseg(jseg) 4230 struct jseg *jseg; 4231 { 4232 4233 KASSERT(jseg->js_refs > 0, 4234 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 4235 if (--jseg->js_refs != 0) 4236 return; 4237 free_jsegs(jseg->js_jblocks); 4238 } 4239 4240 /* 4241 * Release a jsegdep and decrement the jseg count. 4242 */ 4243 static void 4244 free_jsegdep(jsegdep) 4245 struct jsegdep *jsegdep; 4246 { 4247 4248 if (jsegdep->jd_seg) 4249 rele_jseg(jsegdep->jd_seg); 4250 WORKITEM_FREE(jsegdep, D_JSEGDEP); 4251 } 4252 4253 /* 4254 * Wait for a journal item to make it to disk. Initiate journal processing 4255 * if required. 4256 */ 4257 static int 4258 jwait(wk, waitfor) 4259 struct worklist *wk; 4260 int waitfor; 4261 { 4262 4263 /* 4264 * Blocking journal waits cause slow synchronous behavior. Record 4265 * stats on the frequency of these blocking operations. 4266 */ 4267 if (waitfor == MNT_WAIT) { 4268 stat_journal_wait++; 4269 switch (wk->wk_type) { 4270 case D_JREMREF: 4271 case D_JMVREF: 4272 stat_jwait_filepage++; 4273 break; 4274 case D_JTRUNC: 4275 case D_JFREEBLK: 4276 stat_jwait_freeblks++; 4277 break; 4278 case D_JNEWBLK: 4279 stat_jwait_newblk++; 4280 break; 4281 case D_JADDREF: 4282 stat_jwait_inode++; 4283 break; 4284 default: 4285 break; 4286 } 4287 } 4288 /* 4289 * If IO has not started we process the journal. We can't mark the 4290 * worklist item as IOWAITING because we drop the lock while 4291 * processing the journal and the worklist entry may be freed after 4292 * this point. The caller may call back in and re-issue the request. 4293 */ 4294 if ((wk->wk_state & INPROGRESS) == 0) { 4295 softdep_process_journal(wk->wk_mp, wk, waitfor); 4296 if (waitfor != MNT_WAIT) 4297 return (EBUSY); 4298 return (0); 4299 } 4300 if (waitfor != MNT_WAIT) 4301 return (EBUSY); 4302 wait_worklist(wk, "jwait"); 4303 return (0); 4304 } 4305 4306 /* 4307 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 4308 * appropriate. This is a convenience function to reduce duplicate code 4309 * for the setup and revert functions below. 4310 */ 4311 static struct inodedep * 4312 inodedep_lookup_ip(ip) 4313 struct inode *ip; 4314 { 4315 struct inodedep *inodedep; 4316 int dflags; 4317 4318 KASSERT(ip->i_nlink >= ip->i_effnlink, 4319 ("inodedep_lookup_ip: bad delta")); 4320 dflags = DEPALLOC; 4321 if (IS_SNAPSHOT(ip)) 4322 dflags |= NODELAY; 4323 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, 4324 &inodedep); 4325 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 4326 4327 return (inodedep); 4328 } 4329 4330 /* 4331 * Called prior to creating a new inode and linking it to a directory. The 4332 * jaddref structure must already be allocated by softdep_setup_inomapdep 4333 * and it is discovered here so we can initialize the mode and update 4334 * nlinkdelta. 
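 *
 * One plausible call sequence, sketched from the callers' side (the
 * exact path is up to the VFS layer and is not guaranteed here):
 *
 *	ffs_valloc() and its helpers allocate the inode and, while
 *	    updating the cylinder group, call softdep_setup_inomapdep();
 *	ufs_makeinode() then calls this routine;
 *	ufs_direnter() later calls softdep_setup_directory_add(), which
 *	    supplies the directory offset for the journal record.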
4335 */ 4336 void 4337 softdep_setup_create(dp, ip) 4338 struct inode *dp; 4339 struct inode *ip; 4340 { 4341 struct inodedep *inodedep; 4342 struct jaddref *jaddref; 4343 struct vnode *dvp; 4344 4345 KASSERT(ip->i_nlink == 1, 4346 ("softdep_setup_create: Invalid link count.")); 4347 dvp = ITOV(dp); 4348 ACQUIRE_LOCK(&lk); 4349 inodedep = inodedep_lookup_ip(ip); 4350 if (DOINGSUJ(dvp)) { 4351 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4352 inoreflst); 4353 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 4354 ("softdep_setup_create: No addref structure present.")); 4355 } 4356 softdep_prelink(dvp, NULL); 4357 FREE_LOCK(&lk); 4358 } 4359 4360 /* 4361 * Create a jaddref structure to track the addition of a DOTDOT link when 4362 * we are reparenting an inode as part of a rename. This jaddref will be 4363 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 4364 * non-journaling softdep. 4365 */ 4366 void 4367 softdep_setup_dotdot_link(dp, ip) 4368 struct inode *dp; 4369 struct inode *ip; 4370 { 4371 struct inodedep *inodedep; 4372 struct jaddref *jaddref; 4373 struct vnode *dvp; 4374 struct vnode *vp; 4375 4376 dvp = ITOV(dp); 4377 vp = ITOV(ip); 4378 jaddref = NULL; 4379 /* 4380 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 4381 * is used as a normal link would be. 4382 */ 4383 if (DOINGSUJ(dvp)) 4384 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4385 dp->i_effnlink - 1, dp->i_mode); 4386 ACQUIRE_LOCK(&lk); 4387 inodedep = inodedep_lookup_ip(dp); 4388 if (jaddref) 4389 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4390 if_deps); 4391 softdep_prelink(dvp, ITOV(ip)); 4392 FREE_LOCK(&lk); 4393 } 4394 4395 /* 4396 * Create a jaddref structure to track a new link to an inode. The directory 4397 * offset is not known until softdep_setup_directory_add or 4398 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 4399 * softdep. 4400 */ 4401 void 4402 softdep_setup_link(dp, ip) 4403 struct inode *dp; 4404 struct inode *ip; 4405 { 4406 struct inodedep *inodedep; 4407 struct jaddref *jaddref; 4408 struct vnode *dvp; 4409 4410 dvp = ITOV(dp); 4411 jaddref = NULL; 4412 if (DOINGSUJ(dvp)) 4413 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 4414 ip->i_mode); 4415 ACQUIRE_LOCK(&lk); 4416 inodedep = inodedep_lookup_ip(ip); 4417 if (jaddref) 4418 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4419 if_deps); 4420 softdep_prelink(dvp, ITOV(ip)); 4421 FREE_LOCK(&lk); 4422 } 4423 4424 /* 4425 * Called to create the jaddref structures to track . and .. references as 4426 * well as lookup and further initialize the incomplete jaddref created 4427 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 4428 * nlinkdelta for non-journaling softdep. 
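 *
 * A sketch of the journal records assembled below: the new directory
 * gets a jaddref for "." (nlink of 1, MKDIR_BODY set), the parent
 * inode gets a jaddref recording the new ".." reference to it (the
 * parent's pre-addition effective link count, MKDIR_PARENT set), and
 * the name entry itself is still covered by the incomplete jaddref
 * created when the inode was allocated in softdep_setup_inomapdep().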
4429 */ 4430 void 4431 softdep_setup_mkdir(dp, ip) 4432 struct inode *dp; 4433 struct inode *ip; 4434 { 4435 struct inodedep *inodedep; 4436 struct jaddref *dotdotaddref; 4437 struct jaddref *dotaddref; 4438 struct jaddref *jaddref; 4439 struct vnode *dvp; 4440 4441 dvp = ITOV(dp); 4442 dotaddref = dotdotaddref = NULL; 4443 if (DOINGSUJ(dvp)) { 4444 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 4445 ip->i_mode); 4446 dotaddref->ja_state |= MKDIR_BODY; 4447 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4448 dp->i_effnlink - 1, dp->i_mode); 4449 dotdotaddref->ja_state |= MKDIR_PARENT; 4450 } 4451 ACQUIRE_LOCK(&lk); 4452 inodedep = inodedep_lookup_ip(ip); 4453 if (DOINGSUJ(dvp)) { 4454 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4455 inoreflst); 4456 KASSERT(jaddref != NULL, 4457 ("softdep_setup_mkdir: No addref structure present.")); 4458 KASSERT(jaddref->ja_parent == dp->i_number, 4459 ("softdep_setup_mkdir: bad parent %d", 4460 jaddref->ja_parent)); 4461 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 4462 if_deps); 4463 } 4464 inodedep = inodedep_lookup_ip(dp); 4465 if (DOINGSUJ(dvp)) 4466 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 4467 &dotdotaddref->ja_ref, if_deps); 4468 softdep_prelink(ITOV(dp), NULL); 4469 FREE_LOCK(&lk); 4470 } 4471 4472 /* 4473 * Called to track nlinkdelta of the inode and parent directories prior to 4474 * unlinking a directory. 4475 */ 4476 void 4477 softdep_setup_rmdir(dp, ip) 4478 struct inode *dp; 4479 struct inode *ip; 4480 { 4481 struct vnode *dvp; 4482 4483 dvp = ITOV(dp); 4484 ACQUIRE_LOCK(&lk); 4485 (void) inodedep_lookup_ip(ip); 4486 (void) inodedep_lookup_ip(dp); 4487 softdep_prelink(dvp, ITOV(ip)); 4488 FREE_LOCK(&lk); 4489 } 4490 4491 /* 4492 * Called to track nlinkdelta of the inode and parent directories prior to 4493 * unlink. 4494 */ 4495 void 4496 softdep_setup_unlink(dp, ip) 4497 struct inode *dp; 4498 struct inode *ip; 4499 { 4500 struct vnode *dvp; 4501 4502 dvp = ITOV(dp); 4503 ACQUIRE_LOCK(&lk); 4504 (void) inodedep_lookup_ip(ip); 4505 (void) inodedep_lookup_ip(dp); 4506 softdep_prelink(dvp, ITOV(ip)); 4507 FREE_LOCK(&lk); 4508 } 4509 4510 /* 4511 * Called to release the journal structures created by a failed non-directory 4512 * creation. Adjusts nlinkdelta for non-journaling softdep. 4513 */ 4514 void 4515 softdep_revert_create(dp, ip) 4516 struct inode *dp; 4517 struct inode *ip; 4518 { 4519 struct inodedep *inodedep; 4520 struct jaddref *jaddref; 4521 struct vnode *dvp; 4522 4523 dvp = ITOV(dp); 4524 ACQUIRE_LOCK(&lk); 4525 inodedep = inodedep_lookup_ip(ip); 4526 if (DOINGSUJ(dvp)) { 4527 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4528 inoreflst); 4529 KASSERT(jaddref->ja_parent == dp->i_number, 4530 ("softdep_revert_create: addref parent mismatch")); 4531 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4532 } 4533 FREE_LOCK(&lk); 4534 } 4535 4536 /* 4537 * Called to release the journal structures created by a failed dotdot link 4538 * creation. Adjusts nlinkdelta for non-journaling softdep. 
4539 */ 4540 void 4541 softdep_revert_dotdot_link(dp, ip) 4542 struct inode *dp; 4543 struct inode *ip; 4544 { 4545 struct inodedep *inodedep; 4546 struct jaddref *jaddref; 4547 struct vnode *dvp; 4548 4549 dvp = ITOV(dp); 4550 ACQUIRE_LOCK(&lk); 4551 inodedep = inodedep_lookup_ip(dp); 4552 if (DOINGSUJ(dvp)) { 4553 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4554 inoreflst); 4555 KASSERT(jaddref->ja_parent == ip->i_number, 4556 ("softdep_revert_dotdot_link: addref parent mismatch")); 4557 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4558 } 4559 FREE_LOCK(&lk); 4560 } 4561 4562 /* 4563 * Called to release the journal structures created by a failed link 4564 * addition. Adjusts nlinkdelta for non-journaling softdep. 4565 */ 4566 void 4567 softdep_revert_link(dp, ip) 4568 struct inode *dp; 4569 struct inode *ip; 4570 { 4571 struct inodedep *inodedep; 4572 struct jaddref *jaddref; 4573 struct vnode *dvp; 4574 4575 dvp = ITOV(dp); 4576 ACQUIRE_LOCK(&lk); 4577 inodedep = inodedep_lookup_ip(ip); 4578 if (DOINGSUJ(dvp)) { 4579 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4580 inoreflst); 4581 KASSERT(jaddref->ja_parent == dp->i_number, 4582 ("softdep_revert_link: addref parent mismatch")); 4583 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4584 } 4585 FREE_LOCK(&lk); 4586 } 4587 4588 /* 4589 * Called to release the journal structures created by a failed mkdir 4590 * attempt. Adjusts nlinkdelta for non-journaling softdep. 4591 */ 4592 void 4593 softdep_revert_mkdir(dp, ip) 4594 struct inode *dp; 4595 struct inode *ip; 4596 { 4597 struct inodedep *inodedep; 4598 struct jaddref *jaddref; 4599 struct jaddref *dotaddref; 4600 struct vnode *dvp; 4601 4602 dvp = ITOV(dp); 4603 4604 ACQUIRE_LOCK(&lk); 4605 inodedep = inodedep_lookup_ip(dp); 4606 if (DOINGSUJ(dvp)) { 4607 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4608 inoreflst); 4609 KASSERT(jaddref->ja_parent == ip->i_number, 4610 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4611 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4612 } 4613 inodedep = inodedep_lookup_ip(ip); 4614 if (DOINGSUJ(dvp)) { 4615 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4616 inoreflst); 4617 KASSERT(jaddref->ja_parent == dp->i_number, 4618 ("softdep_revert_mkdir: addref parent mismatch")); 4619 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 4620 inoreflst, if_deps); 4621 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4622 KASSERT(dotaddref->ja_parent == ip->i_number, 4623 ("softdep_revert_mkdir: dot addref parent mismatch")); 4624 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 4625 } 4626 FREE_LOCK(&lk); 4627 } 4628 4629 /* 4630 * Called to correct nlinkdelta after a failed rmdir. 4631 */ 4632 void 4633 softdep_revert_rmdir(dp, ip) 4634 struct inode *dp; 4635 struct inode *ip; 4636 { 4637 4638 ACQUIRE_LOCK(&lk); 4639 (void) inodedep_lookup_ip(ip); 4640 (void) inodedep_lookup_ip(dp); 4641 FREE_LOCK(&lk); 4642 } 4643 4644 /* 4645 * Protecting the freemaps (or bitmaps). 4646 * 4647 * To eliminate the need to execute fsck before mounting a filesystem 4648 * after a power failure, one must (conservatively) guarantee that the 4649 * on-disk copy of the bitmaps never indicate that a live inode or block is 4650 * free. So, when a block or inode is allocated, the bitmap should be 4651 * updated (on disk) before any new pointers. 
When a block or inode is 4652 * freed, the bitmap should not be updated until all pointers have been 4653 * reset. The latter dependency is handled by the delayed de-allocation 4654 * approach described below for block and inode de-allocation. The former 4655 * dependency is handled by calling the following procedure when a block or 4656 * inode is allocated. When an inode is allocated an "inodedep" is created 4657 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 4658 * Each "inodedep" is also inserted into the hash indexing structure so 4659 * that any additional link additions can be made dependent on the inode 4660 * allocation. 4661 * 4662 * The ufs filesystem maintains a number of free block counts (e.g., per 4663 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 4664 * in addition to the bitmaps. These counts are used to improve efficiency 4665 * during allocation and therefore must be consistent with the bitmaps. 4666 * There is no convenient way to guarantee post-crash consistency of these 4667 * counts with simple update ordering, for two main reasons: (1) The counts 4668 * and bitmaps for a single cylinder group block are not in the same disk 4669 * sector. If a disk write is interrupted (e.g., by power failure), one may 4670 * be written and the other not. (2) Some of the counts are located in the 4671 * superblock rather than the cylinder group block. So, we focus our soft 4672 * updates implementation on protecting the bitmaps. When mounting a 4673 * filesystem, we recompute the auxiliary counts from the bitmaps. 4674 */ 4675 4676 /* 4677 * Called just after updating the cylinder group block to allocate an inode. 4678 */ 4679 void 4680 softdep_setup_inomapdep(bp, ip, newinum, mode) 4681 struct buf *bp; /* buffer for cylgroup block with inode map */ 4682 struct inode *ip; /* inode related to allocation */ 4683 ino_t newinum; /* new inode number being allocated */ 4684 int mode; 4685 { 4686 struct inodedep *inodedep; 4687 struct bmsafemap *bmsafemap; 4688 struct jaddref *jaddref; 4689 struct mount *mp; 4690 struct fs *fs; 4691 4692 mp = UFSTOVFS(ip->i_ump); 4693 fs = ip->i_ump->um_fs; 4694 jaddref = NULL; 4695 4696 /* 4697 * Allocate the journal reference add structure so that the bitmap 4698 * can be dependent on it. 4699 */ 4700 if (MOUNTEDSUJ(mp)) { 4701 jaddref = newjaddref(ip, newinum, 0, 0, mode); 4702 jaddref->ja_state |= NEWBLOCK; 4703 } 4704 4705 /* 4706 * Create a dependency for the newly allocated inode. 4707 * Panic if it already exists as something is seriously wrong. 4708 * Otherwise add it to the dependency list for the buffer holding 4709 * the cylinder group map from which it was allocated. 4710 */ 4711 ACQUIRE_LOCK(&lk); 4712 if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep))) 4713 panic("softdep_setup_inomapdep: dependency %p for new" 4714 "inode already exists", inodedep); 4715 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4716 if (jaddref) { 4717 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4718 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4719 if_deps); 4720 } else { 4721 inodedep->id_state |= ONDEPLIST; 4722 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4723 } 4724 inodedep->id_bmsafemap = bmsafemap; 4725 inodedep->id_state &= ~DEPCOMPLETE; 4726 FREE_LOCK(&lk); 4727 } 4728 4729 /* 4730 * Called just after updating the cylinder group block to 4731 * allocate block or fragment. 
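 *
 * Two variants of the dependency are set up below: with journaling
 * (SUJ) a jnewblk record is hung off the bmsafemap, so the bitmap
 * write is additionally tied to the journal entry for the
 * allocation; without it the newblk itself is linked onto the
 * bmsafemap's dependency list and is completed when the cylinder
 * group buffer makes it to disk.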
4732 */ 4733 void 4734 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4735 struct buf *bp; /* buffer for cylgroup block with block map */ 4736 struct mount *mp; /* filesystem doing allocation */ 4737 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4738 int frags; /* Number of fragments. */ 4739 int oldfrags; /* Previous number of fragments for extend. */ 4740 { 4741 struct newblk *newblk; 4742 struct bmsafemap *bmsafemap; 4743 struct jnewblk *jnewblk; 4744 struct fs *fs; 4745 4746 fs = VFSTOUFS(mp)->um_fs; 4747 jnewblk = NULL; 4748 /* 4749 * Create a dependency for the newly allocated block. 4750 * Add it to the dependency list for the buffer holding 4751 * the cylinder group map from which it was allocated. 4752 */ 4753 if (MOUNTEDSUJ(mp)) { 4754 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4755 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4756 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4757 jnewblk->jn_state = ATTACHED; 4758 jnewblk->jn_blkno = newblkno; 4759 jnewblk->jn_frags = frags; 4760 jnewblk->jn_oldfrags = oldfrags; 4761 #ifdef SUJ_DEBUG 4762 { 4763 struct cg *cgp; 4764 uint8_t *blksfree; 4765 long bno; 4766 int i; 4767 4768 cgp = (struct cg *)bp->b_data; 4769 blksfree = cg_blksfree(cgp); 4770 bno = dtogd(fs, jnewblk->jn_blkno); 4771 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4772 i++) { 4773 if (isset(blksfree, bno + i)) 4774 panic("softdep_setup_blkmapdep: " 4775 "free fragment %d from %d-%d " 4776 "state 0x%X dep %p", i, 4777 jnewblk->jn_oldfrags, 4778 jnewblk->jn_frags, 4779 jnewblk->jn_state, 4780 jnewblk->jn_dep); 4781 } 4782 } 4783 #endif 4784 } 4785 ACQUIRE_LOCK(&lk); 4786 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4787 panic("softdep_setup_blkmapdep: found block"); 4788 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4789 dtog(fs, newblkno)); 4790 if (jnewblk) { 4791 jnewblk->jn_dep = (struct worklist *)newblk; 4792 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4793 } else { 4794 newblk->nb_state |= ONDEPLIST; 4795 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4796 } 4797 newblk->nb_bmsafemap = bmsafemap; 4798 newblk->nb_jnewblk = jnewblk; 4799 FREE_LOCK(&lk); 4800 } 4801 4802 #define BMSAFEMAP_HASH(fs, cg) \ 4803 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4804 4805 static int 4806 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4807 struct bmsafemap_hashhead *bmsafemaphd; 4808 struct mount *mp; 4809 int cg; 4810 struct bmsafemap **bmsafemapp; 4811 { 4812 struct bmsafemap *bmsafemap; 4813 4814 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4815 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4816 break; 4817 if (bmsafemap) { 4818 *bmsafemapp = bmsafemap; 4819 return (1); 4820 } 4821 *bmsafemapp = NULL; 4822 4823 return (0); 4824 } 4825 4826 /* 4827 * Find the bmsafemap associated with a cylinder group buffer. 4828 * If none exists, create one. The buffer must be locked when 4829 * this routine is called and this routine must be called with 4830 * splbio interrupts blocked. 
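 *
 * Note on the locking pattern below: because the allocation may
 * sleep, lk is dropped while a candidate bmsafemap is set up, and the
 * hash is searched again once the lock is retaken; if another thread
 * installed one in the meantime the extra copy is simply freed and
 * the existing structure is returned.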
4831 */ 4832 static struct bmsafemap * 4833 bmsafemap_lookup(mp, bp, cg) 4834 struct mount *mp; 4835 struct buf *bp; 4836 int cg; 4837 { 4838 struct bmsafemap_hashhead *bmsafemaphd; 4839 struct bmsafemap *bmsafemap, *collision; 4840 struct worklist *wk; 4841 struct fs *fs; 4842 4843 mtx_assert(&lk, MA_OWNED); 4844 if (bp) 4845 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4846 if (wk->wk_type == D_BMSAFEMAP) 4847 return (WK_BMSAFEMAP(wk)); 4848 fs = VFSTOUFS(mp)->um_fs; 4849 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4850 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4851 return (bmsafemap); 4852 FREE_LOCK(&lk); 4853 bmsafemap = malloc(sizeof(struct bmsafemap), 4854 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4855 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4856 bmsafemap->sm_buf = bp; 4857 LIST_INIT(&bmsafemap->sm_inodedephd); 4858 LIST_INIT(&bmsafemap->sm_inodedepwr); 4859 LIST_INIT(&bmsafemap->sm_newblkhd); 4860 LIST_INIT(&bmsafemap->sm_newblkwr); 4861 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4862 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4863 LIST_INIT(&bmsafemap->sm_freehd); 4864 LIST_INIT(&bmsafemap->sm_freewr); 4865 ACQUIRE_LOCK(&lk); 4866 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4867 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4868 return (collision); 4869 } 4870 bmsafemap->sm_cg = cg; 4871 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4872 LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next); 4873 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4874 return (bmsafemap); 4875 } 4876 4877 /* 4878 * Direct block allocation dependencies. 4879 * 4880 * When a new block is allocated, the corresponding disk locations must be 4881 * initialized (with zeros or new data) before the on-disk inode points to 4882 * them. Also, the freemap from which the block was allocated must be 4883 * updated (on disk) before the inode's pointer. These two dependencies are 4884 * independent of each other and are needed for all file blocks and indirect 4885 * blocks that are pointed to directly by the inode. Just before the 4886 * "in-core" version of the inode is updated with a newly allocated block 4887 * number, a procedure (below) is called to setup allocation dependency 4888 * structures. These structures are removed when the corresponding 4889 * dependencies are satisfied or when the block allocation becomes obsolete 4890 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4891 * fragment that gets upgraded). All of these cases are handled in 4892 * procedures described later. 4893 * 4894 * When a file extension causes a fragment to be upgraded, either to a larger 4895 * fragment or to a full block, the on-disk location may change (if the 4896 * previous fragment could not simply be extended). In this case, the old 4897 * fragment must be de-allocated, but not until after the inode's pointer has 4898 * been updated. In most cases, this is handled by later procedures, which 4899 * will construct a "freefrag" structure to be added to the workitem queue 4900 * when the inode update is complete (or obsolete). The main exception to 4901 * this is when an allocation occurs while a pending allocation dependency 4902 * (for the same block pointer) remains. This case is handled in the main 4903 * allocation dependency setup procedure by immediately freeing the 4904 * unreferenced fragments. 
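 *
 * A condensed, purely illustrative ordering for one newly allocated
 * file block (not a literal trace of the routines below):
 *
 *	1) the cylinder group bitmap is updated and an allocdirect
 *	   dependency is attached to the inode;
 *	2) the new block's contents are written;
 *	3) the cylinder group buffer is written, satisfying the bitmap
 *	   half of the dependency;
 *	4) the inode block is written with the new pointer.
 *
 * Until 2) and 3) have happened, writes of the inode block roll the
 * pointer back to its previous, safe value.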
4905 */ 4906 void 4907 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 4908 struct inode *ip; /* inode to which block is being added */ 4909 ufs_lbn_t off; /* block pointer within inode */ 4910 ufs2_daddr_t newblkno; /* disk block number being added */ 4911 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ 4912 long newsize; /* size of new block */ 4913 long oldsize; /* size of new block */ 4914 struct buf *bp; /* bp for allocated block */ 4915 { 4916 struct allocdirect *adp, *oldadp; 4917 struct allocdirectlst *adphead; 4918 struct freefrag *freefrag; 4919 struct inodedep *inodedep; 4920 struct pagedep *pagedep; 4921 struct jnewblk *jnewblk; 4922 struct newblk *newblk; 4923 struct mount *mp; 4924 ufs_lbn_t lbn; 4925 4926 lbn = bp->b_lblkno; 4927 mp = UFSTOVFS(ip->i_ump); 4928 if (oldblkno && oldblkno != newblkno) 4929 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 4930 else 4931 freefrag = NULL; 4932 4933 ACQUIRE_LOCK(&lk); 4934 if (off >= NDADDR) { 4935 if (lbn > 0) 4936 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", 4937 lbn, off); 4938 /* allocating an indirect block */ 4939 if (oldblkno != 0) 4940 panic("softdep_setup_allocdirect: non-zero indir"); 4941 } else { 4942 if (off != lbn) 4943 panic("softdep_setup_allocdirect: lbn %jd != off %jd", 4944 lbn, off); 4945 /* 4946 * Allocating a direct block. 4947 * 4948 * If we are allocating a directory block, then we must 4949 * allocate an associated pagedep to track additions and 4950 * deletions. 4951 */ 4952 if ((ip->i_mode & IFMT) == IFDIR) 4953 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, 4954 &pagedep); 4955 } 4956 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 4957 panic("softdep_setup_allocdirect: lost block"); 4958 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 4959 ("softdep_setup_allocdirect: newblk already initialized")); 4960 /* 4961 * Convert the newblk to an allocdirect. 4962 */ 4963 newblk->nb_list.wk_type = D_ALLOCDIRECT; 4964 adp = (struct allocdirect *)newblk; 4965 newblk->nb_freefrag = freefrag; 4966 adp->ad_offset = off; 4967 adp->ad_oldblkno = oldblkno; 4968 adp->ad_newsize = newsize; 4969 adp->ad_oldsize = oldsize; 4970 4971 /* 4972 * Finish initializing the journal. 4973 */ 4974 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 4975 jnewblk->jn_ino = ip->i_number; 4976 jnewblk->jn_lbn = lbn; 4977 add_to_journal(&jnewblk->jn_list); 4978 } 4979 if (freefrag && freefrag->ff_jdep != NULL && 4980 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 4981 add_to_journal(freefrag->ff_jdep); 4982 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 4983 adp->ad_inodedep = inodedep; 4984 4985 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 4986 /* 4987 * The list of allocdirects must be kept in sorted and ascending 4988 * order so that the rollback routines can quickly determine the 4989 * first uncommitted block (the size of the file stored on disk 4990 * ends at the end of the lowest committed fragment, or if there 4991 * are no fragments, at the end of the highest committed block). 4992 * Since files generally grow, the typical case is that the new 4993 * block is to be added at the end of the list. We speed this 4994 * special case by checking against the last allocdirect in the 4995 * list before laboriously traversing the list looking for the 4996 * insertion point. 
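 * As a small example (offsets invented): with entries for offsets 0
 * through 4 and 6 already on the list, adding an allocdirect for
 * offset 7 takes the fast append path below, while one for offset 5
 * is spliced in between 4 and 6, keeping the list in file order so
 * the rollback routines can stop at the first uncommitted entry.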
4997 */ 4998 adphead = &inodedep->id_newinoupdt; 4999 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5000 if (oldadp == NULL || oldadp->ad_offset <= off) { 5001 /* insert at end of list */ 5002 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5003 if (oldadp != NULL && oldadp->ad_offset == off) 5004 allocdirect_merge(adphead, adp, oldadp); 5005 FREE_LOCK(&lk); 5006 return; 5007 } 5008 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5009 if (oldadp->ad_offset >= off) 5010 break; 5011 } 5012 if (oldadp == NULL) 5013 panic("softdep_setup_allocdirect: lost entry"); 5014 /* insert in middle of list */ 5015 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5016 if (oldadp->ad_offset == off) 5017 allocdirect_merge(adphead, adp, oldadp); 5018 5019 FREE_LOCK(&lk); 5020 } 5021 5022 /* 5023 * Merge a newer and older journal record to be stored either in a 5024 * newblock or freefrag. This handles aggregating journal records for 5025 * fragment allocation into a second record as well as replacing a 5026 * journal free with an aborted journal allocation. A segment for the 5027 * oldest record will be placed on wkhd if it has been written. If not 5028 * the segment for the newer record will suffice. 5029 */ 5030 static struct worklist * 5031 jnewblk_merge(new, old, wkhd) 5032 struct worklist *new; 5033 struct worklist *old; 5034 struct workhead *wkhd; 5035 { 5036 struct jnewblk *njnewblk; 5037 struct jnewblk *jnewblk; 5038 5039 /* Handle NULLs to simplify callers. */ 5040 if (new == NULL) 5041 return (old); 5042 if (old == NULL) 5043 return (new); 5044 /* Replace a jfreefrag with a jnewblk. */ 5045 if (new->wk_type == D_JFREEFRAG) { 5046 cancel_jfreefrag(WK_JFREEFRAG(new)); 5047 return (old); 5048 } 5049 /* 5050 * Handle merging of two jnewblk records that describe 5051 * different sets of fragments in the same block. 5052 */ 5053 jnewblk = WK_JNEWBLK(old); 5054 njnewblk = WK_JNEWBLK(new); 5055 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5056 panic("jnewblk_merge: Merging disparate blocks."); 5057 /* 5058 * The record may be rolled back in the cg. 5059 */ 5060 if (jnewblk->jn_state & UNDONE) { 5061 jnewblk->jn_state &= ~UNDONE; 5062 njnewblk->jn_state |= UNDONE; 5063 njnewblk->jn_state &= ~ATTACHED; 5064 } 5065 /* 5066 * We modify the newer addref and free the older so that if neither 5067 * has been written the most up-to-date copy will be on disk. If 5068 * both have been written but rolled back we only temporarily need 5069 * one of them to fix the bits when the cg write completes. 5070 */ 5071 jnewblk->jn_state |= ATTACHED | COMPLETE; 5072 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5073 cancel_jnewblk(jnewblk, wkhd); 5074 WORKLIST_REMOVE(&jnewblk->jn_list); 5075 free_jnewblk(jnewblk); 5076 return (new); 5077 } 5078 5079 /* 5080 * Replace an old allocdirect dependency with a newer one. 5081 * This routine must be called with splbio interrupts blocked. 
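 *
 * This only happens when the same logical block is allocated again,
 * for instance when a fragment is extended or reallocated, before
 * the dependency for the earlier allocation has completed.  The old
 * allocdirect is folded into the new one below, and the fragment it
 * replaced, if any, is freed once it is safe to do so.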
5082 */ 5083 static void 5084 allocdirect_merge(adphead, newadp, oldadp) 5085 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 5086 struct allocdirect *newadp; /* allocdirect being added */ 5087 struct allocdirect *oldadp; /* existing allocdirect being checked */ 5088 { 5089 struct worklist *wk; 5090 struct freefrag *freefrag; 5091 5092 freefrag = NULL; 5093 mtx_assert(&lk, MA_OWNED); 5094 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 5095 newadp->ad_oldsize != oldadp->ad_newsize || 5096 newadp->ad_offset >= NDADDR) 5097 panic("%s %jd != new %jd || old size %ld != new %ld", 5098 "allocdirect_merge: old blkno", 5099 (intmax_t)newadp->ad_oldblkno, 5100 (intmax_t)oldadp->ad_newblkno, 5101 newadp->ad_oldsize, oldadp->ad_newsize); 5102 newadp->ad_oldblkno = oldadp->ad_oldblkno; 5103 newadp->ad_oldsize = oldadp->ad_oldsize; 5104 /* 5105 * If the old dependency had a fragment to free or had never 5106 * previously had a block allocated, then the new dependency 5107 * can immediately post its freefrag and adopt the old freefrag. 5108 * This action is done by swapping the freefrag dependencies. 5109 * The new dependency gains the old one's freefrag, and the 5110 * old one gets the new one and then immediately puts it on 5111 * the worklist when it is freed by free_newblk. It is 5112 * not possible to do this swap when the old dependency had a 5113 * non-zero size but no previous fragment to free. This condition 5114 * arises when the new block is an extension of the old block. 5115 * Here, the first part of the fragment allocated to the new 5116 * dependency is part of the block currently claimed on disk by 5117 * the old dependency, so cannot legitimately be freed until the 5118 * conditions for the new dependency are fulfilled. 5119 */ 5120 freefrag = newadp->ad_freefrag; 5121 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 5122 newadp->ad_freefrag = oldadp->ad_freefrag; 5123 oldadp->ad_freefrag = freefrag; 5124 } 5125 /* 5126 * If we are tracking a new directory-block allocation, 5127 * move it from the old allocdirect to the new allocdirect. 5128 */ 5129 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 5130 WORKLIST_REMOVE(wk); 5131 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 5132 panic("allocdirect_merge: extra newdirblk"); 5133 WORKLIST_INSERT(&newadp->ad_newdirblk, wk); 5134 } 5135 TAILQ_REMOVE(adphead, oldadp, ad_next); 5136 /* 5137 * We need to move any journal dependencies over to the freefrag 5138 * that releases this block if it exists. Otherwise we are 5139 * extending an existing block and we'll wait until that is 5140 * complete to release the journal space and extend the 5141 * new journal to cover this old space as well. 
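 * Concretely, the code below merges the two jnewblk records and
 * cancels the old newblk into the new block's journal work when
 * there is no freefrag (the new block simply extends the old one in
 * place); otherwise the old newblk's journal dependencies are handed
 * to the freefrag so they are retired when the replaced fragment is
 * finally released.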
5142 */ 5143 if (freefrag == NULL) { 5144 if (oldadp->ad_newblkno != newadp->ad_newblkno) 5145 panic("allocdirect_merge: %jd != %jd", 5146 oldadp->ad_newblkno, newadp->ad_newblkno); 5147 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 5148 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 5149 &oldadp->ad_block.nb_jnewblk->jn_list, 5150 &newadp->ad_block.nb_jwork); 5151 oldadp->ad_block.nb_jnewblk = NULL; 5152 cancel_newblk(&oldadp->ad_block, NULL, 5153 &newadp->ad_block.nb_jwork); 5154 } else { 5155 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 5156 &freefrag->ff_list, &freefrag->ff_jwork); 5157 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 5158 &freefrag->ff_jwork); 5159 } 5160 free_newblk(&oldadp->ad_block); 5161 } 5162 5163 /* 5164 * Allocate a jfreefrag structure to journal a single block free. 5165 */ 5166 static struct jfreefrag * 5167 newjfreefrag(freefrag, ip, blkno, size, lbn) 5168 struct freefrag *freefrag; 5169 struct inode *ip; 5170 ufs2_daddr_t blkno; 5171 long size; 5172 ufs_lbn_t lbn; 5173 { 5174 struct jfreefrag *jfreefrag; 5175 struct fs *fs; 5176 5177 fs = ip->i_fs; 5178 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 5179 M_SOFTDEP_FLAGS); 5180 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 5181 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 5182 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 5183 jfreefrag->fr_ino = ip->i_number; 5184 jfreefrag->fr_lbn = lbn; 5185 jfreefrag->fr_blkno = blkno; 5186 jfreefrag->fr_frags = numfrags(fs, size); 5187 jfreefrag->fr_freefrag = freefrag; 5188 5189 return (jfreefrag); 5190 } 5191 5192 /* 5193 * Allocate a new freefrag structure. 5194 */ 5195 static struct freefrag * 5196 newfreefrag(ip, blkno, size, lbn) 5197 struct inode *ip; 5198 ufs2_daddr_t blkno; 5199 long size; 5200 ufs_lbn_t lbn; 5201 { 5202 struct freefrag *freefrag; 5203 struct fs *fs; 5204 5205 fs = ip->i_fs; 5206 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 5207 panic("newfreefrag: frag size"); 5208 freefrag = malloc(sizeof(struct freefrag), 5209 M_FREEFRAG, M_SOFTDEP_FLAGS); 5210 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 5211 freefrag->ff_state = ATTACHED; 5212 LIST_INIT(&freefrag->ff_jwork); 5213 freefrag->ff_inum = ip->i_number; 5214 freefrag->ff_vtype = ITOV(ip)->v_type; 5215 freefrag->ff_blkno = blkno; 5216 freefrag->ff_fragsize = size; 5217 5218 if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { 5219 freefrag->ff_jdep = (struct worklist *) 5220 newjfreefrag(freefrag, ip, blkno, size, lbn); 5221 } else { 5222 freefrag->ff_state |= DEPCOMPLETE; 5223 freefrag->ff_jdep = NULL; 5224 } 5225 5226 return (freefrag); 5227 } 5228 5229 /* 5230 * This workitem de-allocates fragments that were replaced during 5231 * file block allocation. 5232 */ 5233 static void 5234 handle_workitem_freefrag(freefrag) 5235 struct freefrag *freefrag; 5236 { 5237 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 5238 struct workhead wkhd; 5239 5240 /* 5241 * It would be illegal to add new completion items to the 5242 * freefrag after it was schedule to be done so it must be 5243 * safe to modify the list head here. 5244 */ 5245 LIST_INIT(&wkhd); 5246 ACQUIRE_LOCK(&lk); 5247 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 5248 /* 5249 * If the journal has not been written we must cancel it here. 
5250 */ 5251 if (freefrag->ff_jdep) { 5252 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 5253 panic("handle_workitem_freefrag: Unexpected type %d\n", 5254 freefrag->ff_jdep->wk_type); 5255 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 5256 } 5257 FREE_LOCK(&lk); 5258 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 5259 freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); 5260 ACQUIRE_LOCK(&lk); 5261 WORKITEM_FREE(freefrag, D_FREEFRAG); 5262 FREE_LOCK(&lk); 5263 } 5264 5265 /* 5266 * Set up a dependency structure for an external attributes data block. 5267 * This routine follows much of the structure of softdep_setup_allocdirect. 5268 * See the description of softdep_setup_allocdirect above for details. 5269 */ 5270 void 5271 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 5272 struct inode *ip; 5273 ufs_lbn_t off; 5274 ufs2_daddr_t newblkno; 5275 ufs2_daddr_t oldblkno; 5276 long newsize; 5277 long oldsize; 5278 struct buf *bp; 5279 { 5280 struct allocdirect *adp, *oldadp; 5281 struct allocdirectlst *adphead; 5282 struct freefrag *freefrag; 5283 struct inodedep *inodedep; 5284 struct jnewblk *jnewblk; 5285 struct newblk *newblk; 5286 struct mount *mp; 5287 ufs_lbn_t lbn; 5288 5289 if (off >= NXADDR) 5290 panic("softdep_setup_allocext: lbn %lld > NXADDR", 5291 (long long)off); 5292 5293 lbn = bp->b_lblkno; 5294 mp = UFSTOVFS(ip->i_ump); 5295 if (oldblkno && oldblkno != newblkno) 5296 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 5297 else 5298 freefrag = NULL; 5299 5300 ACQUIRE_LOCK(&lk); 5301 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 5302 panic("softdep_setup_allocext: lost block"); 5303 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5304 ("softdep_setup_allocext: newblk already initialized")); 5305 /* 5306 * Convert the newblk to an allocdirect. 5307 */ 5308 newblk->nb_list.wk_type = D_ALLOCDIRECT; 5309 adp = (struct allocdirect *)newblk; 5310 newblk->nb_freefrag = freefrag; 5311 adp->ad_offset = off; 5312 adp->ad_oldblkno = oldblkno; 5313 adp->ad_newsize = newsize; 5314 adp->ad_oldsize = oldsize; 5315 adp->ad_state |= EXTDATA; 5316 5317 /* 5318 * Finish initializing the journal. 5319 */ 5320 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5321 jnewblk->jn_ino = ip->i_number; 5322 jnewblk->jn_lbn = lbn; 5323 add_to_journal(&jnewblk->jn_list); 5324 } 5325 if (freefrag && freefrag->ff_jdep != NULL && 5326 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5327 add_to_journal(freefrag->ff_jdep); 5328 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 5329 adp->ad_inodedep = inodedep; 5330 5331 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 5332 /* 5333 * The list of allocdirects must be kept in sorted and ascending 5334 * order so that the rollback routines can quickly determine the 5335 * first uncommitted block (the size of the file stored on disk 5336 * ends at the end of the lowest committed fragment, or if there 5337 * are no fragments, at the end of the highest committed block). 5338 * Since files generally grow, the typical case is that the new 5339 * block is to be added at the end of the list. We speed this 5340 * special case by checking against the last allocdirect in the 5341 * list before laboriously traversing the list looking for the 5342 * insertion point. 
5343 */ 5344 adphead = &inodedep->id_newextupdt; 5345 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5346 if (oldadp == NULL || oldadp->ad_offset <= off) { 5347 /* insert at end of list */ 5348 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5349 if (oldadp != NULL && oldadp->ad_offset == off) 5350 allocdirect_merge(adphead, adp, oldadp); 5351 FREE_LOCK(&lk); 5352 return; 5353 } 5354 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5355 if (oldadp->ad_offset >= off) 5356 break; 5357 } 5358 if (oldadp == NULL) 5359 panic("softdep_setup_allocext: lost entry"); 5360 /* insert in middle of list */ 5361 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5362 if (oldadp->ad_offset == off) 5363 allocdirect_merge(adphead, adp, oldadp); 5364 FREE_LOCK(&lk); 5365 } 5366 5367 /* 5368 * Indirect block allocation dependencies. 5369 * 5370 * The same dependencies that exist for a direct block also exist when 5371 * a new block is allocated and pointed to by an entry in a block of 5372 * indirect pointers. The undo/redo states described above are also 5373 * used here. Because an indirect block contains many pointers that 5374 * may have dependencies, a second copy of the entire in-memory indirect 5375 * block is kept. The buffer cache copy is always completely up-to-date. 5376 * The second copy, which is used only as a source for disk writes, 5377 * contains only the safe pointers (i.e., those that have no remaining 5378 * update dependencies). The second copy is freed when all pointers 5379 * are safe. The cache is not allowed to replace indirect blocks with 5380 * pending update dependencies. If a buffer containing an indirect 5381 * block with dependencies is written, these routines will mark it 5382 * dirty again. It can only be successfully written once all the 5383 * dependencies are removed. The ffs_fsync routine in conjunction with 5384 * softdep_sync_metadata work together to get all the dependencies 5385 * removed so that a file can be successfully written to disk. Three 5386 * procedures are used when setting up indirect block pointer 5387 * dependencies. The division is necessary because of the organization 5388 * of the "balloc" routine and because of the distinction between file 5389 * pages and file metadata blocks. 5390 */ 5391 5392 /* 5393 * Allocate a new allocindir structure. 
5394 */ 5395 static struct allocindir * 5396 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 5397 struct inode *ip; /* inode for file being extended */ 5398 int ptrno; /* offset of pointer in indirect block */ 5399 ufs2_daddr_t newblkno; /* disk block number being added */ 5400 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5401 ufs_lbn_t lbn; 5402 { 5403 struct newblk *newblk; 5404 struct allocindir *aip; 5405 struct freefrag *freefrag; 5406 struct jnewblk *jnewblk; 5407 5408 if (oldblkno) 5409 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 5410 else 5411 freefrag = NULL; 5412 ACQUIRE_LOCK(&lk); 5413 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 5414 panic("new_allocindir: lost block"); 5415 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5416 ("newallocindir: newblk already initialized")); 5417 newblk->nb_list.wk_type = D_ALLOCINDIR; 5418 newblk->nb_freefrag = freefrag; 5419 aip = (struct allocindir *)newblk; 5420 aip->ai_offset = ptrno; 5421 aip->ai_oldblkno = oldblkno; 5422 aip->ai_lbn = lbn; 5423 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5424 jnewblk->jn_ino = ip->i_number; 5425 jnewblk->jn_lbn = lbn; 5426 add_to_journal(&jnewblk->jn_list); 5427 } 5428 if (freefrag && freefrag->ff_jdep != NULL && 5429 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5430 add_to_journal(freefrag->ff_jdep); 5431 return (aip); 5432 } 5433 5434 /* 5435 * Called just before setting an indirect block pointer 5436 * to a newly allocated file page. 5437 */ 5438 void 5439 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5440 struct inode *ip; /* inode for file being extended */ 5441 ufs_lbn_t lbn; /* allocated block number within file */ 5442 struct buf *bp; /* buffer with indirect blk referencing page */ 5443 int ptrno; /* offset of pointer in indirect block */ 5444 ufs2_daddr_t newblkno; /* disk block number being added */ 5445 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5446 struct buf *nbp; /* buffer holding allocated page */ 5447 { 5448 struct inodedep *inodedep; 5449 struct freefrag *freefrag; 5450 struct allocindir *aip; 5451 struct pagedep *pagedep; 5452 struct mount *mp; 5453 int dflags; 5454 5455 if (lbn != nbp->b_lblkno) 5456 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5457 lbn, bp->b_lblkno); 5458 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5459 mp = UFSTOVFS(ip->i_ump); 5460 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5461 dflags = DEPALLOC; 5462 if (IS_SNAPSHOT(ip)) 5463 dflags |= NODELAY; 5464 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 5465 /* 5466 * If we are allocating a directory page, then we must 5467 * allocate an associated pagedep to track additions and 5468 * deletions. 5469 */ 5470 if ((ip->i_mode & IFMT) == IFDIR) 5471 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 5472 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5473 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5474 FREE_LOCK(&lk); 5475 if (freefrag) 5476 handle_workitem_freefrag(freefrag); 5477 } 5478 5479 /* 5480 * Called just before setting an indirect block pointer to a 5481 * newly allocated indirect block. 
5482 */ 5483 void 5484 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 5485 struct buf *nbp; /* newly allocated indirect block */ 5486 struct inode *ip; /* inode for file being extended */ 5487 struct buf *bp; /* indirect block referencing allocated block */ 5488 int ptrno; /* offset of pointer in indirect block */ 5489 ufs2_daddr_t newblkno; /* disk block number being added */ 5490 { 5491 struct inodedep *inodedep; 5492 struct allocindir *aip; 5493 ufs_lbn_t lbn; 5494 int dflags; 5495 5496 lbn = nbp->b_lblkno; 5497 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 5498 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 5499 dflags = DEPALLOC; 5500 if (IS_SNAPSHOT(ip)) 5501 dflags |= NODELAY; 5502 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); 5503 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5504 if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 5505 panic("softdep_setup_allocindir_meta: Block already existed"); 5506 FREE_LOCK(&lk); 5507 } 5508 5509 static void 5510 indirdep_complete(indirdep) 5511 struct indirdep *indirdep; 5512 { 5513 struct allocindir *aip; 5514 5515 LIST_REMOVE(indirdep, ir_next); 5516 indirdep->ir_state |= DEPCOMPLETE; 5517 5518 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5519 LIST_REMOVE(aip, ai_next); 5520 free_newblk(&aip->ai_block); 5521 } 5522 /* 5523 * If this indirdep is not attached to a buf it was simply waiting 5524 * on completion to clear completehd. free_indirdep() asserts 5525 * that nothing is dangling. 5526 */ 5527 if ((indirdep->ir_state & ONWORKLIST) == 0) 5528 free_indirdep(indirdep); 5529 } 5530 5531 static struct indirdep * 5532 indirdep_lookup(mp, ip, bp) 5533 struct mount *mp; 5534 struct inode *ip; 5535 struct buf *bp; 5536 { 5537 struct indirdep *indirdep, *newindirdep; 5538 struct newblk *newblk; 5539 struct worklist *wk; 5540 struct fs *fs; 5541 ufs2_daddr_t blkno; 5542 5543 mtx_assert(&lk, MA_OWNED); 5544 indirdep = NULL; 5545 newindirdep = NULL; 5546 fs = ip->i_fs; 5547 for (;;) { 5548 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5549 if (wk->wk_type != D_INDIRDEP) 5550 continue; 5551 indirdep = WK_INDIRDEP(wk); 5552 break; 5553 } 5554 /* Found on the buffer worklist, no new structure to free. */ 5555 if (indirdep != NULL && newindirdep == NULL) 5556 return (indirdep); 5557 if (indirdep != NULL && newindirdep != NULL) 5558 panic("indirdep_lookup: simultaneous create"); 5559 /* None found on the buffer and a new structure is ready. */ 5560 if (indirdep == NULL && newindirdep != NULL) 5561 break; 5562 /* None found and no new structure available. 
*/ 5563 FREE_LOCK(&lk); 5564 newindirdep = malloc(sizeof(struct indirdep), 5565 M_INDIRDEP, M_SOFTDEP_FLAGS); 5566 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5567 newindirdep->ir_state = ATTACHED; 5568 if (ip->i_ump->um_fstype == UFS1) 5569 newindirdep->ir_state |= UFS1FMT; 5570 TAILQ_INIT(&newindirdep->ir_trunc); 5571 newindirdep->ir_saveddata = NULL; 5572 LIST_INIT(&newindirdep->ir_deplisthd); 5573 LIST_INIT(&newindirdep->ir_donehd); 5574 LIST_INIT(&newindirdep->ir_writehd); 5575 LIST_INIT(&newindirdep->ir_completehd); 5576 if (bp->b_blkno == bp->b_lblkno) { 5577 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5578 NULL, NULL); 5579 bp->b_blkno = blkno; 5580 } 5581 newindirdep->ir_freeblks = NULL; 5582 newindirdep->ir_savebp = 5583 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5584 newindirdep->ir_bp = bp; 5585 BUF_KERNPROC(newindirdep->ir_savebp); 5586 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5587 ACQUIRE_LOCK(&lk); 5588 } 5589 indirdep = newindirdep; 5590 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 5591 /* 5592 * If the block is not yet allocated we don't set DEPCOMPLETE so 5593 * that we don't free dependencies until the pointers are valid. 5594 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather 5595 * than using the hash. 5596 */ 5597 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) 5598 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); 5599 else 5600 indirdep->ir_state |= DEPCOMPLETE; 5601 return (indirdep); 5602 } 5603 5604 /* 5605 * Called to finish the allocation of the "aip" allocated 5606 * by one of the two routines above. 5607 */ 5608 static struct freefrag * 5609 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 5610 struct buf *bp; /* in-memory copy of the indirect block */ 5611 struct inode *ip; /* inode for file being extended */ 5612 struct inodedep *inodedep; /* Inodedep for ip */ 5613 struct allocindir *aip; /* allocindir allocated by the above routines */ 5614 ufs_lbn_t lbn; /* Logical block number for this block. */ 5615 { 5616 struct fs *fs; 5617 struct indirdep *indirdep; 5618 struct allocindir *oldaip; 5619 struct freefrag *freefrag; 5620 struct mount *mp; 5621 5622 mtx_assert(&lk, MA_OWNED); 5623 mp = UFSTOVFS(ip->i_ump); 5624 fs = ip->i_fs; 5625 if (bp->b_lblkno >= 0) 5626 panic("setup_allocindir_phase2: not indir blk"); 5627 KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), 5628 ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); 5629 indirdep = indirdep_lookup(mp, ip, bp); 5630 KASSERT(indirdep->ir_savebp != NULL, 5631 ("setup_allocindir_phase2 NULL ir_savebp")); 5632 aip->ai_indirdep = indirdep; 5633 /* 5634 * Check for an unwritten dependency for this indirect offset. If 5635 * there is, merge the old dependency into the new one. This happens 5636 * as a result of reallocblk only. 5637 */ 5638 freefrag = NULL; 5639 if (aip->ai_oldblkno != 0) { 5640 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { 5641 if (oldaip->ai_offset == aip->ai_offset) { 5642 freefrag = allocindir_merge(aip, oldaip); 5643 goto done; 5644 } 5645 } 5646 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { 5647 if (oldaip->ai_offset == aip->ai_offset) { 5648 freefrag = allocindir_merge(aip, oldaip); 5649 goto done; 5650 } 5651 } 5652 } 5653 done: 5654 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5655 return (freefrag); 5656 } 5657 5658 /* 5659 * Merge two allocindirs which refer to the same block. 
Move newblock 5660 * dependencies and setup the freefrags appropriately. 5661 */ 5662 static struct freefrag * 5663 allocindir_merge(aip, oldaip) 5664 struct allocindir *aip; 5665 struct allocindir *oldaip; 5666 { 5667 struct freefrag *freefrag; 5668 struct worklist *wk; 5669 5670 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5671 panic("allocindir_merge: blkno"); 5672 aip->ai_oldblkno = oldaip->ai_oldblkno; 5673 freefrag = aip->ai_freefrag; 5674 aip->ai_freefrag = oldaip->ai_freefrag; 5675 oldaip->ai_freefrag = NULL; 5676 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5677 /* 5678 * If we are tracking a new directory-block allocation, 5679 * move it from the old allocindir to the new allocindir. 5680 */ 5681 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5682 WORKLIST_REMOVE(wk); 5683 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5684 panic("allocindir_merge: extra newdirblk"); 5685 WORKLIST_INSERT(&aip->ai_newdirblk, wk); 5686 } 5687 /* 5688 * We can skip journaling for this freefrag and just complete 5689 * any pending journal work for the allocindir that is being 5690 * removed after the freefrag completes. 5691 */ 5692 if (freefrag->ff_jdep) 5693 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 5694 LIST_REMOVE(oldaip, ai_next); 5695 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 5696 &freefrag->ff_list, &freefrag->ff_jwork); 5697 free_newblk(&oldaip->ai_block); 5698 5699 return (freefrag); 5700 } 5701 5702 static inline void 5703 setup_freedirect(freeblks, ip, i, needj) 5704 struct freeblks *freeblks; 5705 struct inode *ip; 5706 int i; 5707 int needj; 5708 { 5709 ufs2_daddr_t blkno; 5710 int frags; 5711 5712 blkno = DIP(ip, i_db[i]); 5713 if (blkno == 0) 5714 return; 5715 DIP_SET(ip, i_db[i], 0); 5716 frags = sblksize(ip->i_fs, ip->i_size, i); 5717 frags = numfrags(ip->i_fs, frags); 5718 newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); 5719 } 5720 5721 static inline void 5722 setup_freeext(freeblks, ip, i, needj) 5723 struct freeblks *freeblks; 5724 struct inode *ip; 5725 int i; 5726 int needj; 5727 { 5728 ufs2_daddr_t blkno; 5729 int frags; 5730 5731 blkno = ip->i_din2->di_extb[i]; 5732 if (blkno == 0) 5733 return; 5734 ip->i_din2->di_extb[i] = 0; 5735 frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); 5736 frags = numfrags(ip->i_fs, frags); 5737 newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); 5738 } 5739 5740 static inline void 5741 setup_freeindir(freeblks, ip, i, lbn, needj) 5742 struct freeblks *freeblks; 5743 struct inode *ip; 5744 int i; 5745 ufs_lbn_t lbn; 5746 int needj; 5747 { 5748 ufs2_daddr_t blkno; 5749 5750 blkno = DIP(ip, i_ib[i]); 5751 if (blkno == 0) 5752 return; 5753 DIP_SET(ip, i_ib[i], 0); 5754 newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 5755 0, needj); 5756 } 5757 5758 static inline struct freeblks * 5759 newfreeblks(mp, ip) 5760 struct mount *mp; 5761 struct inode *ip; 5762 { 5763 struct freeblks *freeblks; 5764 5765 freeblks = malloc(sizeof(struct freeblks), 5766 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5767 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5768 LIST_INIT(&freeblks->fb_jblkdephd); 5769 LIST_INIT(&freeblks->fb_jwork); 5770 freeblks->fb_ref = 0; 5771 freeblks->fb_cgwait = 0; 5772 freeblks->fb_state = ATTACHED; 5773 freeblks->fb_uid = ip->i_uid; 5774 freeblks->fb_inum = ip->i_number; 5775 freeblks->fb_vtype = ITOV(ip)->v_type; 5776 freeblks->fb_modrev = DIP(ip, i_modrev); 5777 freeblks->fb_devvp = ip->i_devvp; 5778 
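/* fb_chkcnt and fb_len start out zero; the callers fill them in once the released block count and the new file length are known. */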
freeblks->fb_chkcnt = 0; 5779 freeblks->fb_len = 0; 5780 5781 return (freeblks); 5782 } 5783 5784 static void 5785 trunc_indirdep(indirdep, freeblks, bp, off) 5786 struct indirdep *indirdep; 5787 struct freeblks *freeblks; 5788 struct buf *bp; 5789 int off; 5790 { 5791 struct allocindir *aip, *aipn; 5792 5793 /* 5794 * The first set of allocindirs won't be in savedbp. 5795 */ 5796 LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) 5797 if (aip->ai_offset > off) 5798 cancel_allocindir(aip, bp, freeblks, 1); 5799 LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) 5800 if (aip->ai_offset > off) 5801 cancel_allocindir(aip, bp, freeblks, 1); 5802 /* 5803 * These will exist in savedbp. 5804 */ 5805 LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) 5806 if (aip->ai_offset > off) 5807 cancel_allocindir(aip, NULL, freeblks, 0); 5808 LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) 5809 if (aip->ai_offset > off) 5810 cancel_allocindir(aip, NULL, freeblks, 0); 5811 } 5812 5813 /* 5814 * Follow the chain of indirects down to lastlbn creating a freework 5815 * structure for each. This will be used to start indir_trunc() at 5816 * the right offset and create the journal records for the partial 5817 * truncation. A second step will handle the truncated dependencies. 5818 */ 5819 static int 5820 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) 5821 struct freeblks *freeblks; 5822 struct inode *ip; 5823 ufs_lbn_t lbn; 5824 ufs_lbn_t lastlbn; 5825 ufs2_daddr_t blkno; 5826 { 5827 struct indirdep *indirdep; 5828 struct indirdep *indirn; 5829 struct freework *freework; 5830 struct newblk *newblk; 5831 struct mount *mp; 5832 struct buf *bp; 5833 uint8_t *start; 5834 uint8_t *end; 5835 ufs_lbn_t lbnadd; 5836 int level; 5837 int error; 5838 int off; 5839 5840 5841 freework = NULL; 5842 if (blkno == 0) 5843 return (0); 5844 mp = freeblks->fb_list.wk_mp; 5845 bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); 5846 if ((bp->b_flags & B_CACHE) == 0) { 5847 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); 5848 bp->b_iocmd = BIO_READ; 5849 bp->b_flags &= ~B_INVAL; 5850 bp->b_ioflags &= ~BIO_ERROR; 5851 vfs_busy_pages(bp, 0); 5852 bp->b_iooffset = dbtob(bp->b_blkno); 5853 bstrategy(bp); 5854 curthread->td_ru.ru_inblock++; 5855 error = bufwait(bp); 5856 if (error) { 5857 brelse(bp); 5858 return (error); 5859 } 5860 } 5861 level = lbn_level(lbn); 5862 lbnadd = lbn_offset(ip->i_fs, level); 5863 /* 5864 * Compute the offset of the last block we want to keep. Store 5865 * in the freework the first block we want to completely free. 5866 */ 5867 off = (lastlbn - -(lbn + level)) / lbnadd; 5868 if (off + 1 == NINDIR(ip->i_fs)) 5869 goto nowork; 5870 freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1, 5871 0); 5872 /* 5873 * Link the freework into the indirdep. This will prevent any new 5874 * allocations from proceeding until we are finished with the 5875 * truncate and the block is written. 5876 */ 5877 ACQUIRE_LOCK(&lk); 5878 indirdep = indirdep_lookup(mp, ip, bp); 5879 if (indirdep->ir_freeblks) 5880 panic("setup_trunc_indir: indirdep already truncated."); 5881 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); 5882 freework->fw_indir = indirdep; 5883 /* 5884 * Cancel any allocindirs that will not make it to disk. 5885 * We have to do this for all copies of the indirdep that 5886 * live on this newblk.
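 * When DEPCOMPLETE is not yet set the newblk is still hashed and carries its nb_indirdeps list, so every indirdep on that list is truncated; otherwise only this indirdep exists.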
5887 */ 5888 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 5889 newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); 5890 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) 5891 trunc_indirdep(indirn, freeblks, bp, off); 5892 } else 5893 trunc_indirdep(indirdep, freeblks, bp, off); 5894 FREE_LOCK(&lk); 5895 /* 5896 * Creation is protected by the buf lock. The saveddata is only 5897 * needed if a full truncation follows a partial truncation but it 5898 * is difficult to allocate in that case so we fetch it anyway. 5899 */ 5900 if (indirdep->ir_saveddata == NULL) 5901 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 5902 M_SOFTDEP_FLAGS); 5903 nowork: 5904 /* Fetch the blkno of the child and the zero start offset. */ 5905 if (ip->i_ump->um_fstype == UFS1) { 5906 blkno = ((ufs1_daddr_t *)bp->b_data)[off]; 5907 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; 5908 } else { 5909 blkno = ((ufs2_daddr_t *)bp->b_data)[off]; 5910 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; 5911 } 5912 if (freework) { 5913 /* Zero the truncated pointers. */ 5914 end = bp->b_data + bp->b_bcount; 5915 bzero(start, end - start); 5916 bdwrite(bp); 5917 } else 5918 bqrelse(bp); 5919 if (level == 0) 5920 return (0); 5921 lbn++; /* adjust level */ 5922 lbn -= (off * lbnadd); 5923 return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); 5924 } 5925 5926 /* 5927 * Complete the partial truncation of an indirect block set up by 5928 * setup_trunc_indir(). This zeros the truncated pointers in the saved 5929 * copy and writes them to disk before the freeblks is allowed to complete. 5930 */ 5931 static void 5932 complete_trunc_indir(freework) 5933 struct freework *freework; 5934 { 5935 struct freework *fwn; 5936 struct indirdep *indirdep; 5937 struct buf *bp; 5938 uintptr_t start; 5939 int count; 5940 5941 indirdep = freework->fw_indir; 5942 for (;;) { 5943 bp = indirdep->ir_bp; 5944 /* See if the block was discarded. */ 5945 if (bp == NULL) 5946 break; 5947 /* Inline part of getdirtybuf(). We don't want bremfree. */ 5948 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) 5949 break; 5950 if (BUF_LOCK(bp, 5951 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0) 5952 BUF_UNLOCK(bp); 5953 ACQUIRE_LOCK(&lk); 5954 } 5955 mtx_assert(&lk, MA_OWNED); 5956 freework->fw_state |= DEPCOMPLETE; 5957 TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); 5958 /* 5959 * Zero the pointers in the saved copy. 5960 */ 5961 if (indirdep->ir_state & UFS1FMT) 5962 start = sizeof(ufs1_daddr_t); 5963 else 5964 start = sizeof(ufs2_daddr_t); 5965 start *= freework->fw_start; 5966 count = indirdep->ir_savebp->b_bcount - start; 5967 start += (uintptr_t)indirdep->ir_savebp->b_data; 5968 bzero((char *)start, count); 5969 /* 5970 * We need to start the next truncation in the list if it has not 5971 * been started yet. 5972 */ 5973 fwn = TAILQ_FIRST(&indirdep->ir_trunc); 5974 if (fwn != NULL) { 5975 if (fwn->fw_freeblks == indirdep->ir_freeblks) 5976 TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); 5977 if ((fwn->fw_state & ONWORKLIST) == 0) 5978 freework_enqueue(fwn); 5979 } 5980 /* 5981 * If bp is NULL the block was fully truncated; restore 5982 * the saved block list, otherwise free it if it is no 5983 * longer needed.
5984 */ 5985 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 5986 if (bp == NULL) 5987 bcopy(indirdep->ir_saveddata, 5988 indirdep->ir_savebp->b_data, 5989 indirdep->ir_savebp->b_bcount); 5990 free(indirdep->ir_saveddata, M_INDIRDEP); 5991 indirdep->ir_saveddata = NULL; 5992 } 5993 /* 5994 * When bp is NULL there is a full truncation pending. We 5995 * must wait for this full truncation to be journaled before 5996 * we can release this freework because the disk pointers will 5997 * never be written as zero. 5998 */ 5999 if (bp == NULL) { 6000 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) 6001 handle_written_freework(freework); 6002 else 6003 WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, 6004 &freework->fw_list); 6005 } else { 6006 /* Complete when the real copy is written. */ 6007 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); 6008 BUF_UNLOCK(bp); 6009 } 6010 } 6011 6012 /* 6013 * Calculate the number of blocks we are going to release where datablocks 6014 * is the current total and length is the new file size. 6015 */ 6016 ufs2_daddr_t 6017 blkcount(fs, datablocks, length) 6018 struct fs *fs; 6019 ufs2_daddr_t datablocks; 6020 off_t length; 6021 { 6022 off_t totblks, numblks; 6023 6024 totblks = 0; 6025 numblks = howmany(length, fs->fs_bsize); 6026 if (numblks <= NDADDR) { 6027 totblks = howmany(length, fs->fs_fsize); 6028 goto out; 6029 } 6030 totblks = blkstofrags(fs, numblks); 6031 numblks -= NDADDR; 6032 /* 6033 * Count all single, then double, then triple indirects required. 6034 * Subtracting one indirects worth of blocks for each pass 6035 * acknowledges one of each pointed to by the inode. 6036 */ 6037 for (;;) { 6038 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); 6039 numblks -= NINDIR(fs); 6040 if (numblks <= 0) 6041 break; 6042 numblks = howmany(numblks, NINDIR(fs)); 6043 } 6044 out: 6045 totblks = fsbtodb(fs, totblks); 6046 /* 6047 * Handle sparse files. We can't reclaim more blocks than the inode 6048 * references. We will correct it later in handle_complete_freeblks() 6049 * when we know the real count. 6050 */ 6051 if (totblks > datablocks) 6052 return (0); 6053 return (datablocks - totblks); 6054 } 6055 6056 /* 6057 * Handle freeblocks for journaled softupdate filesystems. 6058 * 6059 * Contrary to normal softupdates, we must preserve the block pointers in 6060 * indirects until their subordinates are free. This is to avoid journaling 6061 * every block that is freed which may consume more space than the journal 6062 * itself. The recovery program will see the free block journals at the 6063 * base of the truncated area and traverse them to reclaim space. The 6064 * pointers in the inode may be cleared immediately after the journal 6065 * records are written because each direct and indirect pointer in the 6066 * inode is recorded in a journal. This permits full truncation to proceed 6067 * asynchronously. The write order is journal -> inode -> cgs -> indirects. 6068 * 6069 * The algorithm is as follows: 6070 * 1) Traverse the in-memory state and create journal entries to release 6071 * the relevant blocks and full indirect trees. 6072 * 2) Traverse the indirect block chain adding partial truncation freework 6073 * records to indirects in the path to lastlbn. The freework will 6074 * prevent new allocation dependencies from being satisfied in this 6075 * indirect until the truncation completes. 6076 * 3) Read and lock the inode block, performing an update with the new size 6077 * and pointers. 
This prevents truncated data from becoming valid on 6078 * disk through step 4. 6079 * 4) Reap unsatisfied dependencies that are beyond the truncated area, 6080 * eliminate journal work for those records that do not require it. 6081 * 5) Schedule the journal records to be written followed by the inode block. 6082 * 6) Allocate any necessary frags for the end of file. 6083 * 7) Zero any partially truncated blocks. 6084 * 6085 * From this truncation proceeds asynchronously using the freework and 6086 * indir_trunc machinery. The file will not be extended again into a 6087 * partially truncated indirect block until all work is completed but 6088 * the normal dependency mechanism ensures that it is rolled back/forward 6089 * as appropriate. Further truncation may occur without delay and is 6090 * serialized in indir_trunc(). 6091 */ 6092 void 6093 softdep_journal_freeblocks(ip, cred, length, flags) 6094 struct inode *ip; /* The inode whose length is to be reduced */ 6095 struct ucred *cred; 6096 off_t length; /* The new length for the file */ 6097 int flags; /* IO_EXT and/or IO_NORMAL */ 6098 { 6099 struct freeblks *freeblks, *fbn; 6100 struct inodedep *inodedep; 6101 struct jblkdep *jblkdep; 6102 struct allocdirect *adp, *adpn; 6103 struct fs *fs; 6104 struct buf *bp; 6105 struct vnode *vp; 6106 struct mount *mp; 6107 ufs2_daddr_t extblocks, datablocks; 6108 ufs_lbn_t tmpval, lbn, lastlbn; 6109 int frags, lastoff, iboff, allocblock, needj, dflags, error, i; 6110 6111 fs = ip->i_fs; 6112 mp = UFSTOVFS(ip->i_ump); 6113 vp = ITOV(ip); 6114 needj = 1; 6115 iboff = -1; 6116 allocblock = 0; 6117 extblocks = 0; 6118 datablocks = 0; 6119 frags = 0; 6120 freeblks = newfreeblks(mp, ip); 6121 ACQUIRE_LOCK(&lk); 6122 /* 6123 * If we're truncating a removed file that will never be written 6124 * we don't need to journal the block frees. The canceled journals 6125 * for the allocations will suffice. 6126 */ 6127 dflags = DEPALLOC; 6128 if (IS_SNAPSHOT(ip)) 6129 dflags |= NODELAY; 6130 inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6131 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && 6132 length == 0) 6133 needj = 0; 6134 FREE_LOCK(&lk); 6135 /* 6136 * Calculate the lbn that we are truncating to. This results in -1 6137 * if we're truncating the 0 bytes. So it is the last lbn we want 6138 * to keep, not the first lbn we want to truncate. 6139 */ 6140 lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; 6141 lastoff = blkoff(fs, length); 6142 /* 6143 * Compute frags we are keeping in lastlbn. 0 means all. 6144 */ 6145 if (lastlbn >= 0 && lastlbn < NDADDR) { 6146 frags = fragroundup(fs, lastoff); 6147 /* adp offset of last valid allocdirect. */ 6148 iboff = lastlbn; 6149 } else if (lastlbn > 0) 6150 iboff = NDADDR; 6151 if (fs->fs_magic == FS_UFS2_MAGIC) 6152 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6153 /* 6154 * Handle normal data blocks and indirects. This section saves 6155 * values used after the inode update to complete frag and indirect 6156 * truncation. 6157 */ 6158 if ((flags & IO_NORMAL) != 0) { 6159 /* 6160 * Handle truncation of whole direct and indirect blocks. 6161 */ 6162 for (i = iboff + 1; i < NDADDR; i++) 6163 setup_freedirect(freeblks, ip, i, needj); 6164 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6165 i++, lbn += tmpval, tmpval *= NINDIR(fs)) { 6166 /* Release a whole indirect tree. 
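 * Indirect trees are identified by negative lbns; the level i tree rooted over the file blocks beginning at lbn is passed as -lbn - i.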
*/ 6167 if (lbn > lastlbn) { 6168 setup_freeindir(freeblks, ip, i, -lbn -i, 6169 needj); 6170 continue; 6171 } 6172 iboff = i + NDADDR; 6173 /* 6174 * Traverse partially truncated indirect tree. 6175 */ 6176 if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) 6177 setup_trunc_indir(freeblks, ip, -lbn - i, 6178 lastlbn, DIP(ip, i_ib[i])); 6179 } 6180 /* 6181 * Handle partial truncation to a frag boundary. 6182 */ 6183 if (frags) { 6184 ufs2_daddr_t blkno; 6185 long oldfrags; 6186 6187 oldfrags = blksize(fs, ip, lastlbn); 6188 blkno = DIP(ip, i_db[lastlbn]); 6189 if (blkno && oldfrags != frags) { 6190 oldfrags -= frags; 6191 oldfrags = numfrags(ip->i_fs, oldfrags); 6192 blkno += numfrags(ip->i_fs, frags); 6193 newfreework(ip->i_ump, freeblks, NULL, lastlbn, 6194 blkno, oldfrags, 0, needj); 6195 } else if (blkno == 0) 6196 allocblock = 1; 6197 } 6198 /* 6199 * Add a journal record for partial truncate if we are 6200 * handling indirect blocks. Non-indirects need no extra 6201 * journaling. 6202 */ 6203 if (length != 0 && lastlbn >= NDADDR) { 6204 ip->i_flag |= IN_TRUNCATED; 6205 newjtrunc(freeblks, length, 0); 6206 } 6207 ip->i_size = length; 6208 DIP_SET(ip, i_size, ip->i_size); 6209 datablocks = DIP(ip, i_blocks) - extblocks; 6210 if (length != 0) 6211 datablocks = blkcount(ip->i_fs, datablocks, length); 6212 freeblks->fb_len = length; 6213 } 6214 if ((flags & IO_EXT) != 0) { 6215 for (i = 0; i < NXADDR; i++) 6216 setup_freeext(freeblks, ip, i, needj); 6217 ip->i_din2->di_extsize = 0; 6218 datablocks += extblocks; 6219 } 6220 #ifdef QUOTA 6221 /* Reference the quotas in case the block count is wrong in the end. */ 6222 quotaref(vp, freeblks->fb_quota); 6223 (void) chkdq(ip, -datablocks, NOCRED, 0); 6224 #endif 6225 freeblks->fb_chkcnt = -datablocks; 6226 UFS_LOCK(ip->i_ump); 6227 fs->fs_pendingblocks += datablocks; 6228 UFS_UNLOCK(ip->i_ump); 6229 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6230 /* 6231 * Handle truncation of incomplete alloc direct dependencies. We 6232 * hold the inode block locked to prevent incomplete dependencies 6233 * from reaching the disk while we are eliminating those that 6234 * have been truncated. This is a partially inlined ffs_update(). 6235 */ 6236 ufs_itimes(vp); 6237 ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); 6238 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6239 (int)fs->fs_bsize, cred, &bp); 6240 if (error) { 6241 brelse(bp); 6242 softdep_error("softdep_journal_freeblocks", error); 6243 return; 6244 } 6245 if (bp->b_bufsize == fs->fs_bsize) 6246 bp->b_flags |= B_CLUSTEROK; 6247 softdep_update_inodeblock(ip, bp, 0); 6248 if (ip->i_ump->um_fstype == UFS1) 6249 *((struct ufs1_dinode *)bp->b_data + 6250 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 6251 else 6252 *((struct ufs2_dinode *)bp->b_data + 6253 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 6254 ACQUIRE_LOCK(&lk); 6255 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6256 if ((inodedep->id_state & IOSTARTED) != 0) 6257 panic("softdep_setup_freeblocks: inode busy"); 6258 /* 6259 * Add the freeblks structure to the list of operations that 6260 * must await the zero'ed inode being written to disk. If we 6261 * still have a bitmap dependency (needj), then the inode 6262 * has never been written to disk, so we can process the 6263 * freeblks below once we have deleted the dependencies. 
6264 */ 6265 if (needj) 6266 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6267 else 6268 freeblks->fb_state |= COMPLETE; 6269 if ((flags & IO_NORMAL) != 0) { 6270 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { 6271 if (adp->ad_offset > iboff) 6272 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6273 freeblks); 6274 /* 6275 * Truncate the allocdirect. We could eliminate 6276 * or modify journal records as well. 6277 */ 6278 else if (adp->ad_offset == iboff && frags) 6279 adp->ad_newsize = frags; 6280 } 6281 } 6282 if ((flags & IO_EXT) != 0) 6283 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6284 cancel_allocdirect(&inodedep->id_extupdt, adp, 6285 freeblks); 6286 /* 6287 * Add journal work. 6288 */ 6289 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) 6290 add_to_journal(&jblkdep->jb_list); 6291 FREE_LOCK(&lk); 6292 bdwrite(bp); 6293 /* 6294 * Truncate dependency structures beyond length. 6295 */ 6296 trunc_dependencies(ip, freeblks, lastlbn, frags, flags); 6297 /* 6298 * This is only set when we need to allocate a fragment because 6299 * none existed at the end of a frag-sized file. It handles only 6300 * allocating a new, zero filled block. 6301 */ 6302 if (allocblock) { 6303 ip->i_size = length - lastoff; 6304 DIP_SET(ip, i_size, ip->i_size); 6305 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); 6306 if (error != 0) { 6307 softdep_error("softdep_journal_freeblks", error); 6308 return; 6309 } 6310 ip->i_size = length; 6311 DIP_SET(ip, i_size, length); 6312 ip->i_flag |= IN_CHANGE | IN_UPDATE; 6313 allocbuf(bp, frags); 6314 ffs_update(vp, MNT_NOWAIT); 6315 bawrite(bp); 6316 } else if (lastoff != 0 && vp->v_type != VDIR) { 6317 int size; 6318 6319 /* 6320 * Zero the end of a truncated frag or block. 6321 */ 6322 size = sblksize(fs, length, lastlbn); 6323 error = bread(vp, lastlbn, size, cred, &bp); 6324 if (error) { 6325 softdep_error("softdep_journal_freeblks", error); 6326 return; 6327 } 6328 bzero((char *)bp->b_data + lastoff, size - lastoff); 6329 bawrite(bp); 6330 6331 } 6332 ACQUIRE_LOCK(&lk); 6333 inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6334 TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); 6335 freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; 6336 /* 6337 * We zero earlier truncations so they don't erroneously 6338 * update i_blocks. 6339 */ 6340 if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) 6341 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) 6342 fbn->fb_len = 0; 6343 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && 6344 LIST_EMPTY(&freeblks->fb_jblkdephd)) 6345 freeblks->fb_state |= INPROGRESS; 6346 else 6347 freeblks = NULL; 6348 FREE_LOCK(&lk); 6349 if (freeblks) 6350 handle_workitem_freeblocks(freeblks, 0); 6351 trunc_pages(ip, length, extblocks, flags); 6352 6353 } 6354 6355 /* 6356 * Flush a JOP_SYNC to the journal. 6357 */ 6358 void 6359 softdep_journal_fsync(ip) 6360 struct inode *ip; 6361 { 6362 struct jfsync *jfsync; 6363 6364 if ((ip->i_flag & IN_TRUNCATED) == 0) 6365 return; 6366 ip->i_flag &= ~IN_TRUNCATED; 6367 jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); 6368 workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); 6369 jfsync->jfs_size = ip->i_size; 6370 jfsync->jfs_ino = ip->i_number; 6371 ACQUIRE_LOCK(&lk); 6372 add_to_journal(&jfsync->jfs_list); 6373 jwait(&jfsync->jfs_list, MNT_WAIT); 6374 FREE_LOCK(&lk); 6375 } 6376 6377 /* 6378 * Block de-allocation dependencies. 
6379 * 6380 * When blocks are de-allocated, the on-disk pointers must be nullified before 6381 * the blocks are made available for use by other files. (The true 6382 * requirement is that old pointers must be nullified before new on-disk 6383 * pointers are set. We chose this slightly more stringent requirement to 6384 * reduce complexity.) Our implementation handles this dependency by updating 6385 * the inode (or indirect block) appropriately but delaying the actual block 6386 * de-allocation (i.e., freemap and free space count manipulation) until 6387 * after the updated versions reach stable storage. After the disk is 6388 * updated, the blocks can be safely de-allocated whenever it is convenient. 6389 * This implementation handles only the common case of reducing a file's 6390 * length to zero. Other cases are handled by the conventional synchronous 6391 * write approach. 6392 * 6393 * The ffs implementation with which we worked double-checks 6394 * the state of the block pointers and file size as it reduces 6395 * a file's length. Some of this code is replicated here in our 6396 * soft updates implementation. The freeblks->fb_chkcnt field is 6397 * used to transfer a part of this information to the procedure 6398 * that eventually de-allocates the blocks. 6399 * 6400 * This routine should be called from the routine that shortens 6401 * a file's length, before the inode's size or block pointers 6402 * are modified. It will save the block pointer information for 6403 * later release and zero the inode so that the calling routine 6404 * can release it. 6405 */ 6406 void 6407 softdep_setup_freeblocks(ip, length, flags) 6408 struct inode *ip; /* The inode whose length is to be reduced */ 6409 off_t length; /* The new length for the file */ 6410 int flags; /* IO_EXT and/or IO_NORMAL */ 6411 { 6412 struct ufs1_dinode *dp1; 6413 struct ufs2_dinode *dp2; 6414 struct freeblks *freeblks; 6415 struct inodedep *inodedep; 6416 struct allocdirect *adp; 6417 struct buf *bp; 6418 struct fs *fs; 6419 ufs2_daddr_t extblocks, datablocks; 6420 struct mount *mp; 6421 int i, delay, error, dflags; 6422 ufs_lbn_t tmpval; 6423 ufs_lbn_t lbn; 6424 6425 fs = ip->i_fs; 6426 mp = UFSTOVFS(ip->i_ump); 6427 if (length != 0) 6428 panic("softdep_setup_freeblocks: non-zero length"); 6429 freeblks = newfreeblks(mp, ip); 6430 extblocks = 0; 6431 datablocks = 0; 6432 if (fs->fs_magic == FS_UFS2_MAGIC) 6433 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6434 if ((flags & IO_NORMAL) != 0) { 6435 for (i = 0; i < NDADDR; i++) 6436 setup_freedirect(freeblks, ip, i, 0); 6437 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6438 i++, lbn += tmpval, tmpval *= NINDIR(fs)) 6439 setup_freeindir(freeblks, ip, i, -lbn - i, 0); 6440 ip->i_size = 0; 6441 DIP_SET(ip, i_size, 0); 6442 datablocks = DIP(ip, i_blocks) - extblocks; 6443 } 6444 if ((flags & IO_EXT) != 0) { 6445 for (i = 0; i < NXADDR; i++) 6446 setup_freeext(freeblks, ip, i, 0); 6447 ip->i_din2->di_extsize = 0; 6448 datablocks += extblocks; 6449 } 6450 #ifdef QUOTA 6451 /* Reference the quotas in case the block count is wrong in the end. */ 6452 quotaref(ITOV(ip), freeblks->fb_quota); 6453 (void) chkdq(ip, -datablocks, NOCRED, 0); 6454 #endif 6455 freeblks->fb_chkcnt = -datablocks; 6456 UFS_LOCK(ip->i_ump); 6457 fs->fs_pendingblocks += datablocks; 6458 UFS_UNLOCK(ip->i_ump); 6459 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6460 /* 6461 * Push the zero'ed inode to its disk buffer so that we are free 6462 * to delete its dependencies below.
Once the dependencies are gone 6463 * the buffer can be safely released. 6464 */ 6465 if ((error = bread(ip->i_devvp, 6466 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6467 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 6468 brelse(bp); 6469 softdep_error("softdep_setup_freeblocks", error); 6470 } 6471 if (ip->i_ump->um_fstype == UFS1) { 6472 dp1 = ((struct ufs1_dinode *)bp->b_data + 6473 ino_to_fsbo(fs, ip->i_number)); 6474 ip->i_din1->di_freelink = dp1->di_freelink; 6475 *dp1 = *ip->i_din1; 6476 } else { 6477 dp2 = ((struct ufs2_dinode *)bp->b_data + 6478 ino_to_fsbo(fs, ip->i_number)); 6479 ip->i_din2->di_freelink = dp2->di_freelink; 6480 *dp2 = *ip->i_din2; 6481 } 6482 /* 6483 * Find and eliminate any inode dependencies. 6484 */ 6485 ACQUIRE_LOCK(&lk); 6486 dflags = DEPALLOC; 6487 if (IS_SNAPSHOT(ip)) 6488 dflags |= NODELAY; 6489 (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); 6490 if ((inodedep->id_state & IOSTARTED) != 0) 6491 panic("softdep_setup_freeblocks: inode busy"); 6492 /* 6493 * Add the freeblks structure to the list of operations that 6494 * must await the zero'ed inode being written to disk. If we 6495 * still have a bitmap dependency (delay == 0), then the inode 6496 * has never been written to disk, so we can process the 6497 * freeblks below once we have deleted the dependencies. 6498 */ 6499 delay = (inodedep->id_state & DEPCOMPLETE); 6500 if (delay) 6501 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6502 else 6503 freeblks->fb_state |= COMPLETE; 6504 /* 6505 * Because the file length has been truncated to zero, any 6506 * pending block allocation dependency structures associated 6507 * with this inode are obsolete and can simply be de-allocated. 6508 * We must first merge the two dependency lists to get rid of 6509 * any duplicate freefrag structures, then purge the merged list. 6510 * If we still have a bitmap dependency, then the inode has never 6511 * been written to disk, so we can free any fragments without delay. 6512 */ 6513 if (flags & IO_NORMAL) { 6514 merge_inode_lists(&inodedep->id_newinoupdt, 6515 &inodedep->id_inoupdt); 6516 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 6517 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6518 freeblks); 6519 } 6520 if (flags & IO_EXT) { 6521 merge_inode_lists(&inodedep->id_newextupdt, 6522 &inodedep->id_extupdt); 6523 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6524 cancel_allocdirect(&inodedep->id_extupdt, adp, 6525 freeblks); 6526 } 6527 FREE_LOCK(&lk); 6528 bdwrite(bp); 6529 trunc_dependencies(ip, freeblks, -1, 0, flags); 6530 ACQUIRE_LOCK(&lk); 6531 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 6532 (void) free_inodedep(inodedep); 6533 freeblks->fb_state |= DEPCOMPLETE; 6534 /* 6535 * If the inode with zeroed block pointers is now on disk 6536 * we can start freeing blocks. 6537 */ 6538 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 6539 freeblks->fb_state |= INPROGRESS; 6540 else 6541 freeblks = NULL; 6542 FREE_LOCK(&lk); 6543 if (freeblks) 6544 handle_workitem_freeblocks(freeblks, 0); 6545 trunc_pages(ip, length, extblocks, flags); 6546 } 6547 6548 /* 6549 * Eliminate pages from the page cache that back parts of this inode and 6550 * adjust the vnode pager's idea of our size. This prevents stale data 6551 * from hanging around in the page cache. 
6552 */ 6553 static void 6554 trunc_pages(ip, length, extblocks, flags) 6555 struct inode *ip; 6556 off_t length; 6557 ufs2_daddr_t extblocks; 6558 int flags; 6559 { 6560 struct vnode *vp; 6561 struct fs *fs; 6562 ufs_lbn_t lbn; 6563 off_t end, extend; 6564 6565 vp = ITOV(ip); 6566 fs = ip->i_fs; 6567 extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); 6568 if ((flags & IO_EXT) != 0) 6569 vn_pages_remove(vp, extend, 0); 6570 if ((flags & IO_NORMAL) == 0) 6571 return; 6572 BO_LOCK(&vp->v_bufobj); 6573 drain_output(vp); 6574 BO_UNLOCK(&vp->v_bufobj); 6575 /* 6576 * The vnode pager eliminates file pages; we eliminate indirects 6577 * below. 6578 */ 6579 vnode_pager_setsize(vp, length); 6580 /* 6581 * Calculate the end based on the last indirect we want to keep. If 6582 * the block extends into indirects we can just use the negative of 6583 * its lbn. Doubles and triples exist at lower numbers so we must 6584 * be careful not to remove those, if they exist. Double and triple 6585 * indirect lbns do not overlap with others so it is not important 6586 * to verify how many levels are required. 6587 */ 6588 lbn = lblkno(fs, length); 6589 if (lbn >= NDADDR) { 6590 /* Calculate the virtual lbn of the triple indirect. */ 6591 lbn = -lbn - (NIADDR - 1); 6592 end = OFF_TO_IDX(lblktosize(fs, lbn)); 6593 } else 6594 end = extend; 6595 vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); 6596 } 6597 6598 /* 6599 * See if the buf bp is in the range eliminated by truncation. 6600 */ 6601 static int 6602 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) 6603 struct buf *bp; 6604 int *blkoffp; 6605 ufs_lbn_t lastlbn; 6606 int lastoff; 6607 int flags; 6608 { 6609 ufs_lbn_t lbn; 6610 6611 *blkoffp = 0; 6612 /* Only match ext/normal blocks as appropriate. */ 6613 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 6614 ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) 6615 return (0); 6616 /* ALTDATA is always a full truncation. */ 6617 if ((bp->b_xflags & BX_ALTDATA) != 0) 6618 return (1); 6619 /* -1 is full truncation. */ 6620 if (lastlbn == -1) 6621 return (1); 6622 /* 6623 * If this is a partial truncate we only want those 6624 * blocks and indirect blocks that cover the range 6625 * we're after. 6626 */ 6627 lbn = bp->b_lblkno; 6628 if (lbn < 0) 6629 lbn = -(lbn + lbn_level(lbn)); 6630 if (lbn < lastlbn) 6631 return (0); 6632 /* Here we only truncate lblkno if it's partial. */ 6633 if (lbn == lastlbn) { 6634 if (lastoff == 0) 6635 return (0); 6636 *blkoffp = lastoff; 6637 } 6638 return (1); 6639 } 6640 6641 /* 6642 * Eliminate any dependencies that exist in memory beyond lblkno:off 6643 */ 6644 static void 6645 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) 6646 struct inode *ip; 6647 struct freeblks *freeblks; 6648 ufs_lbn_t lastlbn; 6649 int lastoff; 6650 int flags; 6651 { 6652 struct bufobj *bo; 6653 struct vnode *vp; 6654 struct buf *bp; 6655 struct fs *fs; 6656 int blkoff; 6657 6658 /* 6659 * We must wait for any I/O in progress to finish so that 6660 * all potential buffers on the dirty list will be visible. 6661 * Once they are all there, walk the list and get rid of 6662 * any dependencies.
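 * The dirty list is scanned first so dependencies can be canceled; the clean list is then flushed in the manner of vtruncbuf().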
6663 */ 6664 fs = ip->i_fs; 6665 vp = ITOV(ip); 6666 bo = &vp->v_bufobj; 6667 BO_LOCK(bo); 6668 drain_output(vp); 6669 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 6670 bp->b_vflags &= ~BV_SCANNED; 6671 restart: 6672 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 6673 if (bp->b_vflags & BV_SCANNED) 6674 continue; 6675 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6676 bp->b_vflags |= BV_SCANNED; 6677 continue; 6678 } 6679 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 6680 goto restart; 6681 BO_UNLOCK(bo); 6682 if (deallocate_dependencies(bp, freeblks, blkoff)) 6683 bqrelse(bp); 6684 else 6685 brelse(bp); 6686 BO_LOCK(bo); 6687 goto restart; 6688 } 6689 /* 6690 * Now do the work of vtruncbuf while also matching indirect blocks. 6691 */ 6692 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) 6693 bp->b_vflags &= ~BV_SCANNED; 6694 cleanrestart: 6695 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { 6696 if (bp->b_vflags & BV_SCANNED) 6697 continue; 6698 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6699 bp->b_vflags |= BV_SCANNED; 6700 continue; 6701 } 6702 if (BUF_LOCK(bp, 6703 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 6704 BO_MTX(bo)) == ENOLCK) { 6705 BO_LOCK(bo); 6706 goto cleanrestart; 6707 } 6708 bp->b_vflags |= BV_SCANNED; 6709 BO_LOCK(bo); 6710 bremfree(bp); 6711 BO_UNLOCK(bo); 6712 if (blkoff != 0) { 6713 allocbuf(bp, blkoff); 6714 bqrelse(bp); 6715 } else { 6716 bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; 6717 brelse(bp); 6718 } 6719 BO_LOCK(bo); 6720 goto cleanrestart; 6721 } 6722 drain_output(vp); 6723 BO_UNLOCK(bo); 6724 } 6725 6726 static int 6727 cancel_pagedep(pagedep, freeblks, blkoff) 6728 struct pagedep *pagedep; 6729 struct freeblks *freeblks; 6730 int blkoff; 6731 { 6732 struct jremref *jremref; 6733 struct jmvref *jmvref; 6734 struct dirrem *dirrem, *tmp; 6735 int i; 6736 6737 /* 6738 * Copy any directory remove dependencies to the list 6739 * to be processed after the freeblks proceeds. If 6740 * directory entry never made it to disk they 6741 * can be dumped directly onto the work list. 6742 */ 6743 LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { 6744 /* Skip this directory removal if it is intended to remain. */ 6745 if (dirrem->dm_offset < blkoff) 6746 continue; 6747 /* 6748 * If there are any dirrems we wait for the journal write 6749 * to complete and then restart the buf scan as the lock 6750 * has been dropped. 6751 */ 6752 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 6753 jwait(&jremref->jr_list, MNT_WAIT); 6754 return (ERESTART); 6755 } 6756 LIST_REMOVE(dirrem, dm_next); 6757 dirrem->dm_dirinum = pagedep->pd_ino; 6758 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); 6759 } 6760 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 6761 jwait(&jmvref->jm_list, MNT_WAIT); 6762 return (ERESTART); 6763 } 6764 /* 6765 * When we're partially truncating a pagedep we just want to flush 6766 * journal entries and return. There can not be any adds in the 6767 * truncated portion of the directory and newblk must remain if 6768 * part of the block remains. 
6769 */ 6770 if (blkoff != 0) { 6771 struct diradd *dap; 6772 6773 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 6774 if (dap->da_offset > blkoff) 6775 panic("cancel_pagedep: diradd %p off %d > %d", 6776 dap, dap->da_offset, blkoff); 6777 for (i = 0; i < DAHASHSZ; i++) 6778 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) 6779 if (dap->da_offset > blkoff) 6780 panic("cancel_pagedep: diradd %p off %d > %d", 6781 dap, dap->da_offset, blkoff); 6782 return (0); 6783 } 6784 /* 6785 * There should be no directory add dependencies present 6786 * as the directory could not be truncated until all 6787 * children were removed. 6788 */ 6789 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 6790 ("deallocate_dependencies: pendinghd != NULL")); 6791 for (i = 0; i < DAHASHSZ; i++) 6792 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 6793 ("deallocate_dependencies: diraddhd != NULL")); 6794 if ((pagedep->pd_state & NEWBLOCK) != 0) 6795 free_newdirblk(pagedep->pd_newdirblk); 6796 if (free_pagedep(pagedep) == 0) 6797 panic("Failed to free pagedep %p", pagedep); 6798 return (0); 6799 } 6800 6801 /* 6802 * Reclaim any dependency structures from a buffer that is about to 6803 * be reallocated to a new vnode. The buffer must be locked, thus, 6804 * no I/O completion operations can occur while we are manipulating 6805 * its associated dependencies. The mutex is held so that other I/O's 6806 * associated with related dependencies do not occur. 6807 */ 6808 static int 6809 deallocate_dependencies(bp, freeblks, off) 6810 struct buf *bp; 6811 struct freeblks *freeblks; 6812 int off; 6813 { 6814 struct indirdep *indirdep; 6815 struct pagedep *pagedep; 6816 struct allocdirect *adp; 6817 struct worklist *wk, *wkn; 6818 6819 ACQUIRE_LOCK(&lk); 6820 LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { 6821 switch (wk->wk_type) { 6822 case D_INDIRDEP: 6823 indirdep = WK_INDIRDEP(wk); 6824 if (bp->b_lblkno >= 0 || 6825 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 6826 panic("deallocate_dependencies: not indir"); 6827 cancel_indirdep(indirdep, bp, freeblks); 6828 continue; 6829 6830 case D_PAGEDEP: 6831 pagedep = WK_PAGEDEP(wk); 6832 if (cancel_pagedep(pagedep, freeblks, off)) { 6833 FREE_LOCK(&lk); 6834 return (ERESTART); 6835 } 6836 continue; 6837 6838 case D_ALLOCINDIR: 6839 /* 6840 * Simply remove the allocindir, we'll find it via 6841 * the indirdep where we can clear pointers if 6842 * needed. 6843 */ 6844 WORKLIST_REMOVE(wk); 6845 continue; 6846 6847 case D_FREEWORK: 6848 /* 6849 * A truncation is waiting for the zero'd pointers 6850 * to be written. It can be freed when the freeblks 6851 * is journaled. 6852 */ 6853 WORKLIST_REMOVE(wk); 6854 wk->wk_state |= ONDEPLIST; 6855 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 6856 break; 6857 6858 case D_ALLOCDIRECT: 6859 adp = WK_ALLOCDIRECT(wk); 6860 if (off != 0) 6861 continue; 6862 /* FALLTHROUGH */ 6863 default: 6864 panic("deallocate_dependencies: Unexpected type %s", 6865 TYPENAME(wk->wk_type)); 6866 /* NOTREACHED */ 6867 } 6868 } 6869 FREE_LOCK(&lk); 6870 /* 6871 * Don't throw away this buf, we were partially truncating and 6872 * some deps may always remain. 6873 */ 6874 if (off) { 6875 allocbuf(bp, off); 6876 bp->b_vflags |= BV_SCANNED; 6877 return (EBUSY); 6878 } 6879 bp->b_flags |= B_INVAL | B_NOCACHE; 6880 6881 return (0); 6882 } 6883 6884 /* 6885 * An allocdirect is being canceled due to a truncate. We must make sure 6886 * the journal entry is released in concert with the blkfree that releases 6887 * the storage. 
Completed journal entries must not be released until the 6888 * space is no longer pointed to by the inode or in the bitmap. 6889 */ 6890 static void 6891 cancel_allocdirect(adphead, adp, freeblks) 6892 struct allocdirectlst *adphead; 6893 struct allocdirect *adp; 6894 struct freeblks *freeblks; 6895 { 6896 struct freework *freework; 6897 struct newblk *newblk; 6898 struct worklist *wk; 6899 6900 TAILQ_REMOVE(adphead, adp, ad_next); 6901 newblk = (struct newblk *)adp; 6902 freework = NULL; 6903 /* 6904 * Find the correct freework structure. 6905 */ 6906 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 6907 if (wk->wk_type != D_FREEWORK) 6908 continue; 6909 freework = WK_FREEWORK(wk); 6910 if (freework->fw_blkno == newblk->nb_newblkno) 6911 break; 6912 } 6913 if (freework == NULL) 6914 panic("cancel_allocdirect: Freework not found"); 6915 /* 6916 * If a newblk exists at all we still have the journal entry that 6917 * initiated the allocation so we do not need to journal the free. 6918 */ 6919 cancel_jfreeblk(freeblks, freework->fw_blkno); 6920 /* 6921 * If the journal hasn't been written the jnewblk must be passed 6922 * to the call to ffs_blkfree that reclaims the space. We accomplish 6923 * this by linking the journal dependency into the freework to be 6924 * freed when freework_freeblock() is called. If the journal has 6925 * been written we can simply reclaim the journal space when the 6926 * freeblks work is complete. 6927 */ 6928 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 6929 &freeblks->fb_jwork); 6930 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 6931 } 6932 6933 6934 /* 6935 * Cancel a new block allocation. May be an indirect or direct block. We 6936 * remove it from various lists and return any journal record that needs to 6937 * be resolved by the caller. 6938 * 6939 * A special consideration is made for indirects which were never pointed 6940 * at on disk and will never be found once this block is released. 6941 */ 6942 static struct jnewblk * 6943 cancel_newblk(newblk, wk, wkhd) 6944 struct newblk *newblk; 6945 struct worklist *wk; 6946 struct workhead *wkhd; 6947 { 6948 struct jnewblk *jnewblk; 6949 6950 newblk->nb_state |= GOINGAWAY; 6951 /* 6952 * Previously we traversed the completedhd on each indirdep 6953 * attached to this newblk to cancel them and gather journal 6954 * work. Since we need only the oldest journal segment and 6955 * the lowest point on the tree will always have the oldest 6956 * journal segment we are free to release the segments 6957 * of any subordinates and may leave the indirdep list to 6958 * indirdep_complete() when this newblk is freed. 6959 */ 6960 if (newblk->nb_state & ONDEPLIST) { 6961 newblk->nb_state &= ~ONDEPLIST; 6962 LIST_REMOVE(newblk, nb_deps); 6963 } 6964 if (newblk->nb_state & ONWORKLIST) 6965 WORKLIST_REMOVE(&newblk->nb_list); 6966 /* 6967 * If the journal entry hasn't been written we save a pointer to 6968 * the dependency that frees it until it is written or the 6969 * superseding operation completes. 6970 */ 6971 jnewblk = newblk->nb_jnewblk; 6972 if (jnewblk != NULL && wk != NULL) { 6973 newblk->nb_jnewblk = NULL; 6974 jnewblk->jn_dep = wk; 6975 } 6976 if (!LIST_EMPTY(&newblk->nb_jwork)) 6977 jwork_move(wkhd, &newblk->nb_jwork); 6978 /* 6979 * When truncating we must free the newdirblk early to remove 6980 * the pagedep from the hash before returning. 
6981 */ 6982 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 6983 free_newdirblk(WK_NEWDIRBLK(wk)); 6984 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 6985 panic("cancel_newblk: extra newdirblk"); 6986 6987 return (jnewblk); 6988 } 6989 6990 /* 6991 * Schedule the freefrag associated with a newblk to be released once 6992 * the pointers are written and the previous block is no longer needed. 6993 */ 6994 static void 6995 newblk_freefrag(newblk) 6996 struct newblk *newblk; 6997 { 6998 struct freefrag *freefrag; 6999 7000 if (newblk->nb_freefrag == NULL) 7001 return; 7002 freefrag = newblk->nb_freefrag; 7003 newblk->nb_freefrag = NULL; 7004 freefrag->ff_state |= COMPLETE; 7005 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 7006 add_to_worklist(&freefrag->ff_list, 0); 7007 } 7008 7009 /* 7010 * Free a newblk. Generate a new freefrag work request if appropriate. 7011 * This must be called after the inode pointer and any direct block pointers 7012 * are valid or fully removed via truncate or frag extension. 7013 */ 7014 static void 7015 free_newblk(newblk) 7016 struct newblk *newblk; 7017 { 7018 struct indirdep *indirdep; 7019 struct worklist *wk; 7020 7021 KASSERT(newblk->nb_jnewblk == NULL, 7022 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 7023 mtx_assert(&lk, MA_OWNED); 7024 newblk_freefrag(newblk); 7025 if (newblk->nb_state & ONDEPLIST) 7026 LIST_REMOVE(newblk, nb_deps); 7027 if (newblk->nb_state & ONWORKLIST) 7028 WORKLIST_REMOVE(&newblk->nb_list); 7029 LIST_REMOVE(newblk, nb_hash); 7030 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7031 free_newdirblk(WK_NEWDIRBLK(wk)); 7032 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7033 panic("free_newblk: extra newdirblk"); 7034 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) 7035 indirdep_complete(indirdep); 7036 handle_jwork(&newblk->nb_jwork); 7037 newblk->nb_list.wk_type = D_NEWBLK; 7038 WORKITEM_FREE(newblk, D_NEWBLK); 7039 } 7040 7041 /* 7042 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 7043 * This routine must be called with splbio interrupts blocked. 7044 */ 7045 static void 7046 free_newdirblk(newdirblk) 7047 struct newdirblk *newdirblk; 7048 { 7049 struct pagedep *pagedep; 7050 struct diradd *dap; 7051 struct worklist *wk; 7052 7053 mtx_assert(&lk, MA_OWNED); 7054 WORKLIST_REMOVE(&newdirblk->db_list); 7055 /* 7056 * If the pagedep is still linked onto the directory buffer 7057 * dependency chain, then some of the entries on the 7058 * pd_pendinghd list may not be committed to disk yet. In 7059 * this case, we will simply clear the NEWBLOCK flag and 7060 * let the pd_pendinghd list be processed when the pagedep 7061 * is next written. If the pagedep is no longer on the buffer 7062 * dependency chain, then all the entries on the pd_pending 7063 * list are committed to disk and we can free them here. 7064 */ 7065 pagedep = newdirblk->db_pagedep; 7066 pagedep->pd_state &= ~NEWBLOCK; 7067 if ((pagedep->pd_state & ONWORKLIST) == 0) { 7068 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 7069 free_diradd(dap, NULL); 7070 /* 7071 * If no dependencies remain, the pagedep will be freed. 7072 */ 7073 free_pagedep(pagedep); 7074 } 7075 /* Should only ever be one item in the list. */ 7076 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 7077 WORKLIST_REMOVE(wk); 7078 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 7079 } 7080 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 7081 } 7082 7083 /* 7084 * Prepare an inode to be freed. 
The actual free operation is not 7085 * done until the zero'ed inode has been written to disk. 7086 */ 7087 void 7088 softdep_freefile(pvp, ino, mode) 7089 struct vnode *pvp; 7090 ino_t ino; 7091 int mode; 7092 { 7093 struct inode *ip = VTOI(pvp); 7094 struct inodedep *inodedep; 7095 struct freefile *freefile; 7096 struct freeblks *freeblks; 7097 7098 /* 7099 * This sets up the inode de-allocation dependency. 7100 */ 7101 freefile = malloc(sizeof(struct freefile), 7102 M_FREEFILE, M_SOFTDEP_FLAGS); 7103 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 7104 freefile->fx_mode = mode; 7105 freefile->fx_oldinum = ino; 7106 freefile->fx_devvp = ip->i_devvp; 7107 LIST_INIT(&freefile->fx_jwork); 7108 UFS_LOCK(ip->i_ump); 7109 ip->i_fs->fs_pendinginodes += 1; 7110 UFS_UNLOCK(ip->i_ump); 7111 7112 /* 7113 * If the inodedep does not exist, then the zero'ed inode has 7114 * been written to disk. If the allocated inode has never been 7115 * written to disk, then the on-disk inode is zero'ed. In either 7116 * case we can free the file immediately. If the journal was 7117 * canceled before being written the inode will never make it to 7118 * disk and we must send the canceled journal entries to 7119 * ffs_freefile() to be cleared in conjunction with the bitmap. 7120 * Any blocks waiting on the inode to write can be safely freed 7121 * here as it will never be written. 7122 */ 7123 ACQUIRE_LOCK(&lk); 7124 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7125 if (inodedep) { 7126 /* 7127 * Clear out freeblks that no longer need to reference 7128 * this inode. 7129 */ 7130 while ((freeblks = 7131 TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { 7132 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, 7133 fb_next); 7134 freeblks->fb_state &= ~ONDEPLIST; 7135 } 7136 /* 7137 * Remove this inode from the unlinked list. 7138 */ 7139 if (inodedep->id_state & UNLINKED) { 7140 /* 7141 * Save the journal work to be freed with the bitmap 7142 * before we clear UNLINKED. Otherwise it can be lost 7143 * if the inode block is written. 7144 */ 7145 handle_bufwait(inodedep, &freefile->fx_jwork); 7146 clear_unlinked_inodedep(inodedep); 7147 /* Re-acquire inodedep as we've dropped lk. */ 7148 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7149 } 7150 } 7151 if (inodedep == NULL || check_inode_unwritten(inodedep)) { 7152 FREE_LOCK(&lk); 7153 handle_workitem_freefile(freefile); 7154 return; 7155 } 7156 if ((inodedep->id_state & DEPCOMPLETE) == 0) 7157 inodedep->id_state |= GOINGAWAY; 7158 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 7159 FREE_LOCK(&lk); 7160 if (ip->i_number == ino) 7161 ip->i_flag |= IN_MODIFIED; 7162 } 7163 7164 /* 7165 * Check to see if an inode has never been written to disk. If 7166 * so, free the inodedep and return success, otherwise return failure. 7167 * This routine must be called with splbio interrupts blocked. 7168 * 7169 * If we still have a bitmap dependency, then the inode has never 7170 * been written to disk. Drop the dependency as it is no longer 7171 * necessary since the inode is being deallocated. We set the 7172 * ALLCOMPLETE flags since the bitmap now properly shows that the 7173 * inode is not allocated. Even if the inode is actively being 7174 * written, it has been rolled back to its zero'ed state, so we 7175 * are ensured that a zero inode is what is on the disk. For short 7176 * lived files, this change will usually result in removing all the 7177 * dependencies from the inode so that it can be freed immediately.
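 * softdep_freefile() above relies on this to call handle_workitem_freefile() immediately when the inode never reached the disk.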
7178 */ 7179 static int 7180 check_inode_unwritten(inodedep) 7181 struct inodedep *inodedep; 7182 { 7183 7184 mtx_assert(&lk, MA_OWNED); 7185 7186 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 7187 !LIST_EMPTY(&inodedep->id_dirremhd) || 7188 !LIST_EMPTY(&inodedep->id_pendinghd) || 7189 !LIST_EMPTY(&inodedep->id_bufwait) || 7190 !LIST_EMPTY(&inodedep->id_inowait) || 7191 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7192 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7193 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7194 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7195 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7196 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7197 inodedep->id_mkdiradd != NULL || 7198 inodedep->id_nlinkdelta != 0) 7199 return (0); 7200 /* 7201 * Another process might be in initiate_write_inodeblock_ufs[12] 7202 * trying to allocate memory without holding "Softdep Lock". 7203 */ 7204 if ((inodedep->id_state & IOSTARTED) != 0 && 7205 inodedep->id_savedino1 == NULL) 7206 return (0); 7207 7208 if (inodedep->id_state & ONDEPLIST) 7209 LIST_REMOVE(inodedep, id_deps); 7210 inodedep->id_state &= ~ONDEPLIST; 7211 inodedep->id_state |= ALLCOMPLETE; 7212 inodedep->id_bmsafemap = NULL; 7213 if (inodedep->id_state & ONWORKLIST) 7214 WORKLIST_REMOVE(&inodedep->id_list); 7215 if (inodedep->id_savedino1 != NULL) { 7216 free(inodedep->id_savedino1, M_SAVEDINO); 7217 inodedep->id_savedino1 = NULL; 7218 } 7219 if (free_inodedep(inodedep) == 0) 7220 panic("check_inode_unwritten: busy inode"); 7221 return (1); 7222 } 7223 7224 /* 7225 * Try to free an inodedep structure. Return 1 if it could be freed. 7226 */ 7227 static int 7228 free_inodedep(inodedep) 7229 struct inodedep *inodedep; 7230 { 7231 7232 mtx_assert(&lk, MA_OWNED); 7233 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 7234 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 7235 !LIST_EMPTY(&inodedep->id_dirremhd) || 7236 !LIST_EMPTY(&inodedep->id_pendinghd) || 7237 !LIST_EMPTY(&inodedep->id_bufwait) || 7238 !LIST_EMPTY(&inodedep->id_inowait) || 7239 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7240 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7241 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7242 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7243 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7244 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7245 inodedep->id_mkdiradd != NULL || 7246 inodedep->id_nlinkdelta != 0 || 7247 inodedep->id_savedino1 != NULL) 7248 return (0); 7249 if (inodedep->id_state & ONDEPLIST) 7250 LIST_REMOVE(inodedep, id_deps); 7251 LIST_REMOVE(inodedep, id_hash); 7252 WORKITEM_FREE(inodedep, D_INODEDEP); 7253 return (1); 7254 } 7255 7256 /* 7257 * Free the block referenced by a freework structure. The parent freeblks 7258 * structure is released and completed when the final cg bitmap reaches 7259 * the disk. This routine may be freeing a jnewblk which never made it to 7260 * disk in which case we do not have to wait as the operation is undone 7261 * in memory immediately. 7262 */ 7263 static void 7264 freework_freeblock(freework) 7265 struct freework *freework; 7266 { 7267 struct freeblks *freeblks; 7268 struct jnewblk *jnewblk; 7269 struct ufsmount *ump; 7270 struct workhead wkhd; 7271 struct fs *fs; 7272 int bsize; 7273 int needj; 7274 7275 mtx_assert(&lk, MA_OWNED); 7276 /* 7277 * Handle partial truncate separately. 
7278 */ 7279 if (freework->fw_indir) { 7280 complete_trunc_indir(freework); 7281 return; 7282 } 7283 freeblks = freework->fw_freeblks; 7284 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7285 fs = ump->um_fs; 7286 needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; 7287 bsize = lfragtosize(fs, freework->fw_frags); 7288 LIST_INIT(&wkhd); 7289 /* 7290 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives 7291 * on the indirblk hashtable and prevents premature freeing. 7292 */ 7293 freework->fw_state |= DEPCOMPLETE; 7294 /* 7295 * SUJ needs to wait for the segment referencing freed indirect 7296 * blocks to expire so that we know the checker will not confuse 7297 * a re-allocated indirect block with its old contents. 7298 */ 7299 if (needj && freework->fw_lbn <= -NDADDR) 7300 indirblk_insert(freework); 7301 /* 7302 * If we are canceling an existing jnewblk pass it to the free 7303 * routine, otherwise pass the freeblk which will ultimately 7304 * release the freeblks. If we're not journaling, we can just 7305 * free the freeblks immediately. 7306 */ 7307 jnewblk = freework->fw_jnewblk; 7308 if (jnewblk != NULL) { 7309 cancel_jnewblk(jnewblk, &wkhd); 7310 needj = 0; 7311 } else if (needj) { 7312 freework->fw_state |= DELAYEDFREE; 7313 freeblks->fb_cgwait++; 7314 WORKLIST_INSERT(&wkhd, &freework->fw_list); 7315 } 7316 FREE_LOCK(&lk); 7317 freeblks_free(ump, freeblks, btodb(bsize)); 7318 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, 7319 freeblks->fb_inum, freeblks->fb_vtype, &wkhd); 7320 ACQUIRE_LOCK(&lk); 7321 /* 7322 * The jnewblk will be discarded and the bits in the map never 7323 * made it to disk. We can immediately free the freeblk. 7324 */ 7325 if (needj == 0) 7326 handle_written_freework(freework); 7327 } 7328 7329 /* 7330 * We enqueue freework items that need processing back on the freeblks and 7331 * add the freeblks to the worklist. This makes it easier to find all work 7332 * required to flush a truncation in process_truncates(). 7333 */ 7334 static void 7335 freework_enqueue(freework) 7336 struct freework *freework; 7337 { 7338 struct freeblks *freeblks; 7339 7340 freeblks = freework->fw_freeblks; 7341 if ((freework->fw_state & INPROGRESS) == 0) 7342 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 7343 if ((freeblks->fb_state & 7344 (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && 7345 LIST_EMPTY(&freeblks->fb_jblkdephd)) 7346 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7347 } 7348 7349 /* 7350 * Start, continue, or finish the process of freeing an indirect block tree. 7351 * The free operation may be paused at any point with fw_off containing the 7352 * offset to restart from. This enables us to implement some flow control 7353 * for large truncates which may fan out and generate a huge number of 7354 * dependencies. 
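 * (A rough illustration, figures assumed for the example only: with 32K
 * blocks on UFS2 an indirect block holds 4096 pointers, so a fully
 * populated double indirect fans out into 4096 subordinate indirects;
 * fw_off lets a pass over such a level be resumed where it left off
 * instead of creating every dependency in one go.)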
7355 */ 7356 static void 7357 handle_workitem_indirblk(freework) 7358 struct freework *freework; 7359 { 7360 struct freeblks *freeblks; 7361 struct ufsmount *ump; 7362 struct fs *fs; 7363 7364 freeblks = freework->fw_freeblks; 7365 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7366 fs = ump->um_fs; 7367 if (freework->fw_state & DEPCOMPLETE) { 7368 handle_written_freework(freework); 7369 return; 7370 } 7371 if (freework->fw_off == NINDIR(fs)) { 7372 freework_freeblock(freework); 7373 return; 7374 } 7375 freework->fw_state |= INPROGRESS; 7376 FREE_LOCK(&lk); 7377 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 7378 freework->fw_lbn); 7379 ACQUIRE_LOCK(&lk); 7380 } 7381 7382 /* 7383 * Called when a freework structure attached to a cg buf is written. The 7384 * ref on either the parent or the freeblks structure is released and 7385 * the freeblks is added back to the worklist if there is more work to do. 7386 */ 7387 static void 7388 handle_written_freework(freework) 7389 struct freework *freework; 7390 { 7391 struct freeblks *freeblks; 7392 struct freework *parent; 7393 7394 freeblks = freework->fw_freeblks; 7395 parent = freework->fw_parent; 7396 if (freework->fw_state & DELAYEDFREE) 7397 freeblks->fb_cgwait--; 7398 freework->fw_state |= COMPLETE; 7399 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 7400 WORKITEM_FREE(freework, D_FREEWORK); 7401 if (parent) { 7402 if (--parent->fw_ref == 0) 7403 freework_enqueue(parent); 7404 return; 7405 } 7406 if (--freeblks->fb_ref != 0) 7407 return; 7408 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == 7409 ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 7410 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7411 } 7412 7413 /* 7414 * This workitem routine performs the block de-allocation. 7415 * The workitem is added to the pending list after the updated 7416 * inode block has been written to disk. As mentioned above, 7417 * checks regarding the number of blocks de-allocated (compared 7418 * to the number of blocks allocated for the file) are also 7419 * performed in this function. 
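 * The fb_freeworkhd list drained below may carry several kinds of items:
 * dirrems to be queued once marked complete, allocdirect and allocindir
 * records whose newblk state must be released, and freework entries
 * describing the direct blocks or indirect trees to free; the switch
 * statement dispatches each in turn.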
7420 */ 7421 static int 7422 handle_workitem_freeblocks(freeblks, flags) 7423 struct freeblks *freeblks; 7424 int flags; 7425 { 7426 struct freework *freework; 7427 struct newblk *newblk; 7428 struct allocindir *aip; 7429 struct ufsmount *ump; 7430 struct worklist *wk; 7431 7432 KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), 7433 ("handle_workitem_freeblocks: Journal entries not written.")); 7434 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7435 ACQUIRE_LOCK(&lk); 7436 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 7437 WORKLIST_REMOVE(wk); 7438 switch (wk->wk_type) { 7439 case D_DIRREM: 7440 wk->wk_state |= COMPLETE; 7441 add_to_worklist(wk, 0); 7442 continue; 7443 7444 case D_ALLOCDIRECT: 7445 free_newblk(WK_NEWBLK(wk)); 7446 continue; 7447 7448 case D_ALLOCINDIR: 7449 aip = WK_ALLOCINDIR(wk); 7450 freework = NULL; 7451 if (aip->ai_state & DELAYEDFREE) { 7452 FREE_LOCK(&lk); 7453 freework = newfreework(ump, freeblks, NULL, 7454 aip->ai_lbn, aip->ai_newblkno, 7455 ump->um_fs->fs_frag, 0, 0); 7456 ACQUIRE_LOCK(&lk); 7457 } 7458 newblk = WK_NEWBLK(wk); 7459 if (newblk->nb_jnewblk) { 7460 freework->fw_jnewblk = newblk->nb_jnewblk; 7461 newblk->nb_jnewblk->jn_dep = &freework->fw_list; 7462 newblk->nb_jnewblk = NULL; 7463 } 7464 free_newblk(newblk); 7465 continue; 7466 7467 case D_FREEWORK: 7468 freework = WK_FREEWORK(wk); 7469 if (freework->fw_lbn <= -NDADDR) 7470 handle_workitem_indirblk(freework); 7471 else 7472 freework_freeblock(freework); 7473 continue; 7474 default: 7475 panic("handle_workitem_freeblocks: Unknown type %s", 7476 TYPENAME(wk->wk_type)); 7477 } 7478 } 7479 if (freeblks->fb_ref != 0) { 7480 freeblks->fb_state &= ~INPROGRESS; 7481 wake_worklist(&freeblks->fb_list); 7482 freeblks = NULL; 7483 } 7484 FREE_LOCK(&lk); 7485 if (freeblks) 7486 return handle_complete_freeblocks(freeblks, flags); 7487 return (0); 7488 } 7489 7490 /* 7491 * Handle completion of block free via truncate. This allows fs_pending 7492 * to track the actual free block count more closely than if we only updated 7493 * it at the end. We must be careful to handle cases where the block count 7494 * on free was incorrect. 7495 */ 7496 static void 7497 freeblks_free(ump, freeblks, blocks) 7498 struct ufsmount *ump; 7499 struct freeblks *freeblks; 7500 int blocks; 7501 { 7502 struct fs *fs; 7503 ufs2_daddr_t remain; 7504 7505 UFS_LOCK(ump); 7506 remain = -freeblks->fb_chkcnt; 7507 freeblks->fb_chkcnt += blocks; 7508 if (remain > 0) { 7509 if (remain < blocks) 7510 blocks = remain; 7511 fs = ump->um_fs; 7512 fs->fs_pendingblocks -= blocks; 7513 } 7514 UFS_UNLOCK(ump); 7515 } 7516 7517 /* 7518 * Once all of the freework workitems are complete we can retire the 7519 * freeblocks dependency and any journal work awaiting completion. This 7520 * can not be called until all other dependencies are stable on disk. 7521 */ 7522 static int 7523 handle_complete_freeblocks(freeblks, flags) 7524 struct freeblks *freeblks; 7525 int flags; 7526 { 7527 struct inodedep *inodedep; 7528 struct inode *ip; 7529 struct vnode *vp; 7530 struct fs *fs; 7531 struct ufsmount *ump; 7532 ufs2_daddr_t spare; 7533 7534 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7535 fs = ump->um_fs; 7536 flags = LK_EXCLUSIVE | flags; 7537 spare = freeblks->fb_chkcnt; 7538 7539 /* 7540 * If we did not release the expected number of blocks we may have 7541 * to adjust the inode block count here. Only do so if it wasn't 7542 * a truncation to zero and the modrev still matches. 
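 * ('spare' is whatever remains of fb_chkcnt once the per-block frees have
 * been accounted by freeblks_free(); an interpretive note: a non-zero
 * remainder means the blocks actually released differ from what the
 * inode was charged for, so i_blocks and possibly fs_pendingblocks are
 * corrected below.)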
7543 */ 7544 if (spare && freeblks->fb_len != 0) { 7545 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7546 flags, &vp, FFSV_FORCEINSMQ) != 0) 7547 return (EBUSY); 7548 ip = VTOI(vp); 7549 if (DIP(ip, i_modrev) == freeblks->fb_modrev) { 7550 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); 7551 ip->i_flag |= IN_CHANGE; 7552 /* 7553 * We must wait so this happens before the 7554 * journal is reclaimed. 7555 */ 7556 ffs_update(vp, 1); 7557 } 7558 vput(vp); 7559 } 7560 if (spare < 0) { 7561 UFS_LOCK(ump); 7562 fs->fs_pendingblocks += spare; 7563 UFS_UNLOCK(ump); 7564 } 7565 #ifdef QUOTA 7566 /* Handle spare. */ 7567 if (spare) 7568 quotaadj(freeblks->fb_quota, ump, -spare); 7569 quotarele(freeblks->fb_quota); 7570 #endif 7571 ACQUIRE_LOCK(&lk); 7572 if (freeblks->fb_state & ONDEPLIST) { 7573 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7574 0, &inodedep); 7575 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); 7576 freeblks->fb_state &= ~ONDEPLIST; 7577 if (TAILQ_EMPTY(&inodedep->id_freeblklst)) 7578 free_inodedep(inodedep); 7579 } 7580 /* 7581 * All of the freeblock deps must be complete prior to this call 7582 * so it's now safe to complete earlier outstanding journal entries. 7583 */ 7584 handle_jwork(&freeblks->fb_jwork); 7585 WORKITEM_FREE(freeblks, D_FREEBLKS); 7586 FREE_LOCK(&lk); 7587 return (0); 7588 } 7589 7590 /* 7591 * Release blocks associated with the freeblks and stored in the indirect 7592 * block dbn. If level is greater than SINGLE, the block is an indirect block 7593 * and recursive calls to indirtrunc must be used to cleanse other indirect 7594 * blocks. 7595 * 7596 * This handles partial and complete truncation of blocks. Partial is noted 7597 * with goingaway == 0. In this case the freework is completed after the 7598 * zero'd indirects are written to disk. For full truncation the freework 7599 * is completed after the block is freed. 7600 */ 7601 static void 7602 indir_trunc(freework, dbn, lbn) 7603 struct freework *freework; 7604 ufs2_daddr_t dbn; 7605 ufs_lbn_t lbn; 7606 { 7607 struct freework *nfreework; 7608 struct workhead wkhd; 7609 struct freeblks *freeblks; 7610 struct buf *bp; 7611 struct fs *fs; 7612 struct indirdep *indirdep; 7613 struct ufsmount *ump; 7614 ufs1_daddr_t *bap1 = 0; 7615 ufs2_daddr_t nb, nnb, *bap2 = 0; 7616 ufs_lbn_t lbnadd, nlbn; 7617 int i, nblocks, ufs1fmt; 7618 int freedblocks; 7619 int goingaway; 7620 int freedeps; 7621 int needj; 7622 int level; 7623 int cnt; 7624 7625 freeblks = freework->fw_freeblks; 7626 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7627 fs = ump->um_fs; 7628 /* 7629 * Get buffer of block pointers to be freed. There are three cases: 7630 * 7631 * 1) Partial truncate caches the indirdep pointer in the freework 7632 * which provides us a back copy to the save bp which holds the 7633 * pointers we want to clear. When this completes the zero 7634 * pointers are written to the real copy. 7635 * 2) The indirect is being completely truncated, cancel_indirdep() 7636 * eliminated the real copy and placed the indirdep on the saved 7637 * copy. The indirdep and buf are discarded when this completes. 7638 * 3) The indirect was not in memory, we read a copy off of the disk 7639 * using the devvp and drop and invalidate the buffer when we're 7640 * done. 
7641 */ 7642 goingaway = 1; 7643 indirdep = NULL; 7644 if (freework->fw_indir != NULL) { 7645 goingaway = 0; 7646 indirdep = freework->fw_indir; 7647 bp = indirdep->ir_savebp; 7648 if (bp == NULL || bp->b_blkno != dbn) 7649 panic("indir_trunc: Bad saved buf %p blkno %jd", 7650 bp, (intmax_t)dbn); 7651 } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { 7652 /* 7653 * The lock prevents the buf dep list from changing and 7654 * indirects on devvp should only ever have one dependency. 7655 */ 7656 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); 7657 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) 7658 panic("indir_trunc: Bad indirdep %p from buf %p", 7659 indirdep, bp); 7660 } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 7661 NOCRED, &bp) != 0) { 7662 brelse(bp); 7663 return; 7664 } 7665 ACQUIRE_LOCK(&lk); 7666 /* Protects against a race with complete_trunc_indir(). */ 7667 freework->fw_state &= ~INPROGRESS; 7668 /* 7669 * If we have an indirdep we need to enforce the truncation order 7670 * and discard it when it is complete. 7671 */ 7672 if (indirdep) { 7673 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && 7674 !TAILQ_EMPTY(&indirdep->ir_trunc)) { 7675 /* 7676 * Add the complete truncate to the list on the 7677 * indirdep to enforce in-order processing. 7678 */ 7679 if (freework->fw_indir == NULL) 7680 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, 7681 freework, fw_next); 7682 FREE_LOCK(&lk); 7683 return; 7684 } 7685 /* 7686 * If we're goingaway, free the indirdep. Otherwise it will 7687 * linger until the write completes. 7688 */ 7689 if (goingaway) { 7690 free_indirdep(indirdep); 7691 ump->um_numindirdeps -= 1; 7692 } 7693 } 7694 FREE_LOCK(&lk); 7695 /* Initialize pointers depending on block size. */ 7696 if (ump->um_fstype == UFS1) { 7697 bap1 = (ufs1_daddr_t *)bp->b_data; 7698 nb = bap1[freework->fw_off]; 7699 ufs1fmt = 1; 7700 } else { 7701 bap2 = (ufs2_daddr_t *)bp->b_data; 7702 nb = bap2[freework->fw_off]; 7703 ufs1fmt = 0; 7704 } 7705 level = lbn_level(lbn); 7706 needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; 7707 lbnadd = lbn_offset(fs, level); 7708 nblocks = btodb(fs->fs_bsize); 7709 nfreework = freework; 7710 freedeps = 0; 7711 cnt = 0; 7712 /* 7713 * Reclaim blocks. Traverses into nested indirect levels and 7714 * arranges for the current level to be freed when subordinates 7715 * are free when journaling. 7716 */ 7717 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 7718 if (i != NINDIR(fs) - 1) { 7719 if (ufs1fmt) 7720 nnb = bap1[i+1]; 7721 else 7722 nnb = bap2[i+1]; 7723 } else 7724 nnb = 0; 7725 if (nb == 0) 7726 continue; 7727 cnt++; 7728 if (level != 0) { 7729 nlbn = (lbn + 1) - (i * lbnadd); 7730 if (needj != 0) { 7731 nfreework = newfreework(ump, freeblks, freework, 7732 nlbn, nb, fs->fs_frag, 0, 0); 7733 freedeps++; 7734 } 7735 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 7736 } else { 7737 struct freedep *freedep; 7738 7739 /* 7740 * Attempt to aggregate freedep dependencies for 7741 * all blocks being released to the same CG. 
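 * (One freedep per run of blocks within the same cylinder group keeps
 * the number of work items proportional to the cg buffers that must be
 * written rather than to the number of blocks freed.)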
7742 */ 7743 LIST_INIT(&wkhd); 7744 if (needj != 0 && 7745 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 7746 freedep = newfreedep(freework); 7747 WORKLIST_INSERT_UNLOCKED(&wkhd, 7748 &freedep->fd_list); 7749 freedeps++; 7750 } 7751 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 7752 fs->fs_bsize, freeblks->fb_inum, 7753 freeblks->fb_vtype, &wkhd); 7754 } 7755 } 7756 if (goingaway) { 7757 bp->b_flags |= B_INVAL | B_NOCACHE; 7758 brelse(bp); 7759 } 7760 freedblocks = 0; 7761 if (level == 0) 7762 freedblocks = (nblocks * cnt); 7763 if (needj == 0) 7764 freedblocks += nblocks; 7765 freeblks_free(ump, freeblks, freedblocks); 7766 /* 7767 * If we are journaling set up the ref counts and offset so this 7768 * indirect can be completed when its children are free. 7769 */ 7770 if (needj) { 7771 ACQUIRE_LOCK(&lk); 7772 freework->fw_off = i; 7773 freework->fw_ref += freedeps; 7774 freework->fw_ref -= NINDIR(fs) + 1; 7775 if (level == 0) 7776 freeblks->fb_cgwait += freedeps; 7777 if (freework->fw_ref == 0) 7778 freework_freeblock(freework); 7779 FREE_LOCK(&lk); 7780 return; 7781 } 7782 /* 7783 * If we're not journaling we can free the indirect now. 7784 */ 7785 dbn = dbtofsb(fs, dbn); 7786 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 7787 freeblks->fb_inum, freeblks->fb_vtype, NULL); 7788 /* Non SUJ softdep does single-threaded truncations. */ 7789 if (freework->fw_blkno == dbn) { 7790 freework->fw_state |= ALLCOMPLETE; 7791 ACQUIRE_LOCK(&lk); 7792 handle_written_freework(freework); 7793 FREE_LOCK(&lk); 7794 } 7795 return; 7796 } 7797 7798 /* 7799 * Cancel an allocindir when it is removed via truncation. When bp is not 7800 * NULL the indirect never appeared on disk and is scheduled to be freed 7801 * independently of the indir so we can more easily track journal work. 7802 */ 7803 static void 7804 cancel_allocindir(aip, bp, freeblks, trunc) 7805 struct allocindir *aip; 7806 struct buf *bp; 7807 struct freeblks *freeblks; 7808 int trunc; 7809 { 7810 struct indirdep *indirdep; 7811 struct freefrag *freefrag; 7812 struct newblk *newblk; 7813 7814 newblk = (struct newblk *)aip; 7815 LIST_REMOVE(aip, ai_next); 7816 /* 7817 * We must eliminate the pointer in bp if it must be freed on its 7818 * own due to partial truncate or pending journal work. 7819 */ 7820 if (bp && (trunc || newblk->nb_jnewblk)) { 7821 /* 7822 * Clear the pointer and mark the aip to be freed 7823 * directly if it never existed on disk. 7824 */ 7825 aip->ai_state |= DELAYEDFREE; 7826 indirdep = aip->ai_indirdep; 7827 if (indirdep->ir_state & UFS1FMT) 7828 ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7829 else 7830 ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7831 } 7832 /* 7833 * When truncating the previous pointer will be freed via 7834 * savedbp. Eliminate the freefrag which would dup free. 7835 */ 7836 if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { 7837 newblk->nb_freefrag = NULL; 7838 if (freefrag->ff_jdep) 7839 cancel_jfreefrag( 7840 WK_JFREEFRAG(freefrag->ff_jdep)); 7841 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); 7842 WORKITEM_FREE(freefrag, D_FREEFRAG); 7843 } 7844 /* 7845 * If the journal hasn't been written the jnewblk must be passed 7846 * to the call to ffs_blkfree that reclaims the space. We accomplish 7847 * this by leaving the journal dependency on the newblk to be freed 7848 * when a freework is created in handle_workitem_freeblocks(). 
7849 */ 7850 cancel_newblk(newblk, NULL, &freeblks->fb_jwork); 7851 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 7852 } 7853 7854 /* 7855 * Create the mkdir dependencies for . and .. in a new directory. Link them 7856 * in to a newdirblk so any subsequent additions are tracked properly. The 7857 * caller is responsible for adding the mkdir1 dependency to the journal 7858 * and updating id_mkdiradd. This function returns with lk held. 7859 */ 7860 static struct mkdir * 7861 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 7862 struct diradd *dap; 7863 ino_t newinum; 7864 ino_t dinum; 7865 struct buf *newdirbp; 7866 struct mkdir **mkdirp; 7867 { 7868 struct newblk *newblk; 7869 struct pagedep *pagedep; 7870 struct inodedep *inodedep; 7871 struct newdirblk *newdirblk = 0; 7872 struct mkdir *mkdir1, *mkdir2; 7873 struct worklist *wk; 7874 struct jaddref *jaddref; 7875 struct mount *mp; 7876 7877 mp = dap->da_list.wk_mp; 7878 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 7879 M_SOFTDEP_FLAGS); 7880 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 7881 LIST_INIT(&newdirblk->db_mkdir); 7882 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7883 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 7884 mkdir1->md_state = ATTACHED | MKDIR_BODY; 7885 mkdir1->md_diradd = dap; 7886 mkdir1->md_jaddref = NULL; 7887 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7888 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 7889 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 7890 mkdir2->md_diradd = dap; 7891 mkdir2->md_jaddref = NULL; 7892 if (MOUNTEDSUJ(mp) == 0) { 7893 mkdir1->md_state |= DEPCOMPLETE; 7894 mkdir2->md_state |= DEPCOMPLETE; 7895 } 7896 /* 7897 * Dependency on "." and ".." being written to disk. 7898 */ 7899 mkdir1->md_buf = newdirbp; 7900 ACQUIRE_LOCK(&lk); 7901 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 7902 /* 7903 * We must link the pagedep, allocdirect, and newdirblk for 7904 * the initial file page so the pointer to the new directory 7905 * is not written until the directory contents are live and 7906 * any subsequent additions are not marked live until the 7907 * block is reachable via the inode. 7908 */ 7909 if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) 7910 panic("setup_newdir: lost pagedep"); 7911 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 7912 if (wk->wk_type == D_ALLOCDIRECT) 7913 break; 7914 if (wk == NULL) 7915 panic("setup_newdir: lost allocdirect"); 7916 if (pagedep->pd_state & NEWBLOCK) 7917 panic("setup_newdir: NEWBLOCK already set"); 7918 newblk = WK_NEWBLK(wk); 7919 pagedep->pd_state |= NEWBLOCK; 7920 pagedep->pd_newdirblk = newdirblk; 7921 newdirblk->db_pagedep = pagedep; 7922 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 7923 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 7924 /* 7925 * Look up the inodedep for the parent directory so that we 7926 * can link mkdir2 into the pending dotdot jaddref or 7927 * the inode write if there is none. If the inode is 7928 * ALLCOMPLETE and no jaddref is present all dependencies have 7929 * been satisfied and mkdir2 can be freed. 
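 * (Recall the split above: mkdir1/MKDIR_BODY waits on the new directory's
 * first block -- the one holding "." and ".." -- while mkdir2/MKDIR_PARENT
 * waits on the parent inode whose link count grew for "..".  Both must
 * resolve before the diradd naming the new directory is considered
 * complete.)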
7930 */ 7931 inodedep_lookup(mp, dinum, 0, &inodedep); 7932 if (MOUNTEDSUJ(mp)) { 7933 if (inodedep == NULL) 7934 panic("setup_newdir: Lost parent."); 7935 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7936 inoreflst); 7937 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 7938 (jaddref->ja_state & MKDIR_PARENT), 7939 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 7940 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7941 mkdir2->md_jaddref = jaddref; 7942 jaddref->ja_mkdir = mkdir2; 7943 } else if (inodedep == NULL || 7944 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7945 dap->da_state &= ~MKDIR_PARENT; 7946 WORKITEM_FREE(mkdir2, D_MKDIR); 7947 } else { 7948 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7949 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); 7950 } 7951 *mkdirp = mkdir2; 7952 7953 return (mkdir1); 7954 } 7955 7956 /* 7957 * Directory entry addition dependencies. 7958 * 7959 * When adding a new directory entry, the inode (with its incremented link 7960 * count) must be written to disk before the directory entry's pointer to it. 7961 * Also, if the inode is newly allocated, the corresponding freemap must be 7962 * updated (on disk) before the directory entry's pointer. These requirements 7963 * are met via undo/redo on the directory entry's pointer, which consists 7964 * simply of the inode number. 7965 * 7966 * As directory entries are added and deleted, the free space within a 7967 * directory block can become fragmented. The ufs filesystem will compact 7968 * a fragmented directory block to make space for a new entry. When this 7969 * occurs, the offsets of previously added entries change. Any "diradd" 7970 * dependency structures corresponding to these entries must be updated with 7971 * the new offsets. 7972 */ 7973 7974 /* 7975 * This routine is called after the in-memory inode's link 7976 * count has been incremented, but before the directory entry's 7977 * pointer to the inode has been set. 7978 */ 7979 int 7980 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 7981 struct buf *bp; /* buffer containing directory block */ 7982 struct inode *dp; /* inode for directory */ 7983 off_t diroffset; /* offset of new entry in directory */ 7984 ino_t newinum; /* inode referenced by new directory entry */ 7985 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 7986 int isnewblk; /* entry is in a newly allocated block */ 7987 { 7988 int offset; /* offset of new entry within directory block */ 7989 ufs_lbn_t lbn; /* block in directory containing new entry */ 7990 struct fs *fs; 7991 struct diradd *dap; 7992 struct newblk *newblk; 7993 struct pagedep *pagedep; 7994 struct inodedep *inodedep; 7995 struct newdirblk *newdirblk = 0; 7996 struct mkdir *mkdir1, *mkdir2; 7997 struct jaddref *jaddref; 7998 struct mount *mp; 7999 int isindir; 8000 8001 /* 8002 * Whiteouts have no dependencies. 8003 */ 8004 if (newinum == WINO) { 8005 if (newdirbp != NULL) 8006 bdwrite(newdirbp); 8007 return (0); 8008 } 8009 jaddref = NULL; 8010 mkdir1 = mkdir2 = NULL; 8011 mp = UFSTOVFS(dp->i_ump); 8012 fs = dp->i_fs; 8013 lbn = lblkno(fs, diroffset); 8014 offset = blkoff(fs, diroffset); 8015 dap = malloc(sizeof(struct diradd), M_DIRADD, 8016 M_SOFTDEP_FLAGS|M_ZERO); 8017 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8018 dap->da_offset = offset; 8019 dap->da_newinum = newinum; 8020 dap->da_state = ATTACHED; 8021 LIST_INIT(&dap->da_jwork); 8022 isindir = bp->b_lblkno >= NDADDR; 8023 if (isnewblk && 8024 (isindir ? 
blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 8025 newdirblk = malloc(sizeof(struct newdirblk), 8026 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 8027 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8028 LIST_INIT(&newdirblk->db_mkdir); 8029 } 8030 /* 8031 * If we're creating a new directory setup the dependencies and set 8032 * the dap state to wait for them. Otherwise it's COMPLETE and 8033 * we can move on. 8034 */ 8035 if (newdirbp == NULL) { 8036 dap->da_state |= DEPCOMPLETE; 8037 ACQUIRE_LOCK(&lk); 8038 } else { 8039 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 8040 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 8041 &mkdir2); 8042 } 8043 /* 8044 * Link into parent directory pagedep to await its being written. 8045 */ 8046 pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); 8047 #ifdef DEBUG 8048 if (diradd_lookup(pagedep, offset) != NULL) 8049 panic("softdep_setup_directory_add: %p already at off %d\n", 8050 diradd_lookup(pagedep, offset), offset); 8051 #endif 8052 dap->da_pagedep = pagedep; 8053 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 8054 da_pdlist); 8055 inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep); 8056 /* 8057 * If we're journaling, link the diradd into the jaddref so it 8058 * may be completed after the journal entry is written. Otherwise, 8059 * link the diradd into its inodedep. If the inode is not yet 8060 * written place it on the bufwait list, otherwise do the post-inode 8061 * write processing to put it on the id_pendinghd list. 8062 */ 8063 if (MOUNTEDSUJ(mp)) { 8064 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8065 inoreflst); 8066 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8067 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 8068 jaddref->ja_diroff = diroffset; 8069 jaddref->ja_diradd = dap; 8070 add_to_journal(&jaddref->ja_list); 8071 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 8072 diradd_inode_written(dap, inodedep); 8073 else 8074 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8075 /* 8076 * Add the journal entries for . and .. links now that the primary 8077 * link is written. 8078 */ 8079 if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { 8080 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 8081 inoreflst, if_deps); 8082 KASSERT(jaddref != NULL && 8083 jaddref->ja_ino == jaddref->ja_parent && 8084 (jaddref->ja_state & MKDIR_BODY), 8085 ("softdep_setup_directory_add: bad dot jaddref %p", 8086 jaddref)); 8087 mkdir1->md_jaddref = jaddref; 8088 jaddref->ja_mkdir = mkdir1; 8089 /* 8090 * It is important that the dotdot journal entry 8091 * is added prior to the dot entry since dot writes 8092 * both the dot and dotdot links. These both must 8093 * be added after the primary link for the journal 8094 * to remain consistent. 8095 */ 8096 add_to_journal(&mkdir2->md_jaddref->ja_list); 8097 add_to_journal(&jaddref->ja_list); 8098 } 8099 /* 8100 * If we are adding a new directory remember this diradd so that if 8101 * we rename it we can keep the dot and dotdot dependencies. If 8102 * we are adding a new name for an inode that has a mkdiradd we 8103 * must be in rename and we have to move the dot and dotdot 8104 * dependencies to this new name. The old name is being orphaned 8105 * soon. 
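 * (A hedged example: "mkdir A" followed immediately by "mv A B" before
 * any metadata reaches the disk -- the diradd created here for "B"
 * inherits A's MKDIR_PARENT/MKDIR_BODY work through merge_diradd()
 * below.)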
8106 */ 8107 if (mkdir1 != NULL) { 8108 if (inodedep->id_mkdiradd != NULL) 8109 panic("softdep_setup_directory_add: Existing mkdir"); 8110 inodedep->id_mkdiradd = dap; 8111 } else if (inodedep->id_mkdiradd) 8112 merge_diradd(inodedep, dap); 8113 if (newdirblk) { 8114 /* 8115 * There is nothing to do if we are already tracking 8116 * this block. 8117 */ 8118 if ((pagedep->pd_state & NEWBLOCK) != 0) { 8119 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 8120 FREE_LOCK(&lk); 8121 return (0); 8122 } 8123 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 8124 == 0) 8125 panic("softdep_setup_directory_add: lost entry"); 8126 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 8127 pagedep->pd_state |= NEWBLOCK; 8128 pagedep->pd_newdirblk = newdirblk; 8129 newdirblk->db_pagedep = pagedep; 8130 FREE_LOCK(&lk); 8131 /* 8132 * If we extended into an indirect signal direnter to sync. 8133 */ 8134 if (isindir) 8135 return (1); 8136 return (0); 8137 } 8138 FREE_LOCK(&lk); 8139 return (0); 8140 } 8141 8142 /* 8143 * This procedure is called to change the offset of a directory 8144 * entry when compacting a directory block which must be owned 8145 * exclusively by the caller. Note that the actual entry movement 8146 * must be done in this procedure to ensure that no I/O completions 8147 * occur while the move is in progress. 8148 */ 8149 void 8150 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 8151 struct buf *bp; /* Buffer holding directory block. */ 8152 struct inode *dp; /* inode for directory */ 8153 caddr_t base; /* address of dp->i_offset */ 8154 caddr_t oldloc; /* address of old directory location */ 8155 caddr_t newloc; /* address of new directory location */ 8156 int entrysize; /* size of directory entry */ 8157 { 8158 int offset, oldoffset, newoffset; 8159 struct pagedep *pagedep; 8160 struct jmvref *jmvref; 8161 struct diradd *dap; 8162 struct direct *de; 8163 struct mount *mp; 8164 ufs_lbn_t lbn; 8165 int flags; 8166 8167 mp = UFSTOVFS(dp->i_ump); 8168 de = (struct direct *)oldloc; 8169 jmvref = NULL; 8170 flags = 0; 8171 /* 8172 * Moves are always journaled as it would be too complex to 8173 * determine if any affected adds or removes are present in the 8174 * journal. 8175 */ 8176 if (MOUNTEDSUJ(mp)) { 8177 flags = DEPALLOC; 8178 jmvref = newjmvref(dp, de->d_ino, 8179 dp->i_offset + (oldloc - base), 8180 dp->i_offset + (newloc - base)); 8181 } 8182 lbn = lblkno(dp->i_fs, dp->i_offset); 8183 offset = blkoff(dp->i_fs, dp->i_offset); 8184 oldoffset = offset + (oldloc - base); 8185 newoffset = offset + (newloc - base); 8186 ACQUIRE_LOCK(&lk); 8187 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 8188 goto done; 8189 dap = diradd_lookup(pagedep, oldoffset); 8190 if (dap) { 8191 dap->da_offset = newoffset; 8192 newoffset = DIRADDHASH(newoffset); 8193 oldoffset = DIRADDHASH(oldoffset); 8194 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 8195 newoffset != oldoffset) { 8196 LIST_REMOVE(dap, da_pdlist); 8197 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 8198 dap, da_pdlist); 8199 } 8200 } 8201 done: 8202 if (jmvref) { 8203 jmvref->jm_pagedep = pagedep; 8204 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 8205 add_to_journal(&jmvref->jm_list); 8206 } 8207 bcopy(oldloc, newloc, entrysize); 8208 FREE_LOCK(&lk); 8209 } 8210 8211 /* 8212 * Move the mkdir dependencies and journal work from one diradd to another 8213 * when renaming a directory. The new name must depend on the mkdir deps 8214 * completing as the old name did. 
Directories can only have one valid link
 * at a time so one must be canonical.
 */
static void
merge_diradd(inodedep, newdap)
	struct inodedep *inodedep;
	struct diradd *newdap;
{
	struct diradd *olddap;
	struct mkdir *mkdir, *nextmd;
	short state;

	olddap = inodedep->id_mkdiradd;
	inodedep->id_mkdiradd = newdap;
	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
		newdap->da_state &= ~DEPCOMPLETE;
		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
			nextmd = LIST_NEXT(mkdir, md_mkdirs);
			if (mkdir->md_diradd != olddap)
				continue;
			mkdir->md_diradd = newdap;
			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
			newdap->da_state |= state;
			olddap->da_state &= ~state;
			if ((olddap->da_state &
			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
				break;
		}
		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
			panic("merge_diradd: unfound ref");
	}
	/*
	 * Any mkdir related journal items are not safe to be freed until
	 * the new name is stable.
	 */
	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
	olddap->da_state |= DEPCOMPLETE;
	complete_diradd(olddap);
}

/*
 * Move the diradd to the pending list when all diradd dependencies are
 * complete.
 */
static void
complete_diradd(dap)
	struct diradd *dap;
{
	struct pagedep *pagedep;

	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
		if (dap->da_state & DIRCHG)
			pagedep = dap->da_previous->dm_pagedep;
		else
			pagedep = dap->da_pagedep;
		LIST_REMOVE(dap, da_pdlist);
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
	}
}

/*
 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
 * add entries and conditionally journal the remove.
 */
static void
cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
	struct diradd *dap;
	struct dirrem *dirrem;
	struct jremref *jremref;
	struct jremref *dotremref;
	struct jremref *dotdotremref;
{
	struct inodedep *inodedep;
	struct jaddref *jaddref;
	struct inoref *inoref;
	struct mkdir *mkdir;

	/*
	 * If no remove references were allocated we're on a non-journaled
	 * filesystem and can skip the cancel step.
	 */
	if (jremref == NULL) {
		free_diradd(dap, NULL);
		return;
	}
	/*
	 * Cancel the primary name and free it if it does not require
	 * journaling.
	 */
	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
	    0, &inodedep) != 0) {
		/* Abort the addref that references this diradd. */
		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
			if (inoref->if_list.wk_type != D_JADDREF)
				continue;
			jaddref = (struct jaddref *)inoref;
			if (jaddref->ja_diradd != dap)
				continue;
			if (cancel_jaddref(jaddref, inodedep,
			    &dirrem->dm_jwork) == 0) {
				free_jremref(jremref);
				jremref = NULL;
			}
			break;
		}
	}
	/*
	 * Cancel subordinate names and free them if they do not require
	 * journaling.
8323 */ 8324 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8325 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 8326 if (mkdir->md_diradd != dap) 8327 continue; 8328 if ((jaddref = mkdir->md_jaddref) == NULL) 8329 continue; 8330 mkdir->md_jaddref = NULL; 8331 if (mkdir->md_state & MKDIR_PARENT) { 8332 if (cancel_jaddref(jaddref, NULL, 8333 &dirrem->dm_jwork) == 0) { 8334 free_jremref(dotdotremref); 8335 dotdotremref = NULL; 8336 } 8337 } else { 8338 if (cancel_jaddref(jaddref, inodedep, 8339 &dirrem->dm_jwork) == 0) { 8340 free_jremref(dotremref); 8341 dotremref = NULL; 8342 } 8343 } 8344 } 8345 } 8346 8347 if (jremref) 8348 journal_jremref(dirrem, jremref, inodedep); 8349 if (dotremref) 8350 journal_jremref(dirrem, dotremref, inodedep); 8351 if (dotdotremref) 8352 journal_jremref(dirrem, dotdotremref, NULL); 8353 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 8354 free_diradd(dap, &dirrem->dm_jwork); 8355 } 8356 8357 /* 8358 * Free a diradd dependency structure. This routine must be called 8359 * with splbio interrupts blocked. 8360 */ 8361 static void 8362 free_diradd(dap, wkhd) 8363 struct diradd *dap; 8364 struct workhead *wkhd; 8365 { 8366 struct dirrem *dirrem; 8367 struct pagedep *pagedep; 8368 struct inodedep *inodedep; 8369 struct mkdir *mkdir, *nextmd; 8370 8371 mtx_assert(&lk, MA_OWNED); 8372 LIST_REMOVE(dap, da_pdlist); 8373 if (dap->da_state & ONWORKLIST) 8374 WORKLIST_REMOVE(&dap->da_list); 8375 if ((dap->da_state & DIRCHG) == 0) { 8376 pagedep = dap->da_pagedep; 8377 } else { 8378 dirrem = dap->da_previous; 8379 pagedep = dirrem->dm_pagedep; 8380 dirrem->dm_dirinum = pagedep->pd_ino; 8381 dirrem->dm_state |= COMPLETE; 8382 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8383 add_to_worklist(&dirrem->dm_list, 0); 8384 } 8385 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 8386 0, &inodedep) != 0) 8387 if (inodedep->id_mkdiradd == dap) 8388 inodedep->id_mkdiradd = NULL; 8389 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8390 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8391 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8392 if (mkdir->md_diradd != dap) 8393 continue; 8394 dap->da_state &= 8395 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 8396 LIST_REMOVE(mkdir, md_mkdirs); 8397 if (mkdir->md_state & ONWORKLIST) 8398 WORKLIST_REMOVE(&mkdir->md_list); 8399 if (mkdir->md_jaddref != NULL) 8400 panic("free_diradd: Unexpected jaddref"); 8401 WORKITEM_FREE(mkdir, D_MKDIR); 8402 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 8403 break; 8404 } 8405 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8406 panic("free_diradd: unfound ref"); 8407 } 8408 if (inodedep) 8409 free_inodedep(inodedep); 8410 /* 8411 * Free any journal segments waiting for the directory write. 8412 */ 8413 handle_jwork(&dap->da_jwork); 8414 WORKITEM_FREE(dap, D_DIRADD); 8415 } 8416 8417 /* 8418 * Directory entry removal dependencies. 8419 * 8420 * When removing a directory entry, the entry's inode pointer must be 8421 * zero'ed on disk before the corresponding inode's link count is decremented 8422 * (possibly freeing the inode for re-use). This dependency is handled by 8423 * updating the directory entry but delaying the inode count reduction until 8424 * after the directory block has been written to disk. After this point, the 8425 * inode count can be decremented whenever it is convenient. 8426 */ 8427 8428 /* 8429 * This routine should be called immediately after removing 8430 * a directory entry. 
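 * A hypothetical caller sketch (assumed for illustration, not taken from
 * this file): after dropping the in-memory ip->i_effnlink and clearing
 * the on-disk entry in the directory block "bp", the UFS directory code
 * would call
 *
 *	softdep_setup_remove(bp, dp, ip, isrmdir);
 *
 * and leave ip->i_nlink untouched for the soft updates code to settle.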
The inode's link count should not be 8431 * decremented by the calling procedure -- the soft updates 8432 * code will do this task when it is safe. 8433 */ 8434 void 8435 softdep_setup_remove(bp, dp, ip, isrmdir) 8436 struct buf *bp; /* buffer containing directory block */ 8437 struct inode *dp; /* inode for the directory being modified */ 8438 struct inode *ip; /* inode for directory entry being removed */ 8439 int isrmdir; /* indicates if doing RMDIR */ 8440 { 8441 struct dirrem *dirrem, *prevdirrem; 8442 struct inodedep *inodedep; 8443 int direct; 8444 8445 /* 8446 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 8447 * newdirrem() to setup the full directory remove which requires 8448 * isrmdir > 1. 8449 */ 8450 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8451 /* 8452 * Add the dirrem to the inodedep's pending remove list for quick 8453 * discovery later. 8454 */ 8455 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8456 &inodedep) == 0) 8457 panic("softdep_setup_remove: Lost inodedep."); 8458 dirrem->dm_state |= ONDEPLIST; 8459 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8460 8461 /* 8462 * If the COMPLETE flag is clear, then there were no active 8463 * entries and we want to roll back to a zeroed entry until 8464 * the new inode is committed to disk. If the COMPLETE flag is 8465 * set then we have deleted an entry that never made it to 8466 * disk. If the entry we deleted resulted from a name change, 8467 * then the old name still resides on disk. We cannot delete 8468 * its inode (returned to us in prevdirrem) until the zeroed 8469 * directory entry gets to disk. The new inode has never been 8470 * referenced on the disk, so can be deleted immediately. 8471 */ 8472 if ((dirrem->dm_state & COMPLETE) == 0) { 8473 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 8474 dm_next); 8475 FREE_LOCK(&lk); 8476 } else { 8477 if (prevdirrem != NULL) 8478 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 8479 prevdirrem, dm_next); 8480 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 8481 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 8482 FREE_LOCK(&lk); 8483 if (direct) 8484 handle_workitem_remove(dirrem, 0); 8485 } 8486 } 8487 8488 /* 8489 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the 8490 * pd_pendinghd list of a pagedep. 8491 */ 8492 static struct diradd * 8493 diradd_lookup(pagedep, offset) 8494 struct pagedep *pagedep; 8495 int offset; 8496 { 8497 struct diradd *dap; 8498 8499 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 8500 if (dap->da_offset == offset) 8501 return (dap); 8502 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 8503 if (dap->da_offset == offset) 8504 return (dap); 8505 return (NULL); 8506 } 8507 8508 /* 8509 * Search for a .. diradd dependency in a directory that is being removed. 8510 * If the directory was renamed to a new parent we have a diradd rather 8511 * than a mkdir for the .. entry. We need to cancel it now before 8512 * it is found in truncate(). 
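 * (For instance, a directory that was created and then moved under a new
 * parent before anything reached the disk: the rename left a diradd at
 * DOTDOT_OFFSET rather than a mkdir, and it is that diradd which must be
 * canceled when the directory is removed.  An illustration inferred from
 * the description above.)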
8513 */ 8514 static struct jremref * 8515 cancel_diradd_dotdot(ip, dirrem, jremref) 8516 struct inode *ip; 8517 struct dirrem *dirrem; 8518 struct jremref *jremref; 8519 { 8520 struct pagedep *pagedep; 8521 struct diradd *dap; 8522 struct worklist *wk; 8523 8524 if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, 8525 &pagedep) == 0) 8526 return (jremref); 8527 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 8528 if (dap == NULL) 8529 return (jremref); 8530 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 8531 /* 8532 * Mark any journal work as belonging to the parent so it is freed 8533 * with the .. reference. 8534 */ 8535 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8536 wk->wk_state |= MKDIR_PARENT; 8537 return (NULL); 8538 } 8539 8540 /* 8541 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 8542 * replace it with a dirrem/diradd pair as a result of re-parenting a 8543 * directory. This ensures that we don't simultaneously have a mkdir and 8544 * a diradd for the same .. entry. 8545 */ 8546 static struct jremref * 8547 cancel_mkdir_dotdot(ip, dirrem, jremref) 8548 struct inode *ip; 8549 struct dirrem *dirrem; 8550 struct jremref *jremref; 8551 { 8552 struct inodedep *inodedep; 8553 struct jaddref *jaddref; 8554 struct mkdir *mkdir; 8555 struct diradd *dap; 8556 8557 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8558 &inodedep) == 0) 8559 panic("cancel_mkdir_dotdot: Lost inodedep"); 8560 dap = inodedep->id_mkdiradd; 8561 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 8562 return (jremref); 8563 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 8564 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 8565 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 8566 break; 8567 if (mkdir == NULL) 8568 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 8569 if ((jaddref = mkdir->md_jaddref) != NULL) { 8570 mkdir->md_jaddref = NULL; 8571 jaddref->ja_state &= ~MKDIR_PARENT; 8572 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 8573 &inodedep) == 0) 8574 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 8575 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 8576 journal_jremref(dirrem, jremref, inodedep); 8577 jremref = NULL; 8578 } 8579 } 8580 if (mkdir->md_state & ONWORKLIST) 8581 WORKLIST_REMOVE(&mkdir->md_list); 8582 mkdir->md_state |= ALLCOMPLETE; 8583 complete_mkdir(mkdir); 8584 return (jremref); 8585 } 8586 8587 static void 8588 journal_jremref(dirrem, jremref, inodedep) 8589 struct dirrem *dirrem; 8590 struct jremref *jremref; 8591 struct inodedep *inodedep; 8592 { 8593 8594 if (inodedep == NULL) 8595 if (inodedep_lookup(jremref->jr_list.wk_mp, 8596 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 8597 panic("journal_jremref: Lost inodedep"); 8598 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 8599 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 8600 add_to_journal(&jremref->jr_list); 8601 } 8602 8603 static void 8604 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 8605 struct dirrem *dirrem; 8606 struct jremref *jremref; 8607 struct jremref *dotremref; 8608 struct jremref *dotdotremref; 8609 { 8610 struct inodedep *inodedep; 8611 8612 8613 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 8614 &inodedep) == 0) 8615 panic("dirrem_journal: Lost inodedep"); 8616 journal_jremref(dirrem, jremref, inodedep); 8617 if (dotremref) 8618 journal_jremref(dirrem, dotremref, inodedep); 8619 if (dotdotremref) 8620 journal_jremref(dirrem, dotdotremref, NULL); 8621 } 
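/*
 * A hedged illustration of the journal records set up for an rmdir on a
 * SUJ-journaled mount (a sketch of the common case, not normative):
 * newdirrem() below allocates one jremref for the victim's name in the
 * parent directory, one for the victim's own "." entry and one, marked
 * MKDIR_PARENT, for its ".." reference to the parent.  dirrem_journal()
 * above then attaches all of them to the dirrem so that the journal
 * records are written before the cleared directory block.
 */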
8622 8623 /* 8624 * Allocate a new dirrem if appropriate and return it along with 8625 * its associated pagedep. Called without a lock, returns with lock. 8626 */ 8627 static struct dirrem * 8628 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 8629 struct buf *bp; /* buffer containing directory block */ 8630 struct inode *dp; /* inode for the directory being modified */ 8631 struct inode *ip; /* inode for directory entry being removed */ 8632 int isrmdir; /* indicates if doing RMDIR */ 8633 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 8634 { 8635 int offset; 8636 ufs_lbn_t lbn; 8637 struct diradd *dap; 8638 struct dirrem *dirrem; 8639 struct pagedep *pagedep; 8640 struct jremref *jremref; 8641 struct jremref *dotremref; 8642 struct jremref *dotdotremref; 8643 struct vnode *dvp; 8644 8645 /* 8646 * Whiteouts have no deletion dependencies. 8647 */ 8648 if (ip == NULL) 8649 panic("newdirrem: whiteout"); 8650 dvp = ITOV(dp); 8651 /* 8652 * If we are over our limit, try to improve the situation. 8653 * Limiting the number of dirrem structures will also limit 8654 * the number of freefile and freeblks structures. 8655 */ 8656 ACQUIRE_LOCK(&lk); 8657 if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2) 8658 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); 8659 FREE_LOCK(&lk); 8660 dirrem = malloc(sizeof(struct dirrem), 8661 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 8662 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 8663 LIST_INIT(&dirrem->dm_jremrefhd); 8664 LIST_INIT(&dirrem->dm_jwork); 8665 dirrem->dm_state = isrmdir ? RMDIR : 0; 8666 dirrem->dm_oldinum = ip->i_number; 8667 *prevdirremp = NULL; 8668 /* 8669 * Allocate remove reference structures to track journal write 8670 * dependencies. We will always have one for the link and 8671 * when doing directories we will always have one more for dot. 8672 * When renaming a directory we skip the dotdot link change so 8673 * this is not needed. 8674 */ 8675 jremref = dotremref = dotdotremref = NULL; 8676 if (DOINGSUJ(dvp)) { 8677 if (isrmdir) { 8678 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8679 ip->i_effnlink + 2); 8680 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 8681 ip->i_effnlink + 1); 8682 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 8683 dp->i_effnlink + 1); 8684 dotdotremref->jr_state |= MKDIR_PARENT; 8685 } else 8686 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8687 ip->i_effnlink + 1); 8688 } 8689 ACQUIRE_LOCK(&lk); 8690 lbn = lblkno(dp->i_fs, dp->i_offset); 8691 offset = blkoff(dp->i_fs, dp->i_offset); 8692 pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, 8693 &pagedep); 8694 dirrem->dm_pagedep = pagedep; 8695 dirrem->dm_offset = offset; 8696 /* 8697 * If we're renaming a .. link to a new directory, cancel any 8698 * existing MKDIR_PARENT mkdir. If it has already been canceled 8699 * the jremref is preserved for any potential diradd in this 8700 * location. This can not coincide with a rmdir. 8701 */ 8702 if (dp->i_offset == DOTDOT_OFFSET) { 8703 if (isrmdir) 8704 panic("newdirrem: .. directory change during remove?"); 8705 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 8706 } 8707 /* 8708 * If we're removing a directory search for the .. dependency now and 8709 * cancel it. Any pending journal work will be added to the dirrem 8710 * to be completed when the workitem remove completes. 
8711 */ 8712 if (isrmdir) 8713 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 8714 /* 8715 * Check for a diradd dependency for the same directory entry. 8716 * If present, then both dependencies become obsolete and can 8717 * be de-allocated. 8718 */ 8719 dap = diradd_lookup(pagedep, offset); 8720 if (dap == NULL) { 8721 /* 8722 * Link the jremref structures into the dirrem so they are 8723 * written prior to the pagedep. 8724 */ 8725 if (jremref) 8726 dirrem_journal(dirrem, jremref, dotremref, 8727 dotdotremref); 8728 return (dirrem); 8729 } 8730 /* 8731 * Must be ATTACHED at this point. 8732 */ 8733 if ((dap->da_state & ATTACHED) == 0) 8734 panic("newdirrem: not ATTACHED"); 8735 if (dap->da_newinum != ip->i_number) 8736 panic("newdirrem: inum %d should be %d", 8737 ip->i_number, dap->da_newinum); 8738 /* 8739 * If we are deleting a changed name that never made it to disk, 8740 * then return the dirrem describing the previous inode (which 8741 * represents the inode currently referenced from this entry on disk). 8742 */ 8743 if ((dap->da_state & DIRCHG) != 0) { 8744 *prevdirremp = dap->da_previous; 8745 dap->da_state &= ~DIRCHG; 8746 dap->da_pagedep = pagedep; 8747 } 8748 /* 8749 * We are deleting an entry that never made it to disk. 8750 * Mark it COMPLETE so we can delete its inode immediately. 8751 */ 8752 dirrem->dm_state |= COMPLETE; 8753 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 8754 #ifdef SUJ_DEBUG 8755 if (isrmdir == 0) { 8756 struct worklist *wk; 8757 8758 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8759 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 8760 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 8761 } 8762 #endif 8763 8764 return (dirrem); 8765 } 8766 8767 /* 8768 * Directory entry change dependencies. 8769 * 8770 * Changing an existing directory entry requires that an add operation 8771 * be completed first followed by a deletion. The semantics for the addition 8772 * are identical to the description of adding a new entry above except 8773 * that the rollback is to the old inode number rather than zero. Once 8774 * the addition dependency is completed, the removal is done as described 8775 * in the removal routine above. 8776 */ 8777 8778 /* 8779 * This routine should be called immediately after changing 8780 * a directory entry. The inode's link count should not be 8781 * decremented by the calling procedure -- the soft updates 8782 * code will perform this task when it is safe. 8783 */ 8784 void 8785 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 8786 struct buf *bp; /* buffer containing directory block */ 8787 struct inode *dp; /* inode for the directory being modified */ 8788 struct inode *ip; /* inode for directory entry being removed */ 8789 ino_t newinum; /* new inode number for changed entry */ 8790 int isrmdir; /* indicates if doing RMDIR */ 8791 { 8792 int offset; 8793 struct diradd *dap = NULL; 8794 struct dirrem *dirrem, *prevdirrem; 8795 struct pagedep *pagedep; 8796 struct inodedep *inodedep; 8797 struct jaddref *jaddref; 8798 struct mount *mp; 8799 8800 offset = blkoff(dp->i_fs, dp->i_offset); 8801 mp = UFSTOVFS(dp->i_ump); 8802 8803 /* 8804 * Whiteouts do not need diradd dependencies. 
8805 */ 8806 if (newinum != WINO) { 8807 dap = malloc(sizeof(struct diradd), 8808 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 8809 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8810 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 8811 dap->da_offset = offset; 8812 dap->da_newinum = newinum; 8813 LIST_INIT(&dap->da_jwork); 8814 } 8815 8816 /* 8817 * Allocate a new dirrem and ACQUIRE_LOCK. 8818 */ 8819 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8820 pagedep = dirrem->dm_pagedep; 8821 /* 8822 * The possible values for isrmdir: 8823 * 0 - non-directory file rename 8824 * 1 - directory rename within same directory 8825 * inum - directory rename to new directory of given inode number 8826 * When renaming to a new directory, we are both deleting and 8827 * creating a new directory entry, so the link count on the new 8828 * directory should not change. Thus we do not need the followup 8829 * dirrem which is usually done in handle_workitem_remove. We set 8830 * the DIRCHG flag to tell handle_workitem_remove to skip the 8831 * followup dirrem. 8832 */ 8833 if (isrmdir > 1) 8834 dirrem->dm_state |= DIRCHG; 8835 8836 /* 8837 * Whiteouts have no additional dependencies, 8838 * so just put the dirrem on the correct list. 8839 */ 8840 if (newinum == WINO) { 8841 if ((dirrem->dm_state & COMPLETE) == 0) { 8842 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 8843 dm_next); 8844 } else { 8845 dirrem->dm_dirinum = pagedep->pd_ino; 8846 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8847 add_to_worklist(&dirrem->dm_list, 0); 8848 } 8849 FREE_LOCK(&lk); 8850 return; 8851 } 8852 /* 8853 * Add the dirrem to the inodedep's pending remove list for quick 8854 * discovery later. A valid nlinkdelta ensures that this lookup 8855 * will not fail. 8856 */ 8857 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 8858 panic("softdep_setup_directory_change: Lost inodedep."); 8859 dirrem->dm_state |= ONDEPLIST; 8860 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8861 8862 /* 8863 * If the COMPLETE flag is clear, then there were no active 8864 * entries and we want to roll back to the previous inode until 8865 * the new inode is committed to disk. If the COMPLETE flag is 8866 * set, then we have deleted an entry that never made it to disk. 8867 * If the entry we deleted resulted from a name change, then the old 8868 * inode reference still resides on disk. Any rollback that we do 8869 * needs to be to that old inode (returned to us in prevdirrem). If 8870 * the entry we deleted resulted from a create, then there is 8871 * no entry on the disk, so we want to roll back to zero rather 8872 * than the uncommitted inode. In either of the COMPLETE cases we 8873 * want to immediately free the unwritten and unreferenced inode. 8874 */ 8875 if ((dirrem->dm_state & COMPLETE) == 0) { 8876 dap->da_previous = dirrem; 8877 } else { 8878 if (prevdirrem != NULL) { 8879 dap->da_previous = prevdirrem; 8880 } else { 8881 dap->da_state &= ~DIRCHG; 8882 dap->da_pagedep = pagedep; 8883 } 8884 dirrem->dm_dirinum = pagedep->pd_ino; 8885 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8886 add_to_worklist(&dirrem->dm_list, 0); 8887 } 8888 /* 8889 * Lookup the jaddref for this journal entry. We must finish 8890 * initializing it and make the diradd write dependent on it. 8891 * If we're not journaling, put it on the id_bufwait list if the 8892 * inode is not yet written. If it is written, do the post-inode 8893 * write processing to put it on the id_pendinghd list. 
 */
	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
	if (MOUNTEDSUJ(mp)) {
		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
		    inoreflst);
		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
		    ("softdep_setup_directory_change: bad jaddref %p",
		    jaddref));
		jaddref->ja_diroff = dp->i_offset;
		jaddref->ja_diradd = dap;
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		add_to_journal(&jaddref->ja_list);
	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
		dap->da_state |= COMPLETE;
		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
	} else {
		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
		    dap, da_pdlist);
		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
	}
	/*
	 * If we're making a new name for a directory that has not been
	 * committed, we need to move the dot and dotdot references to
	 * this new name.
	 */
	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
		merge_diradd(inodedep, dap);
	FREE_LOCK(&lk);
}

/*
 * Called whenever the link count on an inode is changed.
 * It creates an inode dependency so that the new reference(s)
 * to the inode cannot be committed to disk until the updated
 * inode has been written.
 */
void
softdep_change_linkcnt(ip)
	struct inode *ip;	/* the inode with the increased link count */
{
	struct inodedep *inodedep;
	int dflags;

	ACQUIRE_LOCK(&lk);
	dflags = DEPALLOC;
	if (IS_SNAPSHOT(ip))
		dflags |= NODELAY;
	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
	if (ip->i_nlink < ip->i_effnlink)
		panic("softdep_change_linkcnt: bad delta");
	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
	FREE_LOCK(&lk);
}

/*
 * Attach a sbdep dependency to the superblock buf so that we can keep
 * track of the head of the linked list of referenced but unlinked inodes.
 */
void
softdep_setup_sbupdate(ump, fs, bp)
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
{
	struct sbdep *sbdep;
	struct worklist *wk;

	if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0)
		return;
	LIST_FOREACH(wk, &bp->b_dep, wk_list)
		if (wk->wk_type == D_SBDEP)
			break;
	if (wk != NULL)
		return;
	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
	sbdep->sb_fs = fs;
	sbdep->sb_ump = ump;
	ACQUIRE_LOCK(&lk);
	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
	FREE_LOCK(&lk);
}

/*
 * Return the first unlinked inodedep which is ready to be the head of the
 * list. The inodedep and all those after it must have valid next pointers.
8982 */ 8983 static struct inodedep * 8984 first_unlinked_inodedep(ump) 8985 struct ufsmount *ump; 8986 { 8987 struct inodedep *inodedep; 8988 struct inodedep *idp; 8989 8990 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 8991 inodedep; inodedep = idp) { 8992 if ((inodedep->id_state & UNLINKNEXT) == 0) 8993 return (NULL); 8994 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 8995 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 8996 break; 8997 if ((inodedep->id_state & UNLINKPREV) == 0) 8998 panic("first_unlinked_inodedep: prev != next"); 8999 } 9000 if (inodedep == NULL) 9001 return (NULL); 9002 9003 return (inodedep); 9004 } 9005 9006 /* 9007 * Set the sujfree unlinked head pointer prior to writing a superblock. 9008 */ 9009 static void 9010 initiate_write_sbdep(sbdep) 9011 struct sbdep *sbdep; 9012 { 9013 struct inodedep *inodedep; 9014 struct fs *bpfs; 9015 struct fs *fs; 9016 9017 bpfs = sbdep->sb_fs; 9018 fs = sbdep->sb_ump->um_fs; 9019 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9020 if (inodedep) { 9021 fs->fs_sujfree = inodedep->id_ino; 9022 inodedep->id_state |= UNLINKPREV; 9023 } else 9024 fs->fs_sujfree = 0; 9025 bpfs->fs_sujfree = fs->fs_sujfree; 9026 } 9027 9028 /* 9029 * After a superblock is written determine whether it must be written again 9030 * due to a changing unlinked list head. 9031 */ 9032 static int 9033 handle_written_sbdep(sbdep, bp) 9034 struct sbdep *sbdep; 9035 struct buf *bp; 9036 { 9037 struct inodedep *inodedep; 9038 struct mount *mp; 9039 struct fs *fs; 9040 9041 fs = sbdep->sb_fs; 9042 mp = UFSTOVFS(sbdep->sb_ump); 9043 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9044 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 9045 (inodedep == NULL && fs->fs_sujfree != 0)) { 9046 bdirty(bp); 9047 return (1); 9048 } 9049 WORKITEM_FREE(sbdep, D_SBDEP); 9050 if (fs->fs_sujfree == 0) 9051 return (0); 9052 if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) 9053 panic("handle_written_sbdep: lost inodedep"); 9054 /* 9055 * Now that we have a record of this inode in stable store allow it 9056 * to be written to free up pending work. Inodes may see a lot of 9057 * write activity after they are unlinked which we must not hold up. 9058 */ 9059 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 9060 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 9061 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 9062 inodedep, inodedep->id_state); 9063 if (inodedep->id_state & UNLINKONLIST) 9064 break; 9065 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 9066 } 9067 9068 return (0); 9069 } 9070 9071 /* 9072 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 9073 */ 9074 static void 9075 unlinked_inodedep(mp, inodedep) 9076 struct mount *mp; 9077 struct inodedep *inodedep; 9078 { 9079 struct ufsmount *ump; 9080 9081 if (MOUNTEDSUJ(mp) == 0) 9082 return; 9083 ump = VFSTOUFS(mp); 9084 ump->um_fs->fs_fmod = 1; 9085 inodedep->id_state |= UNLINKED; 9086 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 9087 } 9088 9089 /* 9090 * Remove an inodedep from the unlinked inodedep list. This may require 9091 * disk writes if the inode has made it that far. 
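 * On disk the unlinked inodes form a singly linked chain rooted at the
 * superblock's fs_sujfree field and threaded through each dinode's
 * di_freelink field.  Removing an entry therefore means rewriting its
 * predecessor, either the superblock or the previous inode, to point
 * past it.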
9092 */ 9093 static void 9094 clear_unlinked_inodedep(inodedep) 9095 struct inodedep *inodedep; 9096 { 9097 struct ufsmount *ump; 9098 struct inodedep *idp; 9099 struct inodedep *idn; 9100 struct fs *fs; 9101 struct buf *bp; 9102 ino_t ino; 9103 ino_t nino; 9104 ino_t pino; 9105 int error; 9106 9107 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9108 fs = ump->um_fs; 9109 ino = inodedep->id_ino; 9110 error = 0; 9111 for (;;) { 9112 /* 9113 * If nothing has yet been written simply remove us from 9114 * the in memory list and return. This is the most common 9115 * case where handle_workitem_remove() loses the final 9116 * reference. 9117 */ 9118 if ((inodedep->id_state & UNLINKLINKS) == 0) 9119 break; 9120 /* 9121 * If we have a NEXT pointer and no PREV pointer we can simply 9122 * clear NEXT's PREV and remove ourselves from the list. Be 9123 * careful not to clear PREV if the superblock points at 9124 * next as well. 9125 */ 9126 idn = TAILQ_NEXT(inodedep, id_unlinked); 9127 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 9128 if (idn && fs->fs_sujfree != idn->id_ino) 9129 idn->id_state &= ~UNLINKPREV; 9130 break; 9131 } 9132 /* 9133 * Here we have an inodedep which is actually linked into 9134 * the list. We must remove it by forcing a write to the 9135 * link before us, whether it be the superblock or an inode. 9136 * Unfortunately the list may change while we're waiting 9137 * on the buf lock for either resource so we must loop until 9138 * we lock the right one. If both the superblock and an 9139 * inode point to this inode we must clear the inode first 9140 * followed by the superblock. 9141 */ 9142 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9143 pino = 0; 9144 if (idp && (idp->id_state & UNLINKNEXT)) 9145 pino = idp->id_ino; 9146 FREE_LOCK(&lk); 9147 if (pino == 0) 9148 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9149 (int)fs->fs_sbsize, 0, 0, 0); 9150 else 9151 error = bread(ump->um_devvp, 9152 fsbtodb(fs, ino_to_fsba(fs, pino)), 9153 (int)fs->fs_bsize, NOCRED, &bp); 9154 ACQUIRE_LOCK(&lk); 9155 if (error) 9156 break; 9157 /* If the list has changed restart the loop. */ 9158 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9159 nino = 0; 9160 if (idp && (idp->id_state & UNLINKNEXT)) 9161 nino = idp->id_ino; 9162 if (nino != pino || 9163 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 9164 FREE_LOCK(&lk); 9165 brelse(bp); 9166 ACQUIRE_LOCK(&lk); 9167 continue; 9168 } 9169 /* 9170 * Remove us from the in memory list. After this we cannot 9171 * access the inodedep. 9172 */ 9173 idn = TAILQ_NEXT(inodedep, id_unlinked); 9174 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9175 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9176 /* 9177 * Determine the next inode number. 9178 */ 9179 nino = 0; 9180 if (idn) { 9181 /* 9182 * If next isn't on the list we can just clear prev's 9183 * state and schedule it to be fixed later. No need 9184 * to synchronously write if we're not in the real 9185 * list. 9186 */ 9187 if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { 9188 idp->id_state &= ~UNLINKNEXT; 9189 if ((idp->id_state & ONWORKLIST) == 0) 9190 WORKLIST_INSERT(&bp->b_dep, 9191 &idp->id_list); 9192 FREE_LOCK(&lk); 9193 bawrite(bp); 9194 ACQUIRE_LOCK(&lk); 9195 return; 9196 } 9197 nino = idn->id_ino; 9198 } 9199 FREE_LOCK(&lk); 9200 /* 9201 * The predecessor's next pointer is manually updated here 9202 * so that the NEXT flag is never cleared for an element 9203 * that is in the list. 
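 * When the predecessor is an inode its di_freelink is patched directly
 * in the buffer just read; when it is the superblock, a fresh copy of
 * the in-core fs is written with an sbdep attached so that fs_sujfree
 * is maintained by the normal superblock write path.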
9204 */ 9205 if (pino == 0) { 9206 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9207 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9208 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9209 bp); 9210 } else if (fs->fs_magic == FS_UFS1_MAGIC) 9211 ((struct ufs1_dinode *)bp->b_data + 9212 ino_to_fsbo(fs, pino))->di_freelink = nino; 9213 else 9214 ((struct ufs2_dinode *)bp->b_data + 9215 ino_to_fsbo(fs, pino))->di_freelink = nino; 9216 /* 9217 * If the bwrite fails we have no recourse to recover. The 9218 * filesystem is corrupted already. 9219 */ 9220 bwrite(bp); 9221 ACQUIRE_LOCK(&lk); 9222 /* 9223 * If the superblock pointer still needs to be cleared force 9224 * a write here. 9225 */ 9226 if (fs->fs_sujfree == ino) { 9227 FREE_LOCK(&lk); 9228 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9229 (int)fs->fs_sbsize, 0, 0, 0); 9230 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9231 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9232 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9233 bp); 9234 bwrite(bp); 9235 ACQUIRE_LOCK(&lk); 9236 } 9237 if (fs->fs_sujfree != ino) 9238 return; 9239 panic("clear_unlinked_inodedep: Failed to clear free head"); 9240 } 9241 if (inodedep->id_ino == fs->fs_sujfree) 9242 panic("clear_unlinked_inodedep: Freeing head of free list"); 9243 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9244 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9245 return; 9246 } 9247 9248 /* 9249 * This workitem decrements the inode's link count. 9250 * If the link count reaches zero, the file is removed. 9251 */ 9252 static int 9253 handle_workitem_remove(dirrem, flags) 9254 struct dirrem *dirrem; 9255 int flags; 9256 { 9257 struct inodedep *inodedep; 9258 struct workhead dotdotwk; 9259 struct worklist *wk; 9260 struct ufsmount *ump; 9261 struct mount *mp; 9262 struct vnode *vp; 9263 struct inode *ip; 9264 ino_t oldinum; 9265 9266 if (dirrem->dm_state & ONWORKLIST) 9267 panic("handle_workitem_remove: dirrem %p still on worklist", 9268 dirrem); 9269 oldinum = dirrem->dm_oldinum; 9270 mp = dirrem->dm_list.wk_mp; 9271 ump = VFSTOUFS(mp); 9272 flags |= LK_EXCLUSIVE; 9273 if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) 9274 return (EBUSY); 9275 ip = VTOI(vp); 9276 ACQUIRE_LOCK(&lk); 9277 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 9278 panic("handle_workitem_remove: lost inodedep"); 9279 if (dirrem->dm_state & ONDEPLIST) 9280 LIST_REMOVE(dirrem, dm_inonext); 9281 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 9282 ("handle_workitem_remove: Journal entries not written.")); 9283 9284 /* 9285 * Move all dependencies waiting on the remove to complete 9286 * from the dirrem to the inode inowait list to be completed 9287 * after the inode has been updated and written to disk. Any 9288 * marked MKDIR_PARENT are saved to be completed when the .. ref 9289 * is removed. 9290 */ 9291 LIST_INIT(&dotdotwk); 9292 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 9293 WORKLIST_REMOVE(wk); 9294 if (wk->wk_state & MKDIR_PARENT) { 9295 wk->wk_state &= ~MKDIR_PARENT; 9296 WORKLIST_INSERT(&dotdotwk, wk); 9297 continue; 9298 } 9299 WORKLIST_INSERT(&inodedep->id_inowait, wk); 9300 } 9301 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 9302 /* 9303 * Normal file deletion. 
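 * i_nlink is dropped by one and the inodedep's nlinkdelta recomputed;
 * an inode whose link count reaches zero is placed on the unlinked
 * list before the dirrem itself is freed.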
9304 */ 9305 if ((dirrem->dm_state & RMDIR) == 0) { 9306 ip->i_nlink--; 9307 DIP_SET(ip, i_nlink, ip->i_nlink); 9308 ip->i_flag |= IN_CHANGE; 9309 if (ip->i_nlink < ip->i_effnlink) 9310 panic("handle_workitem_remove: bad file delta"); 9311 if (ip->i_nlink == 0) 9312 unlinked_inodedep(mp, inodedep); 9313 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9314 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9315 ("handle_workitem_remove: worklist not empty. %s", 9316 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 9317 WORKITEM_FREE(dirrem, D_DIRREM); 9318 FREE_LOCK(&lk); 9319 goto out; 9320 } 9321 /* 9322 * Directory deletion. Decrement reference count for both the 9323 * just deleted parent directory entry and the reference for ".". 9324 * Arrange to have the reference count on the parent decremented 9325 * to account for the loss of "..". 9326 */ 9327 ip->i_nlink -= 2; 9328 DIP_SET(ip, i_nlink, ip->i_nlink); 9329 ip->i_flag |= IN_CHANGE; 9330 if (ip->i_nlink < ip->i_effnlink) 9331 panic("handle_workitem_remove: bad dir delta"); 9332 if (ip->i_nlink == 0) 9333 unlinked_inodedep(mp, inodedep); 9334 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9335 /* 9336 * Rename a directory to a new parent. Since, we are both deleting 9337 * and creating a new directory entry, the link count on the new 9338 * directory should not change. Thus we skip the followup dirrem. 9339 */ 9340 if (dirrem->dm_state & DIRCHG) { 9341 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9342 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 9343 WORKITEM_FREE(dirrem, D_DIRREM); 9344 FREE_LOCK(&lk); 9345 goto out; 9346 } 9347 dirrem->dm_state = ONDEPLIST; 9348 dirrem->dm_oldinum = dirrem->dm_dirinum; 9349 /* 9350 * Place the dirrem on the parent's diremhd list. 9351 */ 9352 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 9353 panic("handle_workitem_remove: lost dir inodedep"); 9354 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9355 /* 9356 * If the allocated inode has never been written to disk, then 9357 * the on-disk inode is zero'ed and we can remove the file 9358 * immediately. When journaling if the inode has been marked 9359 * unlinked and not DEPCOMPLETE we know it can never be written. 9360 */ 9361 inodedep_lookup(mp, oldinum, 0, &inodedep); 9362 if (inodedep == NULL || 9363 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 9364 check_inode_unwritten(inodedep)) { 9365 FREE_LOCK(&lk); 9366 vput(vp); 9367 return handle_workitem_remove(dirrem, flags); 9368 } 9369 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 9370 FREE_LOCK(&lk); 9371 ip->i_flag |= IN_CHANGE; 9372 out: 9373 ffs_update(vp, 0); 9374 vput(vp); 9375 return (0); 9376 } 9377 9378 /* 9379 * Inode de-allocation dependencies. 9380 * 9381 * When an inode's link count is reduced to zero, it can be de-allocated. We 9382 * found it convenient to postpone de-allocation until after the inode is 9383 * written to disk with its new link count (zero). At this point, all of the 9384 * on-disk inode's block pointers are nullified and, with careful dependency 9385 * list ordering, all dependencies related to the inode will be satisfied and 9386 * the corresponding dependency structures de-allocated. So, if/when the 9387 * inode is reused, there will be no mixing of old dependencies with new 9388 * ones. This artificial dependency is set up by the block de-allocation 9389 * procedure above (softdep_setup_freeblocks) and completed by the 9390 * following procedure. 
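 * The work item carrying this artificial dependency is the freefile;
 * handle_workitem_freefile() below runs once the zeroed inode is on
 * disk and calls ffs_freefile() to return the inode to the cylinder
 * group map.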
9391 */ 9392 static void 9393 handle_workitem_freefile(freefile) 9394 struct freefile *freefile; 9395 { 9396 struct workhead wkhd; 9397 struct fs *fs; 9398 struct inodedep *idp; 9399 struct ufsmount *ump; 9400 int error; 9401 9402 ump = VFSTOUFS(freefile->fx_list.wk_mp); 9403 fs = ump->um_fs; 9404 #ifdef DEBUG 9405 ACQUIRE_LOCK(&lk); 9406 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 9407 FREE_LOCK(&lk); 9408 if (error) 9409 panic("handle_workitem_freefile: inodedep %p survived", idp); 9410 #endif 9411 UFS_LOCK(ump); 9412 fs->fs_pendinginodes -= 1; 9413 UFS_UNLOCK(ump); 9414 LIST_INIT(&wkhd); 9415 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 9416 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 9417 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 9418 softdep_error("handle_workitem_freefile", error); 9419 ACQUIRE_LOCK(&lk); 9420 WORKITEM_FREE(freefile, D_FREEFILE); 9421 FREE_LOCK(&lk); 9422 } 9423 9424 9425 /* 9426 * Helper function which unlinks marker element from work list and returns 9427 * the next element on the list. 9428 */ 9429 static __inline struct worklist * 9430 markernext(struct worklist *marker) 9431 { 9432 struct worklist *next; 9433 9434 next = LIST_NEXT(marker, wk_list); 9435 LIST_REMOVE(marker, wk_list); 9436 return next; 9437 } 9438 9439 /* 9440 * Disk writes. 9441 * 9442 * The dependency structures constructed above are most actively used when file 9443 * system blocks are written to disk. No constraints are placed on when a 9444 * block can be written, but unsatisfied update dependencies are made safe by 9445 * modifying (or replacing) the source memory for the duration of the disk 9446 * write. When the disk write completes, the memory block is again brought 9447 * up-to-date. 9448 * 9449 * In-core inode structure reclamation. 9450 * 9451 * Because there are a finite number of "in-core" inode structures, they are 9452 * reused regularly. By transferring all inode-related dependencies to the 9453 * in-memory inode block and indexing them separately (via "inodedep"s), we 9454 * can allow "in-core" inode structures to be reused at any time and avoid 9455 * any increase in contention. 9456 * 9457 * Called just before entering the device driver to initiate a new disk I/O. 9458 * The buffer must be locked, thus, no I/O completion operations can occur 9459 * while we are manipulating its associated dependencies. 9460 */ 9461 static void 9462 softdep_disk_io_initiation(bp) 9463 struct buf *bp; /* structure describing disk write to occur */ 9464 { 9465 struct worklist *wk; 9466 struct worklist marker; 9467 struct inodedep *inodedep; 9468 struct freeblks *freeblks; 9469 struct jblkdep *jblkdep; 9470 struct newblk *newblk; 9471 9472 /* 9473 * We only care about write operations. There should never 9474 * be dependencies for reads. 9475 */ 9476 if (bp->b_iocmd != BIO_WRITE) 9477 panic("softdep_disk_io_initiation: not write"); 9478 9479 if (bp->b_vflags & BV_BKGRDINPROG) 9480 panic("softdep_disk_io_initiation: Writing buffer with " 9481 "background write in progress: %p", bp); 9482 9483 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 9484 PHOLD(curproc); /* Don't swap out kernel stack */ 9485 9486 ACQUIRE_LOCK(&lk); 9487 /* 9488 * Do any necessary pre-I/O processing. 
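 * A private marker work item is threaded through b_dep as the list is
 * walked so that the scan can resume safely even when jwait() drops
 * the lock and entries are added or removed around us.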
9489 */ 9490 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 9491 wk = markernext(&marker)) { 9492 LIST_INSERT_AFTER(wk, &marker, wk_list); 9493 switch (wk->wk_type) { 9494 9495 case D_PAGEDEP: 9496 initiate_write_filepage(WK_PAGEDEP(wk), bp); 9497 continue; 9498 9499 case D_INODEDEP: 9500 inodedep = WK_INODEDEP(wk); 9501 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 9502 initiate_write_inodeblock_ufs1(inodedep, bp); 9503 else 9504 initiate_write_inodeblock_ufs2(inodedep, bp); 9505 continue; 9506 9507 case D_INDIRDEP: 9508 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 9509 continue; 9510 9511 case D_BMSAFEMAP: 9512 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 9513 continue; 9514 9515 case D_JSEG: 9516 WK_JSEG(wk)->js_buf = NULL; 9517 continue; 9518 9519 case D_FREEBLKS: 9520 freeblks = WK_FREEBLKS(wk); 9521 jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); 9522 /* 9523 * We have to wait for the freeblks to be journaled 9524 * before we can write an inodeblock with updated 9525 * pointers. Be careful to arrange the marker so 9526 * we revisit the freeblks if it's not removed by 9527 * the first jwait(). 9528 */ 9529 if (jblkdep != NULL) { 9530 LIST_REMOVE(&marker, wk_list); 9531 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9532 jwait(&jblkdep->jb_list, MNT_WAIT); 9533 } 9534 continue; 9535 case D_ALLOCDIRECT: 9536 case D_ALLOCINDIR: 9537 /* 9538 * We have to wait for the jnewblk to be journaled 9539 * before we can write to a block if the contents 9540 * may be confused with an earlier file's indirect 9541 * at recovery time. Handle the marker as described 9542 * above. 9543 */ 9544 newblk = WK_NEWBLK(wk); 9545 if (newblk->nb_jnewblk != NULL && 9546 indirblk_lookup(newblk->nb_list.wk_mp, 9547 newblk->nb_newblkno)) { 9548 LIST_REMOVE(&marker, wk_list); 9549 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9550 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 9551 } 9552 continue; 9553 9554 case D_SBDEP: 9555 initiate_write_sbdep(WK_SBDEP(wk)); 9556 continue; 9557 9558 case D_MKDIR: 9559 case D_FREEWORK: 9560 case D_FREEDEP: 9561 case D_JSEGDEP: 9562 continue; 9563 9564 default: 9565 panic("handle_disk_io_initiation: Unexpected type %s", 9566 TYPENAME(wk->wk_type)); 9567 /* NOTREACHED */ 9568 } 9569 } 9570 FREE_LOCK(&lk); 9571 PRELE(curproc); /* Allow swapout of kernel stack */ 9572 } 9573 9574 /* 9575 * Called from within the procedure above to deal with unsatisfied 9576 * allocation dependencies in a directory. The buffer must be locked, 9577 * thus, no I/O completion operations can occur while we are 9578 * manipulating its associated dependencies. 9579 */ 9580 static void 9581 initiate_write_filepage(pagedep, bp) 9582 struct pagedep *pagedep; 9583 struct buf *bp; 9584 { 9585 struct jremref *jremref; 9586 struct jmvref *jmvref; 9587 struct dirrem *dirrem; 9588 struct diradd *dap; 9589 struct direct *ep; 9590 int i; 9591 9592 if (pagedep->pd_state & IOSTARTED) { 9593 /* 9594 * This can only happen if there is a driver that does not 9595 * understand chaining. Here biodone will reissue the call 9596 * to strategy for the incomplete buffers. 9597 */ 9598 printf("initiate_write_filepage: already started\n"); 9599 return; 9600 } 9601 pagedep->pd_state |= IOSTARTED; 9602 /* 9603 * Wait for all journal remove dependencies to hit the disk. 9604 * We can not allow any potentially conflicting directory adds 9605 * to be visible before removes and rollback is too difficult. 9606 * lk may be dropped and re-acquired, however we hold the buf 9607 * locked so the dependency can not go away. 
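 * Once those journal records are stable, any directory entries whose
 * additions are still uncommitted are rolled back below: an entry
 * created by a name change reverts to the previous inode number and a
 * newly created entry reverts to an empty slot.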
9608 */ 9609 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 9610 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) 9611 jwait(&jremref->jr_list, MNT_WAIT); 9612 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) 9613 jwait(&jmvref->jm_list, MNT_WAIT); 9614 for (i = 0; i < DAHASHSZ; i++) { 9615 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 9616 ep = (struct direct *) 9617 ((char *)bp->b_data + dap->da_offset); 9618 if (ep->d_ino != dap->da_newinum) 9619 panic("%s: dir inum %d != new %d", 9620 "initiate_write_filepage", 9621 ep->d_ino, dap->da_newinum); 9622 if (dap->da_state & DIRCHG) 9623 ep->d_ino = dap->da_previous->dm_oldinum; 9624 else 9625 ep->d_ino = 0; 9626 dap->da_state &= ~ATTACHED; 9627 dap->da_state |= UNDONE; 9628 } 9629 } 9630 } 9631 9632 /* 9633 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 9634 * Note that any bug fixes made to this routine must be done in the 9635 * version found below. 9636 * 9637 * Called from within the procedure above to deal with unsatisfied 9638 * allocation dependencies in an inodeblock. The buffer must be 9639 * locked, thus, no I/O completion operations can occur while we 9640 * are manipulating its associated dependencies. 9641 */ 9642 static void 9643 initiate_write_inodeblock_ufs1(inodedep, bp) 9644 struct inodedep *inodedep; 9645 struct buf *bp; /* The inode block */ 9646 { 9647 struct allocdirect *adp, *lastadp; 9648 struct ufs1_dinode *dp; 9649 struct ufs1_dinode *sip; 9650 struct inoref *inoref; 9651 struct fs *fs; 9652 ufs_lbn_t i; 9653 #ifdef INVARIANTS 9654 ufs_lbn_t prevlbn = 0; 9655 #endif 9656 int deplist; 9657 9658 if (inodedep->id_state & IOSTARTED) 9659 panic("initiate_write_inodeblock_ufs1: already started"); 9660 inodedep->id_state |= IOSTARTED; 9661 fs = inodedep->id_fs; 9662 dp = (struct ufs1_dinode *)bp->b_data + 9663 ino_to_fsbo(fs, inodedep->id_ino); 9664 9665 /* 9666 * If we're on the unlinked list but have not yet written our 9667 * next pointer initialize it here. 9668 */ 9669 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9670 struct inodedep *inon; 9671 9672 inon = TAILQ_NEXT(inodedep, id_unlinked); 9673 dp->di_freelink = inon ? inon->id_ino : 0; 9674 } 9675 /* 9676 * If the bitmap is not yet written, then the allocated 9677 * inode cannot be written to disk. 9678 */ 9679 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9680 if (inodedep->id_savedino1 != NULL) 9681 panic("initiate_write_inodeblock_ufs1: I/O underway"); 9682 FREE_LOCK(&lk); 9683 sip = malloc(sizeof(struct ufs1_dinode), 9684 M_SAVEDINO, M_SOFTDEP_FLAGS); 9685 ACQUIRE_LOCK(&lk); 9686 inodedep->id_savedino1 = sip; 9687 *inodedep->id_savedino1 = *dp; 9688 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 9689 dp->di_gen = inodedep->id_savedino1->di_gen; 9690 dp->di_freelink = inodedep->id_savedino1->di_freelink; 9691 return; 9692 } 9693 /* 9694 * If no dependencies, then there is nothing to roll back. 9695 */ 9696 inodedep->id_savedsize = dp->di_size; 9697 inodedep->id_savedextsize = 0; 9698 inodedep->id_savednlink = dp->di_nlink; 9699 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9700 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9701 return; 9702 /* 9703 * Revert the link count to that of the first unwritten journal entry. 9704 */ 9705 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9706 if (inoref) 9707 dp->di_nlink = inoref->if_nlink; 9708 /* 9709 * Set the dependencies to busy. 
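 * "Busy" here means each allocdirect is flipped from ATTACHED to
 * UNDONE for the duration of the write; the saved values are rolled
 * forward again by handle_written_inodeblock() once the buffer is
 * safely on disk.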
9710 */ 9711 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9712 adp = TAILQ_NEXT(adp, ad_next)) { 9713 #ifdef INVARIANTS 9714 if (deplist != 0 && prevlbn >= adp->ad_offset) 9715 panic("softdep_write_inodeblock: lbn order"); 9716 prevlbn = adp->ad_offset; 9717 if (adp->ad_offset < NDADDR && 9718 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9719 panic("%s: direct pointer #%jd mismatch %d != %jd", 9720 "softdep_write_inodeblock", 9721 (intmax_t)adp->ad_offset, 9722 dp->di_db[adp->ad_offset], 9723 (intmax_t)adp->ad_newblkno); 9724 if (adp->ad_offset >= NDADDR && 9725 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9726 panic("%s: indirect pointer #%jd mismatch %d != %jd", 9727 "softdep_write_inodeblock", 9728 (intmax_t)adp->ad_offset - NDADDR, 9729 dp->di_ib[adp->ad_offset - NDADDR], 9730 (intmax_t)adp->ad_newblkno); 9731 deplist |= 1 << adp->ad_offset; 9732 if ((adp->ad_state & ATTACHED) == 0) 9733 panic("softdep_write_inodeblock: Unknown state 0x%x", 9734 adp->ad_state); 9735 #endif /* INVARIANTS */ 9736 adp->ad_state &= ~ATTACHED; 9737 adp->ad_state |= UNDONE; 9738 } 9739 /* 9740 * The on-disk inode cannot claim to be any larger than the last 9741 * fragment that has been written. Otherwise, the on-disk inode 9742 * might have fragments that were not the last block in the file 9743 * which would corrupt the filesystem. 9744 */ 9745 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9746 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9747 if (adp->ad_offset >= NDADDR) 9748 break; 9749 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9750 /* keep going until hitting a rollback to a frag */ 9751 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9752 continue; 9753 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9754 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9755 #ifdef INVARIANTS 9756 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9757 panic("softdep_write_inodeblock: lost dep1"); 9758 #endif /* INVARIANTS */ 9759 dp->di_db[i] = 0; 9760 } 9761 for (i = 0; i < NIADDR; i++) { 9762 #ifdef INVARIANTS 9763 if (dp->di_ib[i] != 0 && 9764 (deplist & ((1 << NDADDR) << i)) == 0) 9765 panic("softdep_write_inodeblock: lost dep2"); 9766 #endif /* INVARIANTS */ 9767 dp->di_ib[i] = 0; 9768 } 9769 return; 9770 } 9771 /* 9772 * If we have zero'ed out the last allocated block of the file, 9773 * roll back the size to the last currently allocated block. 9774 * We know that this last allocated block is a full-sized as 9775 * we already checked for fragments in the loop above. 9776 */ 9777 if (lastadp != NULL && 9778 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9779 for (i = lastadp->ad_offset; i >= 0; i--) 9780 if (dp->di_db[i] != 0) 9781 break; 9782 dp->di_size = (i + 1) * fs->fs_bsize; 9783 } 9784 /* 9785 * The only dependencies are for indirect blocks. 9786 * 9787 * The file size for indirect block additions is not guaranteed. 9788 * Such a guarantee would be non-trivial to achieve. The conventional 9789 * synchronous write implementation also does not make this guarantee. 9790 * Fsck should catch and fix discrepancies. Arguably, the file size 9791 * can be over-estimated without destroying integrity when the file 9792 * moves into the indirect blocks (i.e., is large). If we want to 9793 * postpone fsck, we are stuck with this argument. 
9794 */ 9795 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 9796 dp->di_ib[adp->ad_offset - NDADDR] = 0; 9797 } 9798 9799 /* 9800 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 9801 * Note that any bug fixes made to this routine must be done in the 9802 * version found above. 9803 * 9804 * Called from within the procedure above to deal with unsatisfied 9805 * allocation dependencies in an inodeblock. The buffer must be 9806 * locked, thus, no I/O completion operations can occur while we 9807 * are manipulating its associated dependencies. 9808 */ 9809 static void 9810 initiate_write_inodeblock_ufs2(inodedep, bp) 9811 struct inodedep *inodedep; 9812 struct buf *bp; /* The inode block */ 9813 { 9814 struct allocdirect *adp, *lastadp; 9815 struct ufs2_dinode *dp; 9816 struct ufs2_dinode *sip; 9817 struct inoref *inoref; 9818 struct fs *fs; 9819 ufs_lbn_t i; 9820 #ifdef INVARIANTS 9821 ufs_lbn_t prevlbn = 0; 9822 #endif 9823 int deplist; 9824 9825 if (inodedep->id_state & IOSTARTED) 9826 panic("initiate_write_inodeblock_ufs2: already started"); 9827 inodedep->id_state |= IOSTARTED; 9828 fs = inodedep->id_fs; 9829 dp = (struct ufs2_dinode *)bp->b_data + 9830 ino_to_fsbo(fs, inodedep->id_ino); 9831 9832 /* 9833 * If we're on the unlinked list but have not yet written our 9834 * next pointer initialize it here. 9835 */ 9836 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9837 struct inodedep *inon; 9838 9839 inon = TAILQ_NEXT(inodedep, id_unlinked); 9840 dp->di_freelink = inon ? inon->id_ino : 0; 9841 } 9842 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == 9843 (UNLINKED | UNLINKNEXT)) { 9844 struct inodedep *inon; 9845 ino_t freelink; 9846 9847 inon = TAILQ_NEXT(inodedep, id_unlinked); 9848 freelink = inon ? inon->id_ino : 0; 9849 if (freelink != dp->di_freelink) 9850 panic("ino %p(0x%X) %d, %d != %d", 9851 inodedep, inodedep->id_state, inodedep->id_ino, 9852 freelink, dp->di_freelink); 9853 } 9854 /* 9855 * If the bitmap is not yet written, then the allocated 9856 * inode cannot be written to disk. 9857 */ 9858 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9859 if (inodedep->id_savedino2 != NULL) 9860 panic("initiate_write_inodeblock_ufs2: I/O underway"); 9861 FREE_LOCK(&lk); 9862 sip = malloc(sizeof(struct ufs2_dinode), 9863 M_SAVEDINO, M_SOFTDEP_FLAGS); 9864 ACQUIRE_LOCK(&lk); 9865 inodedep->id_savedino2 = sip; 9866 *inodedep->id_savedino2 = *dp; 9867 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 9868 dp->di_gen = inodedep->id_savedino2->di_gen; 9869 dp->di_freelink = inodedep->id_savedino2->di_freelink; 9870 return; 9871 } 9872 /* 9873 * If no dependencies, then there is nothing to roll back. 9874 */ 9875 inodedep->id_savedsize = dp->di_size; 9876 inodedep->id_savedextsize = dp->di_extsize; 9877 inodedep->id_savednlink = dp->di_nlink; 9878 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9879 TAILQ_EMPTY(&inodedep->id_extupdt) && 9880 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9881 return; 9882 /* 9883 * Revert the link count to that of the first unwritten journal entry. 9884 */ 9885 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9886 if (inoref) 9887 dp->di_nlink = inoref->if_nlink; 9888 9889 /* 9890 * Set the ext data dependencies to busy. 
9891 */ 9892 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9893 adp = TAILQ_NEXT(adp, ad_next)) { 9894 #ifdef INVARIANTS 9895 if (deplist != 0 && prevlbn >= adp->ad_offset) 9896 panic("softdep_write_inodeblock: lbn order"); 9897 prevlbn = adp->ad_offset; 9898 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 9899 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9900 "softdep_write_inodeblock", 9901 (intmax_t)adp->ad_offset, 9902 (intmax_t)dp->di_extb[adp->ad_offset], 9903 (intmax_t)adp->ad_newblkno); 9904 deplist |= 1 << adp->ad_offset; 9905 if ((adp->ad_state & ATTACHED) == 0) 9906 panic("softdep_write_inodeblock: Unknown state 0x%x", 9907 adp->ad_state); 9908 #endif /* INVARIANTS */ 9909 adp->ad_state &= ~ATTACHED; 9910 adp->ad_state |= UNDONE; 9911 } 9912 /* 9913 * The on-disk inode cannot claim to be any larger than the last 9914 * fragment that has been written. Otherwise, the on-disk inode 9915 * might have fragments that were not the last block in the ext 9916 * data which would corrupt the filesystem. 9917 */ 9918 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9919 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9920 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 9921 /* keep going until hitting a rollback to a frag */ 9922 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9923 continue; 9924 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9925 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 9926 #ifdef INVARIANTS 9927 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 9928 panic("softdep_write_inodeblock: lost dep1"); 9929 #endif /* INVARIANTS */ 9930 dp->di_extb[i] = 0; 9931 } 9932 lastadp = NULL; 9933 break; 9934 } 9935 /* 9936 * If we have zero'ed out the last allocated block of the ext 9937 * data, roll back the size to the last currently allocated block. 9938 * We know that this last allocated block is a full-sized as 9939 * we already checked for fragments in the loop above. 9940 */ 9941 if (lastadp != NULL && 9942 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9943 for (i = lastadp->ad_offset; i >= 0; i--) 9944 if (dp->di_extb[i] != 0) 9945 break; 9946 dp->di_extsize = (i + 1) * fs->fs_bsize; 9947 } 9948 /* 9949 * Set the file data dependencies to busy. 
9950 */ 9951 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9952 adp = TAILQ_NEXT(adp, ad_next)) { 9953 #ifdef INVARIANTS 9954 if (deplist != 0 && prevlbn >= adp->ad_offset) 9955 panic("softdep_write_inodeblock: lbn order"); 9956 if ((adp->ad_state & ATTACHED) == 0) 9957 panic("inodedep %p and adp %p not attached", inodedep, adp); 9958 prevlbn = adp->ad_offset; 9959 if (adp->ad_offset < NDADDR && 9960 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9961 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9962 "softdep_write_inodeblock", 9963 (intmax_t)adp->ad_offset, 9964 (intmax_t)dp->di_db[adp->ad_offset], 9965 (intmax_t)adp->ad_newblkno); 9966 if (adp->ad_offset >= NDADDR && 9967 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9968 panic("%s indirect pointer #%jd mismatch %jd != %jd", 9969 "softdep_write_inodeblock:", 9970 (intmax_t)adp->ad_offset - NDADDR, 9971 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 9972 (intmax_t)adp->ad_newblkno); 9973 deplist |= 1 << adp->ad_offset; 9974 if ((adp->ad_state & ATTACHED) == 0) 9975 panic("softdep_write_inodeblock: Unknown state 0x%x", 9976 adp->ad_state); 9977 #endif /* INVARIANTS */ 9978 adp->ad_state &= ~ATTACHED; 9979 adp->ad_state |= UNDONE; 9980 } 9981 /* 9982 * The on-disk inode cannot claim to be any larger than the last 9983 * fragment that has been written. Otherwise, the on-disk inode 9984 * might have fragments that were not the last block in the file 9985 * which would corrupt the filesystem. 9986 */ 9987 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9988 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9989 if (adp->ad_offset >= NDADDR) 9990 break; 9991 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9992 /* keep going until hitting a rollback to a frag */ 9993 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9994 continue; 9995 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9996 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9997 #ifdef INVARIANTS 9998 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9999 panic("softdep_write_inodeblock: lost dep2"); 10000 #endif /* INVARIANTS */ 10001 dp->di_db[i] = 0; 10002 } 10003 for (i = 0; i < NIADDR; i++) { 10004 #ifdef INVARIANTS 10005 if (dp->di_ib[i] != 0 && 10006 (deplist & ((1 << NDADDR) << i)) == 0) 10007 panic("softdep_write_inodeblock: lost dep3"); 10008 #endif /* INVARIANTS */ 10009 dp->di_ib[i] = 0; 10010 } 10011 return; 10012 } 10013 /* 10014 * If we have zero'ed out the last allocated block of the file, 10015 * roll back the size to the last currently allocated block. 10016 * We know that this last allocated block is a full-sized as 10017 * we already checked for fragments in the loop above. 10018 */ 10019 if (lastadp != NULL && 10020 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 10021 for (i = lastadp->ad_offset; i >= 0; i--) 10022 if (dp->di_db[i] != 0) 10023 break; 10024 dp->di_size = (i + 1) * fs->fs_bsize; 10025 } 10026 /* 10027 * The only dependencies are for indirect blocks. 10028 * 10029 * The file size for indirect block additions is not guaranteed. 10030 * Such a guarantee would be non-trivial to achieve. The conventional 10031 * synchronous write implementation also does not make this guarantee. 10032 * Fsck should catch and fix discrepancies. Arguably, the file size 10033 * can be over-estimated without destroying integrity when the file 10034 * moves into the indirect blocks (i.e., is large). If we want to 10035 * postpone fsck, we are stuck with this argument. 
10036 */ 10037 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 10038 dp->di_ib[adp->ad_offset - NDADDR] = 0; 10039 } 10040 10041 /* 10042 * Cancel an indirdep as a result of truncation. Release all of the 10043 * children allocindirs and place their journal work on the appropriate 10044 * list. 10045 */ 10046 static void 10047 cancel_indirdep(indirdep, bp, freeblks) 10048 struct indirdep *indirdep; 10049 struct buf *bp; 10050 struct freeblks *freeblks; 10051 { 10052 struct allocindir *aip; 10053 10054 /* 10055 * None of the indirect pointers will ever be visible, 10056 * so they can simply be tossed. GOINGAWAY ensures 10057 * that allocated pointers will be saved in the buffer 10058 * cache until they are freed. Note that they will 10059 * only be able to be found by their physical address 10060 * since the inode mapping the logical address will 10061 * be gone. The save buffer used for the safe copy 10062 * was allocated in setup_allocindir_phase2 using 10063 * the physical address so it could be used for this 10064 * purpose. Hence we swap the safe copy with the real 10065 * copy, allowing the safe copy to be freed and holding 10066 * on to the real copy for later use in indir_trunc. 10067 */ 10068 if (indirdep->ir_state & GOINGAWAY) 10069 panic("cancel_indirdep: already gone"); 10070 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 10071 indirdep->ir_state |= DEPCOMPLETE; 10072 LIST_REMOVE(indirdep, ir_next); 10073 } 10074 indirdep->ir_state |= GOINGAWAY; 10075 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 10076 /* 10077 * Pass in bp for blocks still have journal writes 10078 * pending so we can cancel them on their own. 10079 */ 10080 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 10081 cancel_allocindir(aip, bp, freeblks, 0); 10082 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 10083 cancel_allocindir(aip, NULL, freeblks, 0); 10084 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 10085 cancel_allocindir(aip, NULL, freeblks, 0); 10086 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 10087 cancel_allocindir(aip, NULL, freeblks, 0); 10088 /* 10089 * If there are pending partial truncations we need to keep the 10090 * old block copy around until they complete. This is because 10091 * the current b_data is not a perfect superset of the available 10092 * blocks. 10093 */ 10094 if (TAILQ_EMPTY(&indirdep->ir_trunc)) 10095 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 10096 else 10097 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10098 WORKLIST_REMOVE(&indirdep->ir_list); 10099 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 10100 indirdep->ir_bp = NULL; 10101 indirdep->ir_freeblks = freeblks; 10102 } 10103 10104 /* 10105 * Free an indirdep once it no longer has new pointers to track. 
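 * Every allocindir list and the partial truncation list must already
 * have been drained by this point; the assertions below enforce that
 * before the work item is freed.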
10106 */ 10107 static void 10108 free_indirdep(indirdep) 10109 struct indirdep *indirdep; 10110 { 10111 10112 KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), 10113 ("free_indirdep: Indir trunc list not empty.")); 10114 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 10115 ("free_indirdep: Complete head not empty.")); 10116 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 10117 ("free_indirdep: write head not empty.")); 10118 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 10119 ("free_indirdep: done head not empty.")); 10120 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 10121 ("free_indirdep: deplist head not empty.")); 10122 KASSERT((indirdep->ir_state & DEPCOMPLETE), 10123 ("free_indirdep: %p still on newblk list.", indirdep)); 10124 KASSERT(indirdep->ir_saveddata == NULL, 10125 ("free_indirdep: %p still has saved data.", indirdep)); 10126 if (indirdep->ir_state & ONWORKLIST) 10127 WORKLIST_REMOVE(&indirdep->ir_list); 10128 WORKITEM_FREE(indirdep, D_INDIRDEP); 10129 } 10130 10131 /* 10132 * Called before a write to an indirdep. This routine is responsible for 10133 * rolling back pointers to a safe state which includes only those 10134 * allocindirs which have been completed. 10135 */ 10136 static void 10137 initiate_write_indirdep(indirdep, bp) 10138 struct indirdep *indirdep; 10139 struct buf *bp; 10140 { 10141 10142 indirdep->ir_state |= IOSTARTED; 10143 if (indirdep->ir_state & GOINGAWAY) 10144 panic("disk_io_initiation: indirdep gone"); 10145 /* 10146 * If there are no remaining dependencies, this will be writing 10147 * the real pointers. 10148 */ 10149 if (LIST_EMPTY(&indirdep->ir_deplisthd) && 10150 TAILQ_EMPTY(&indirdep->ir_trunc)) 10151 return; 10152 /* 10153 * Replace up-to-date version with safe version. 10154 */ 10155 if (indirdep->ir_saveddata == NULL) { 10156 FREE_LOCK(&lk); 10157 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 10158 M_SOFTDEP_FLAGS); 10159 ACQUIRE_LOCK(&lk); 10160 } 10161 indirdep->ir_state &= ~ATTACHED; 10162 indirdep->ir_state |= UNDONE; 10163 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10164 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 10165 bp->b_bcount); 10166 } 10167 10168 /* 10169 * Called when an inode has been cleared in a cg bitmap. This finally 10170 * eliminates any canceled jaddrefs 10171 */ 10172 void 10173 softdep_setup_inofree(mp, bp, ino, wkhd) 10174 struct mount *mp; 10175 struct buf *bp; 10176 ino_t ino; 10177 struct workhead *wkhd; 10178 { 10179 struct worklist *wk, *wkn; 10180 struct inodedep *inodedep; 10181 uint8_t *inosused; 10182 struct cg *cgp; 10183 struct fs *fs; 10184 10185 ACQUIRE_LOCK(&lk); 10186 fs = VFSTOUFS(mp)->um_fs; 10187 cgp = (struct cg *)bp->b_data; 10188 inosused = cg_inosused(cgp); 10189 if (isset(inosused, ino % fs->fs_ipg)) 10190 panic("softdep_setup_inofree: inode %d not freed.", ino); 10191 if (inodedep_lookup(mp, ino, 0, &inodedep)) 10192 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 10193 ino, inodedep); 10194 if (wkhd) { 10195 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 10196 if (wk->wk_type != D_JADDREF) 10197 continue; 10198 WORKLIST_REMOVE(wk); 10199 /* 10200 * We can free immediately even if the jaddref 10201 * isn't attached in a background write as now 10202 * the bitmaps are reconciled. 10203 */ 10204 wk->wk_state |= COMPLETE | ATTACHED; 10205 free_jaddref(WK_JADDREF(wk)); 10206 } 10207 jwork_move(&bp->b_dep, wkhd); 10208 } 10209 FREE_LOCK(&lk); 10210 } 10211 10212 10213 /* 10214 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 10215 * map. 
Any dependencies waiting for the write to clear are added to the 10216 * buf's list and any jnewblks that are being canceled are discarded 10217 * immediately. 10218 */ 10219 void 10220 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 10221 struct mount *mp; 10222 struct buf *bp; 10223 ufs2_daddr_t blkno; 10224 int frags; 10225 struct workhead *wkhd; 10226 { 10227 struct bmsafemap *bmsafemap; 10228 struct jnewblk *jnewblk; 10229 struct worklist *wk; 10230 struct fs *fs; 10231 #ifdef SUJ_DEBUG 10232 uint8_t *blksfree; 10233 struct cg *cgp; 10234 ufs2_daddr_t jstart; 10235 ufs2_daddr_t jend; 10236 ufs2_daddr_t end; 10237 long bno; 10238 int i; 10239 #endif 10240 10241 ACQUIRE_LOCK(&lk); 10242 /* Lookup the bmsafemap so we track when it is dirty. */ 10243 fs = VFSTOUFS(mp)->um_fs; 10244 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10245 /* 10246 * Detach any jnewblks which have been canceled. They must linger 10247 * until the bitmap is cleared again by ffs_blkfree() to prevent 10248 * an unjournaled allocation from hitting the disk. 10249 */ 10250 if (wkhd) { 10251 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10252 WORKLIST_REMOVE(wk); 10253 if (wk->wk_type != D_JNEWBLK) { 10254 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); 10255 continue; 10256 } 10257 jnewblk = WK_JNEWBLK(wk); 10258 KASSERT(jnewblk->jn_state & GOINGAWAY, 10259 ("softdep_setup_blkfree: jnewblk not canceled.")); 10260 #ifdef SUJ_DEBUG 10261 /* 10262 * Assert that this block is free in the bitmap 10263 * before we discard the jnewblk. 10264 */ 10265 cgp = (struct cg *)bp->b_data; 10266 blksfree = cg_blksfree(cgp); 10267 bno = dtogd(fs, jnewblk->jn_blkno); 10268 for (i = jnewblk->jn_oldfrags; 10269 i < jnewblk->jn_frags; i++) { 10270 if (isset(blksfree, bno + i)) 10271 continue; 10272 panic("softdep_setup_blkfree: not free"); 10273 } 10274 #endif 10275 /* 10276 * Even if it's not attached we can free immediately 10277 * as the new bitmap is correct. 10278 */ 10279 wk->wk_state |= COMPLETE | ATTACHED; 10280 free_jnewblk(jnewblk); 10281 } 10282 } 10283 10284 #ifdef SUJ_DEBUG 10285 /* 10286 * Assert that we are not freeing a block which has an outstanding 10287 * allocation dependency. 10288 */ 10289 fs = VFSTOUFS(mp)->um_fs; 10290 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10291 end = blkno + frags; 10292 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10293 /* 10294 * Don't match against blocks that will be freed when the 10295 * background write is done. 10296 */ 10297 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 10298 (COMPLETE | DEPCOMPLETE)) 10299 continue; 10300 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 10301 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 10302 if ((blkno >= jstart && blkno < jend) || 10303 (end > jstart && end <= jend)) { 10304 printf("state 0x%X %jd - %d %d dep %p\n", 10305 jnewblk->jn_state, jnewblk->jn_blkno, 10306 jnewblk->jn_oldfrags, jnewblk->jn_frags, 10307 jnewblk->jn_dep); 10308 panic("softdep_setup_blkfree: " 10309 "%jd-%jd(%d) overlaps with %jd-%jd", 10310 blkno, end, frags, jstart, jend); 10311 } 10312 } 10313 #endif 10314 FREE_LOCK(&lk); 10315 } 10316 10317 /* 10318 * Revert a block allocation when the journal record that describes it 10319 * is not yet written. 
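 * The caller supplies the blksfree map of the buffer actually being
 * written, which may be a stale background copy; only fragments still
 * marked allocated there are returned to the map, and the count of
 * reverted fragments is passed back so the caller can tell whether a
 * rollback took place.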
10320 */ 10321 int 10322 jnewblk_rollback(jnewblk, fs, cgp, blksfree) 10323 struct jnewblk *jnewblk; 10324 struct fs *fs; 10325 struct cg *cgp; 10326 uint8_t *blksfree; 10327 { 10328 ufs1_daddr_t fragno; 10329 long cgbno, bbase; 10330 int frags, blk; 10331 int i; 10332 10333 frags = 0; 10334 cgbno = dtogd(fs, jnewblk->jn_blkno); 10335 /* 10336 * We have to test which frags need to be rolled back. We may 10337 * be operating on a stale copy when doing background writes. 10338 */ 10339 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 10340 if (isclr(blksfree, cgbno + i)) 10341 frags++; 10342 if (frags == 0) 10343 return (0); 10344 /* 10345 * This is mostly ffs_blkfree() sans some validation and 10346 * superblock updates. 10347 */ 10348 if (frags == fs->fs_frag) { 10349 fragno = fragstoblks(fs, cgbno); 10350 ffs_setblock(fs, blksfree, fragno); 10351 ffs_clusteracct(fs, cgp, fragno, 1); 10352 cgp->cg_cs.cs_nbfree++; 10353 } else { 10354 cgbno += jnewblk->jn_oldfrags; 10355 bbase = cgbno - fragnum(fs, cgbno); 10356 /* Decrement the old frags. */ 10357 blk = blkmap(fs, blksfree, bbase); 10358 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 10359 /* Deallocate the fragment */ 10360 for (i = 0; i < frags; i++) 10361 setbit(blksfree, cgbno + i); 10362 cgp->cg_cs.cs_nffree += frags; 10363 /* Add back in counts associated with the new frags */ 10364 blk = blkmap(fs, blksfree, bbase); 10365 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 10366 /* If a complete block has been reassembled, account for it. */ 10367 fragno = fragstoblks(fs, bbase); 10368 if (ffs_isblock(fs, blksfree, fragno)) { 10369 cgp->cg_cs.cs_nffree -= fs->fs_frag; 10370 ffs_clusteracct(fs, cgp, fragno, 1); 10371 cgp->cg_cs.cs_nbfree++; 10372 } 10373 } 10374 stat_jnewblk++; 10375 jnewblk->jn_state &= ~ATTACHED; 10376 jnewblk->jn_state |= UNDONE; 10377 10378 return (frags); 10379 } 10380 10381 static void 10382 initiate_write_bmsafemap(bmsafemap, bp) 10383 struct bmsafemap *bmsafemap; 10384 struct buf *bp; /* The cg block. */ 10385 { 10386 struct jaddref *jaddref; 10387 struct jnewblk *jnewblk; 10388 uint8_t *inosused; 10389 uint8_t *blksfree; 10390 struct cg *cgp; 10391 struct fs *fs; 10392 ino_t ino; 10393 10394 if (bmsafemap->sm_state & IOSTARTED) 10395 panic("initiate_write_bmsafemap: Already started\n"); 10396 bmsafemap->sm_state |= IOSTARTED; 10397 /* 10398 * Clear any inode allocations which are pending journal writes. 10399 */ 10400 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 10401 cgp = (struct cg *)bp->b_data; 10402 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10403 inosused = cg_inosused(cgp); 10404 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 10405 ino = jaddref->ja_ino % fs->fs_ipg; 10406 /* 10407 * If this is a background copy the inode may not 10408 * be marked used yet. 10409 */ 10410 if (isset(inosused, ino)) { 10411 if ((jaddref->ja_mode & IFMT) == IFDIR) 10412 cgp->cg_cs.cs_ndir--; 10413 cgp->cg_cs.cs_nifree++; 10414 clrbit(inosused, ino); 10415 jaddref->ja_state &= ~ATTACHED; 10416 jaddref->ja_state |= UNDONE; 10417 stat_jaddref++; 10418 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10419 panic("initiate_write_bmsafemap: inode %d " 10420 "marked free", jaddref->ja_ino); 10421 } 10422 } 10423 /* 10424 * Clear any block allocations which are pending journal writes. 
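 * As with the inode rollbacks above, finding nothing to undo on a
 * buffer that is not a background copy means a dependency was lost,
 * so we panic.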
10425 */ 10426 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 10427 cgp = (struct cg *)bp->b_data; 10428 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10429 blksfree = cg_blksfree(cgp); 10430 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10431 if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) 10432 continue; 10433 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10434 panic("initiate_write_bmsafemap: block %jd " 10435 "marked free", jnewblk->jn_blkno); 10436 } 10437 } 10438 /* 10439 * Move allocation lists to the written lists so they can be 10440 * cleared once the block write is complete. 10441 */ 10442 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 10443 inodedep, id_deps); 10444 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 10445 newblk, nb_deps); 10446 LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, 10447 wk_list); 10448 } 10449 10450 /* 10451 * This routine is called during the completion interrupt 10452 * service routine for a disk write (from the procedure called 10453 * by the device driver to inform the filesystem caches of 10454 * a request completion). It should be called early in this 10455 * procedure, before the block is made available to other 10456 * processes or other routines are called. 10457 * 10458 */ 10459 static void 10460 softdep_disk_write_complete(bp) 10461 struct buf *bp; /* describes the completed disk write */ 10462 { 10463 struct worklist *wk; 10464 struct worklist *owk; 10465 struct workhead reattach; 10466 struct freeblks *freeblks; 10467 struct buf *sbp; 10468 10469 /* 10470 * If an error occurred while doing the write, then the data 10471 * has not hit the disk and the dependencies cannot be unrolled. 10472 */ 10473 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 10474 return; 10475 LIST_INIT(&reattach); 10476 /* 10477 * This lock must not be released anywhere in this code segment. 
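 * Work items that cannot be completed yet are collected on the local
 * reattach list and hung back onto b_dep before the lock is finally
 * dropped.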
10478 */ 10479 sbp = NULL; 10480 owk = NULL; 10481 ACQUIRE_LOCK(&lk); 10482 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 10483 WORKLIST_REMOVE(wk); 10484 dep_write[wk->wk_type]++; 10485 if (wk == owk) 10486 panic("duplicate worklist: %p\n", wk); 10487 owk = wk; 10488 switch (wk->wk_type) { 10489 10490 case D_PAGEDEP: 10491 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 10492 WORKLIST_INSERT(&reattach, wk); 10493 continue; 10494 10495 case D_INODEDEP: 10496 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 10497 WORKLIST_INSERT(&reattach, wk); 10498 continue; 10499 10500 case D_BMSAFEMAP: 10501 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 10502 WORKLIST_INSERT(&reattach, wk); 10503 continue; 10504 10505 case D_MKDIR: 10506 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 10507 continue; 10508 10509 case D_ALLOCDIRECT: 10510 wk->wk_state |= COMPLETE; 10511 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 10512 continue; 10513 10514 case D_ALLOCINDIR: 10515 wk->wk_state |= COMPLETE; 10516 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 10517 continue; 10518 10519 case D_INDIRDEP: 10520 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 10521 WORKLIST_INSERT(&reattach, wk); 10522 continue; 10523 10524 case D_FREEBLKS: 10525 wk->wk_state |= COMPLETE; 10526 freeblks = WK_FREEBLKS(wk); 10527 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && 10528 LIST_EMPTY(&freeblks->fb_jblkdephd)) 10529 add_to_worklist(wk, WK_NODELAY); 10530 continue; 10531 10532 case D_FREEWORK: 10533 handle_written_freework(WK_FREEWORK(wk)); 10534 break; 10535 10536 case D_JSEGDEP: 10537 free_jsegdep(WK_JSEGDEP(wk)); 10538 continue; 10539 10540 case D_JSEG: 10541 handle_written_jseg(WK_JSEG(wk), bp); 10542 continue; 10543 10544 case D_SBDEP: 10545 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 10546 WORKLIST_INSERT(&reattach, wk); 10547 continue; 10548 10549 case D_FREEDEP: 10550 free_freedep(WK_FREEDEP(wk)); 10551 continue; 10552 10553 default: 10554 panic("handle_disk_write_complete: Unknown type %s", 10555 TYPENAME(wk->wk_type)); 10556 /* NOTREACHED */ 10557 } 10558 } 10559 /* 10560 * Reattach any requests that must be redone. 10561 */ 10562 while ((wk = LIST_FIRST(&reattach)) != NULL) { 10563 WORKLIST_REMOVE(wk); 10564 WORKLIST_INSERT(&bp->b_dep, wk); 10565 } 10566 FREE_LOCK(&lk); 10567 if (sbp) 10568 brelse(sbp); 10569 } 10570 10571 /* 10572 * Called from within softdep_disk_write_complete above. Note that 10573 * this routine is always called from interrupt level with further 10574 * splbio interrupts blocked. 10575 */ 10576 static void 10577 handle_allocdirect_partdone(adp, wkhd) 10578 struct allocdirect *adp; /* the completed allocdirect */ 10579 struct workhead *wkhd; /* Work to do when inode is writtne. */ 10580 { 10581 struct allocdirectlst *listhead; 10582 struct allocdirect *listadp; 10583 struct inodedep *inodedep; 10584 long bsize; 10585 10586 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10587 return; 10588 /* 10589 * The on-disk inode cannot claim to be any larger than the last 10590 * fragment that has been written. Otherwise, the on-disk inode 10591 * might have fragments that were not the last block in the file 10592 * which would corrupt the filesystem. Thus, we cannot free any 10593 * allocdirects after one whose ad_oldblkno claims a fragment as 10594 * these blocks must be rolled back to zero before writing the inode. 10595 * We check the currently active set of allocdirects in id_inoupdt 10596 * or id_extupdt as appropriate. 
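 * Put another way, an allocdirect may be retired only when every
 * allocdirect ahead of it on the list describes a full-sized block;
 * a preceding fragment forces us to wait for a later inode write.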
10597 */ 10598 inodedep = adp->ad_inodedep; 10599 bsize = inodedep->id_fs->fs_bsize; 10600 if (adp->ad_state & EXTDATA) 10601 listhead = &inodedep->id_extupdt; 10602 else 10603 listhead = &inodedep->id_inoupdt; 10604 TAILQ_FOREACH(listadp, listhead, ad_next) { 10605 /* found our block */ 10606 if (listadp == adp) 10607 break; 10608 /* continue if ad_oldlbn is not a fragment */ 10609 if (listadp->ad_oldsize == 0 || 10610 listadp->ad_oldsize == bsize) 10611 continue; 10612 /* hit a fragment */ 10613 return; 10614 } 10615 /* 10616 * If we have reached the end of the current list without 10617 * finding the just finished dependency, then it must be 10618 * on the future dependency list. Future dependencies cannot 10619 * be freed until they are moved to the current list. 10620 */ 10621 if (listadp == NULL) { 10622 #ifdef DEBUG 10623 if (adp->ad_state & EXTDATA) 10624 listhead = &inodedep->id_newextupdt; 10625 else 10626 listhead = &inodedep->id_newinoupdt; 10627 TAILQ_FOREACH(listadp, listhead, ad_next) 10628 /* found our block */ 10629 if (listadp == adp) 10630 break; 10631 if (listadp == NULL) 10632 panic("handle_allocdirect_partdone: lost dep"); 10633 #endif /* DEBUG */ 10634 return; 10635 } 10636 /* 10637 * If we have found the just finished dependency, then queue 10638 * it along with anything that follows it that is complete. 10639 * Since the pointer has not yet been written in the inode 10640 * as the dependency prevents it, place the allocdirect on the 10641 * bufwait list where it will be freed once the pointer is 10642 * valid. 10643 */ 10644 if (wkhd == NULL) 10645 wkhd = &inodedep->id_bufwait; 10646 for (; adp; adp = listadp) { 10647 listadp = TAILQ_NEXT(adp, ad_next); 10648 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10649 return; 10650 TAILQ_REMOVE(listhead, adp, ad_next); 10651 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 10652 } 10653 } 10654 10655 /* 10656 * Called from within softdep_disk_write_complete above. This routine 10657 * completes successfully written allocindirs. 10658 */ 10659 static void 10660 handle_allocindir_partdone(aip) 10661 struct allocindir *aip; /* the completed allocindir */ 10662 { 10663 struct indirdep *indirdep; 10664 10665 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 10666 return; 10667 indirdep = aip->ai_indirdep; 10668 LIST_REMOVE(aip, ai_next); 10669 /* 10670 * Don't set a pointer while the buffer is undergoing IO or while 10671 * we have active truncations. 10672 */ 10673 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 10674 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 10675 return; 10676 } 10677 if (indirdep->ir_state & UFS1FMT) 10678 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10679 aip->ai_newblkno; 10680 else 10681 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10682 aip->ai_newblkno; 10683 /* 10684 * Await the pointer write before freeing the allocindir. 10685 */ 10686 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 10687 } 10688 10689 /* 10690 * Release segments held on a jwork list. 
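 * The items found here are jsegdeps, freedeps, freefrags and freeworks
 * whose release was typically deferred until the write they were
 * queued behind completed.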
10691 */ 10692 static void 10693 handle_jwork(wkhd) 10694 struct workhead *wkhd; 10695 { 10696 struct worklist *wk; 10697 10698 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10699 WORKLIST_REMOVE(wk); 10700 switch (wk->wk_type) { 10701 case D_JSEGDEP: 10702 free_jsegdep(WK_JSEGDEP(wk)); 10703 continue; 10704 case D_FREEDEP: 10705 free_freedep(WK_FREEDEP(wk)); 10706 continue; 10707 case D_FREEFRAG: 10708 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 10709 WORKITEM_FREE(wk, D_FREEFRAG); 10710 case D_FREEWORK: 10711 handle_written_freework(WK_FREEWORK(wk)); 10712 continue; 10713 default: 10714 panic("handle_jwork: Unknown type %s\n", 10715 TYPENAME(wk->wk_type)); 10716 } 10717 } 10718 } 10719 10720 /* 10721 * Handle the bufwait list on an inode when it is safe to release items 10722 * held there. This normally happens after an inode block is written but 10723 * may be delayed and handled later if there are pending journal items that 10724 * are not yet safe to be released. 10725 */ 10726 static struct freefile * 10727 handle_bufwait(inodedep, refhd) 10728 struct inodedep *inodedep; 10729 struct workhead *refhd; 10730 { 10731 struct jaddref *jaddref; 10732 struct freefile *freefile; 10733 struct worklist *wk; 10734 10735 freefile = NULL; 10736 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 10737 WORKLIST_REMOVE(wk); 10738 switch (wk->wk_type) { 10739 case D_FREEFILE: 10740 /* 10741 * We defer adding freefile to the worklist 10742 * until all other additions have been made to 10743 * ensure that it will be done after all the 10744 * old blocks have been freed. 10745 */ 10746 if (freefile != NULL) 10747 panic("handle_bufwait: freefile"); 10748 freefile = WK_FREEFILE(wk); 10749 continue; 10750 10751 case D_MKDIR: 10752 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 10753 continue; 10754 10755 case D_DIRADD: 10756 diradd_inode_written(WK_DIRADD(wk), inodedep); 10757 continue; 10758 10759 case D_FREEFRAG: 10760 wk->wk_state |= COMPLETE; 10761 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 10762 add_to_worklist(wk, 0); 10763 continue; 10764 10765 case D_DIRREM: 10766 wk->wk_state |= COMPLETE; 10767 add_to_worklist(wk, 0); 10768 continue; 10769 10770 case D_ALLOCDIRECT: 10771 case D_ALLOCINDIR: 10772 free_newblk(WK_NEWBLK(wk)); 10773 continue; 10774 10775 case D_JNEWBLK: 10776 wk->wk_state |= COMPLETE; 10777 free_jnewblk(WK_JNEWBLK(wk)); 10778 continue; 10779 10780 /* 10781 * Save freed journal segments and add references on 10782 * the supplied list which will delay their release 10783 * until the cg bitmap is cleared on disk. 10784 */ 10785 case D_JSEGDEP: 10786 if (refhd == NULL) 10787 free_jsegdep(WK_JSEGDEP(wk)); 10788 else 10789 WORKLIST_INSERT(refhd, wk); 10790 continue; 10791 10792 case D_JADDREF: 10793 jaddref = WK_JADDREF(wk); 10794 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 10795 if_deps); 10796 /* 10797 * Transfer any jaddrefs to the list to be freed with 10798 * the bitmap if we're handling a removed file. 10799 */ 10800 if (refhd == NULL) { 10801 wk->wk_state |= COMPLETE; 10802 free_jaddref(jaddref); 10803 } else 10804 WORKLIST_INSERT(refhd, wk); 10805 continue; 10806 10807 default: 10808 panic("handle_bufwait: Unknown type %p(%s)", 10809 wk, TYPENAME(wk->wk_type)); 10810 /* NOTREACHED */ 10811 } 10812 } 10813 return (freefile); 10814 } 10815 /* 10816 * Called from within softdep_disk_write_complete above to restore 10817 * in-memory inode block contents to their most up-to-date state. 
Note 10818 * that this routine is always called from interrupt level with further 10819 * splbio interrupts blocked. 10820 */ 10821 static int 10822 handle_written_inodeblock(inodedep, bp) 10823 struct inodedep *inodedep; 10824 struct buf *bp; /* buffer containing the inode block */ 10825 { 10826 struct freefile *freefile; 10827 struct allocdirect *adp, *nextadp; 10828 struct ufs1_dinode *dp1 = NULL; 10829 struct ufs2_dinode *dp2 = NULL; 10830 struct workhead wkhd; 10831 int hadchanges, fstype; 10832 ino_t freelink; 10833 10834 LIST_INIT(&wkhd); 10835 hadchanges = 0; 10836 freefile = NULL; 10837 if ((inodedep->id_state & IOSTARTED) == 0) 10838 panic("handle_written_inodeblock: not started"); 10839 inodedep->id_state &= ~IOSTARTED; 10840 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 10841 fstype = UFS1; 10842 dp1 = (struct ufs1_dinode *)bp->b_data + 10843 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10844 freelink = dp1->di_freelink; 10845 } else { 10846 fstype = UFS2; 10847 dp2 = (struct ufs2_dinode *)bp->b_data + 10848 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10849 freelink = dp2->di_freelink; 10850 } 10851 /* 10852 * If we wrote a valid freelink pointer during the last write 10853 * record it here. 10854 */ 10855 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10856 struct inodedep *inon; 10857 10858 inon = TAILQ_NEXT(inodedep, id_unlinked); 10859 if ((inon == NULL && freelink == 0) || 10860 (inon && inon->id_ino == freelink)) { 10861 if (inon) 10862 inon->id_state |= UNLINKPREV; 10863 inodedep->id_state |= UNLINKNEXT; 10864 } else 10865 hadchanges = 1; 10866 } 10867 /* Leave this inodeblock dirty until it's in the list. */ 10868 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 10869 hadchanges = 1; 10870 /* 10871 * If we had to rollback the inode allocation because of 10872 * bitmaps being incomplete, then simply restore it. 10873 * Keep the block dirty so that it will not be reclaimed until 10874 * all associated dependencies have been cleared and the 10875 * corresponding updates written to disk. 10876 */ 10877 if (inodedep->id_savedino1 != NULL) { 10878 hadchanges = 1; 10879 if (fstype == UFS1) 10880 *dp1 = *inodedep->id_savedino1; 10881 else 10882 *dp2 = *inodedep->id_savedino2; 10883 free(inodedep->id_savedino1, M_SAVEDINO); 10884 inodedep->id_savedino1 = NULL; 10885 if ((bp->b_flags & B_DELWRI) == 0) 10886 stat_inode_bitmap++; 10887 bdirty(bp); 10888 /* 10889 * If the inode is clear here and GOINGAWAY it will never 10890 * be written. Process the bufwait and clear any pending 10891 * work which may include the freefile. 10892 */ 10893 if (inodedep->id_state & GOINGAWAY) 10894 goto bufwait; 10895 return (1); 10896 } 10897 inodedep->id_state |= COMPLETE; 10898 /* 10899 * Roll forward anything that had to be rolled back before 10900 * the inode could be updated. 
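 * Each allocdirect on id_inoupdt (and on id_extupdt for extended
 * attributes) has its new block number written into the appropriate
 * direct or indirect pointer of the on-disk inode and is then marked
 * ATTACHED.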
10901 */ 10902 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 10903 nextadp = TAILQ_NEXT(adp, ad_next); 10904 if (adp->ad_state & ATTACHED) 10905 panic("handle_written_inodeblock: new entry"); 10906 if (fstype == UFS1) { 10907 if (adp->ad_offset < NDADDR) { 10908 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10909 panic("%s %s #%jd mismatch %d != %jd", 10910 "handle_written_inodeblock:", 10911 "direct pointer", 10912 (intmax_t)adp->ad_offset, 10913 dp1->di_db[adp->ad_offset], 10914 (intmax_t)adp->ad_oldblkno); 10915 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 10916 } else { 10917 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 10918 panic("%s: %s #%jd allocated as %d", 10919 "handle_written_inodeblock", 10920 "indirect pointer", 10921 (intmax_t)adp->ad_offset - NDADDR, 10922 dp1->di_ib[adp->ad_offset - NDADDR]); 10923 dp1->di_ib[adp->ad_offset - NDADDR] = 10924 adp->ad_newblkno; 10925 } 10926 } else { 10927 if (adp->ad_offset < NDADDR) { 10928 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10929 panic("%s: %s #%jd %s %jd != %jd", 10930 "handle_written_inodeblock", 10931 "direct pointer", 10932 (intmax_t)adp->ad_offset, "mismatch", 10933 (intmax_t)dp2->di_db[adp->ad_offset], 10934 (intmax_t)adp->ad_oldblkno); 10935 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 10936 } else { 10937 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 10938 panic("%s: %s #%jd allocated as %jd", 10939 "handle_written_inodeblock", 10940 "indirect pointer", 10941 (intmax_t)adp->ad_offset - NDADDR, 10942 (intmax_t) 10943 dp2->di_ib[adp->ad_offset - NDADDR]); 10944 dp2->di_ib[adp->ad_offset - NDADDR] = 10945 adp->ad_newblkno; 10946 } 10947 } 10948 adp->ad_state &= ~UNDONE; 10949 adp->ad_state |= ATTACHED; 10950 hadchanges = 1; 10951 } 10952 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 10953 nextadp = TAILQ_NEXT(adp, ad_next); 10954 if (adp->ad_state & ATTACHED) 10955 panic("handle_written_inodeblock: new entry"); 10956 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 10957 panic("%s: direct pointers #%jd %s %jd != %jd", 10958 "handle_written_inodeblock", 10959 (intmax_t)adp->ad_offset, "mismatch", 10960 (intmax_t)dp2->di_extb[adp->ad_offset], 10961 (intmax_t)adp->ad_oldblkno); 10962 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 10963 adp->ad_state &= ~UNDONE; 10964 adp->ad_state |= ATTACHED; 10965 hadchanges = 1; 10966 } 10967 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 10968 stat_direct_blk_ptrs++; 10969 /* 10970 * Reset the file size to its most up-to-date value. 
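 * The link count and sizes saved when the write was started are
 * copied back into the dinode; any difference from what is on disk
 * counts as a change so the buffer is redirtied below.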
10971 */ 10972 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 10973 panic("handle_written_inodeblock: bad size"); 10974 if (inodedep->id_savednlink > LINK_MAX) 10975 panic("handle_written_inodeblock: Invalid link count " 10976 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 10977 if (fstype == UFS1) { 10978 if (dp1->di_nlink != inodedep->id_savednlink) { 10979 dp1->di_nlink = inodedep->id_savednlink; 10980 hadchanges = 1; 10981 } 10982 if (dp1->di_size != inodedep->id_savedsize) { 10983 dp1->di_size = inodedep->id_savedsize; 10984 hadchanges = 1; 10985 } 10986 } else { 10987 if (dp2->di_nlink != inodedep->id_savednlink) { 10988 dp2->di_nlink = inodedep->id_savednlink; 10989 hadchanges = 1; 10990 } 10991 if (dp2->di_size != inodedep->id_savedsize) { 10992 dp2->di_size = inodedep->id_savedsize; 10993 hadchanges = 1; 10994 } 10995 if (dp2->di_extsize != inodedep->id_savedextsize) { 10996 dp2->di_extsize = inodedep->id_savedextsize; 10997 hadchanges = 1; 10998 } 10999 } 11000 inodedep->id_savedsize = -1; 11001 inodedep->id_savedextsize = -1; 11002 inodedep->id_savednlink = -1; 11003 /* 11004 * If there were any rollbacks in the inode block, then it must be 11005 * marked dirty so that its will eventually get written back in 11006 * its correct form. 11007 */ 11008 if (hadchanges) 11009 bdirty(bp); 11010 bufwait: 11011 /* 11012 * Process any allocdirects that completed during the update. 11013 */ 11014 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 11015 handle_allocdirect_partdone(adp, &wkhd); 11016 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 11017 handle_allocdirect_partdone(adp, &wkhd); 11018 /* 11019 * Process deallocations that were held pending until the 11020 * inode had been written to disk. Freeing of the inode 11021 * is delayed until after all blocks have been freed to 11022 * avoid creation of new <vfsid, inum, lbn> triples 11023 * before the old ones have been deleted. Completely 11024 * unlinked inodes are not processed until the unlinked 11025 * inode list is written or the last reference is removed. 11026 */ 11027 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 11028 freefile = handle_bufwait(inodedep, NULL); 11029 if (freefile && !LIST_EMPTY(&wkhd)) { 11030 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 11031 freefile = NULL; 11032 } 11033 } 11034 /* 11035 * Move rolled forward dependency completions to the bufwait list 11036 * now that those that were already written have been processed. 11037 */ 11038 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 11039 panic("handle_written_inodeblock: bufwait but no changes"); 11040 jwork_move(&inodedep->id_bufwait, &wkhd); 11041 11042 if (freefile != NULL) { 11043 /* 11044 * If the inode is goingaway it was never written. Fake up 11045 * the state here so free_inodedep() can succeed. 11046 */ 11047 if (inodedep->id_state & GOINGAWAY) 11048 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 11049 if (free_inodedep(inodedep) == 0) 11050 panic("handle_written_inodeblock: live inodedep %p", 11051 inodedep); 11052 add_to_worklist(&freefile->fx_list, 0); 11053 return (0); 11054 } 11055 11056 /* 11057 * If no outstanding dependencies, free it. 
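 * A non-zero return tells handle_disk_write_complete() to reattach
 * the inodedep to the buffer so that any rollbacks are rewritten.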
11058 */ 11059 if (free_inodedep(inodedep) || 11060 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 11061 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 11062 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 11063 LIST_FIRST(&inodedep->id_bufwait) == 0)) 11064 return (0); 11065 return (hadchanges); 11066 } 11067 11068 static int 11069 handle_written_indirdep(indirdep, bp, bpp) 11070 struct indirdep *indirdep; 11071 struct buf *bp; 11072 struct buf **bpp; 11073 { 11074 struct allocindir *aip; 11075 struct buf *sbp; 11076 int chgs; 11077 11078 if (indirdep->ir_state & GOINGAWAY) 11079 panic("handle_written_indirdep: indirdep gone"); 11080 if ((indirdep->ir_state & IOSTARTED) == 0) 11081 panic("handle_written_indirdep: IO not started"); 11082 chgs = 0; 11083 /* 11084 * If there were rollbacks revert them here. 11085 */ 11086 if (indirdep->ir_saveddata) { 11087 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 11088 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11089 free(indirdep->ir_saveddata, M_INDIRDEP); 11090 indirdep->ir_saveddata = NULL; 11091 } 11092 chgs = 1; 11093 } 11094 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 11095 indirdep->ir_state |= ATTACHED; 11096 /* 11097 * Move allocindirs with written pointers to the completehd if 11098 * the indirdep's pointer is not yet written. Otherwise 11099 * free them here. 11100 */ 11101 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 11102 LIST_REMOVE(aip, ai_next); 11103 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11104 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 11105 ai_next); 11106 newblk_freefrag(&aip->ai_block); 11107 continue; 11108 } 11109 free_newblk(&aip->ai_block); 11110 } 11111 /* 11112 * Move allocindirs that have finished dependency processing from 11113 * the done list to the write list after updating the pointers. 11114 */ 11115 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11116 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 11117 handle_allocindir_partdone(aip); 11118 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 11119 panic("disk_write_complete: not gone"); 11120 chgs = 1; 11121 } 11122 } 11123 /* 11124 * Preserve the indirdep if there were any changes or if it is not 11125 * yet valid on disk. 11126 */ 11127 if (chgs) { 11128 stat_indir_blk_ptrs++; 11129 bdirty(bp); 11130 return (1); 11131 } 11132 /* 11133 * If there were no changes we can discard the savedbp and detach 11134 * ourselves from the buf. We are only carrying completed pointers 11135 * in this case. 11136 */ 11137 sbp = indirdep->ir_savebp; 11138 sbp->b_flags |= B_INVAL | B_NOCACHE; 11139 indirdep->ir_savebp = NULL; 11140 indirdep->ir_bp = NULL; 11141 if (*bpp != NULL) 11142 panic("handle_written_indirdep: bp already exists."); 11143 *bpp = sbp; 11144 /* 11145 * The indirdep may not be freed until its parent points at it. 11146 */ 11147 if (indirdep->ir_state & DEPCOMPLETE) 11148 free_indirdep(indirdep); 11149 11150 return (0); 11151 } 11152 11153 /* 11154 * Process a diradd entry after its dependent inode has been written. 11155 * This routine must be called with splbio interrupts blocked. 11156 */ 11157 static void 11158 diradd_inode_written(dap, inodedep) 11159 struct diradd *dap; 11160 struct inodedep *inodedep; 11161 { 11162 11163 dap->da_state |= COMPLETE; 11164 complete_diradd(dap); 11165 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 11166 } 11167 11168 /* 11169 * Returns true if the bmsafemap will have rollbacks when written. Must 11170 * only be called with lk and the buf lock on the cg held. 
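 * Rollbacks are pending whenever unwritten jaddref or jnewblk
 * records are still attached to the cylinder group.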
11171 */ 11172 static int 11173 bmsafemap_rollbacks(bmsafemap) 11174 struct bmsafemap *bmsafemap; 11175 { 11176 11177 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 11178 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 11179 } 11180 11181 /* 11182 * Re-apply an allocation when a cg write is complete. 11183 */ 11184 static int 11185 jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 11186 struct jnewblk *jnewblk; 11187 struct fs *fs; 11188 struct cg *cgp; 11189 uint8_t *blksfree; 11190 { 11191 ufs1_daddr_t fragno; 11192 ufs2_daddr_t blkno; 11193 long cgbno, bbase; 11194 int frags, blk; 11195 int i; 11196 11197 frags = 0; 11198 cgbno = dtogd(fs, jnewblk->jn_blkno); 11199 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 11200 if (isclr(blksfree, cgbno + i)) 11201 panic("jnewblk_rollforward: re-allocated fragment"); 11202 frags++; 11203 } 11204 if (frags == fs->fs_frag) { 11205 blkno = fragstoblks(fs, cgbno); 11206 ffs_clrblock(fs, blksfree, (long)blkno); 11207 ffs_clusteracct(fs, cgp, blkno, -1); 11208 cgp->cg_cs.cs_nbfree--; 11209 } else { 11210 bbase = cgbno - fragnum(fs, cgbno); 11211 cgbno += jnewblk->jn_oldfrags; 11212 /* If a complete block had been reassembled, account for it. */ 11213 fragno = fragstoblks(fs, bbase); 11214 if (ffs_isblock(fs, blksfree, fragno)) { 11215 cgp->cg_cs.cs_nffree += fs->fs_frag; 11216 ffs_clusteracct(fs, cgp, fragno, -1); 11217 cgp->cg_cs.cs_nbfree--; 11218 } 11219 /* Decrement the old frags. */ 11220 blk = blkmap(fs, blksfree, bbase); 11221 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11222 /* Allocate the fragment */ 11223 for (i = 0; i < frags; i++) 11224 clrbit(blksfree, cgbno + i); 11225 cgp->cg_cs.cs_nffree -= frags; 11226 /* Add back in counts associated with the new frags */ 11227 blk = blkmap(fs, blksfree, bbase); 11228 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11229 } 11230 return (frags); 11231 } 11232 11233 /* 11234 * Complete a write to a bmsafemap structure. Roll forward any bitmap 11235 * changes if it's not a background write. Set all written dependencies 11236 * to DEPCOMPLETE and free the structure if possible. 11237 */ 11238 static int 11239 handle_written_bmsafemap(bmsafemap, bp) 11240 struct bmsafemap *bmsafemap; 11241 struct buf *bp; 11242 { 11243 struct newblk *newblk; 11244 struct inodedep *inodedep; 11245 struct jaddref *jaddref, *jatmp; 11246 struct jnewblk *jnewblk, *jntmp; 11247 struct ufsmount *ump; 11248 uint8_t *inosused; 11249 uint8_t *blksfree; 11250 struct cg *cgp; 11251 struct fs *fs; 11252 ino_t ino; 11253 int chgs; 11254 11255 if ((bmsafemap->sm_state & IOSTARTED) == 0) 11256 panic("initiate_write_bmsafemap: Not started\n"); 11257 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 11258 chgs = 0; 11259 bmsafemap->sm_state &= ~IOSTARTED; 11260 /* 11261 * Release journal work that was waiting on the write. 11262 */ 11263 handle_jwork(&bmsafemap->sm_freewr); 11264 11265 /* 11266 * Restore unwritten inode allocation pending jaddref writes. 
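 * Inodes that were rolled back because their journal add references
 * were unwritten are marked allocated again in the inode map, unless
 * this is a background copy of the cylinder group buffer.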
11267 */ 11268 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 11269 cgp = (struct cg *)bp->b_data; 11270 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11271 inosused = cg_inosused(cgp); 11272 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 11273 ja_bmdeps, jatmp) { 11274 if ((jaddref->ja_state & UNDONE) == 0) 11275 continue; 11276 ino = jaddref->ja_ino % fs->fs_ipg; 11277 if (isset(inosused, ino)) 11278 panic("handle_written_bmsafemap: " 11279 "re-allocated inode"); 11280 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 11281 if ((jaddref->ja_mode & IFMT) == IFDIR) 11282 cgp->cg_cs.cs_ndir++; 11283 cgp->cg_cs.cs_nifree--; 11284 setbit(inosused, ino); 11285 chgs = 1; 11286 } 11287 jaddref->ja_state &= ~UNDONE; 11288 jaddref->ja_state |= ATTACHED; 11289 free_jaddref(jaddref); 11290 } 11291 } 11292 /* 11293 * Restore any block allocations which are pending journal writes. 11294 */ 11295 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11296 cgp = (struct cg *)bp->b_data; 11297 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11298 blksfree = cg_blksfree(cgp); 11299 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 11300 jntmp) { 11301 if ((jnewblk->jn_state & UNDONE) == 0) 11302 continue; 11303 if ((bp->b_xflags & BX_BKGRDMARKER) == 0 && 11304 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 11305 chgs = 1; 11306 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 11307 jnewblk->jn_state |= ATTACHED; 11308 free_jnewblk(jnewblk); 11309 } 11310 } 11311 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 11312 newblk->nb_state |= DEPCOMPLETE; 11313 newblk->nb_state &= ~ONDEPLIST; 11314 newblk->nb_bmsafemap = NULL; 11315 LIST_REMOVE(newblk, nb_deps); 11316 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 11317 handle_allocdirect_partdone( 11318 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 11319 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 11320 handle_allocindir_partdone( 11321 WK_ALLOCINDIR(&newblk->nb_list)); 11322 else if (newblk->nb_list.wk_type != D_NEWBLK) 11323 panic("handle_written_bmsafemap: Unexpected type: %s", 11324 TYPENAME(newblk->nb_list.wk_type)); 11325 } 11326 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 11327 inodedep->id_state |= DEPCOMPLETE; 11328 inodedep->id_state &= ~ONDEPLIST; 11329 LIST_REMOVE(inodedep, id_deps); 11330 inodedep->id_bmsafemap = NULL; 11331 } 11332 LIST_REMOVE(bmsafemap, sm_next); 11333 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 11334 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 11335 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 11336 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 11337 LIST_EMPTY(&bmsafemap->sm_freehd)) { 11338 LIST_REMOVE(bmsafemap, sm_hash); 11339 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 11340 return (0); 11341 } 11342 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 11343 bdirty(bp); 11344 return (1); 11345 } 11346 11347 /* 11348 * Try to free a mkdir dependency. 11349 */ 11350 static void 11351 complete_mkdir(mkdir) 11352 struct mkdir *mkdir; 11353 { 11354 struct diradd *dap; 11355 11356 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 11357 return; 11358 LIST_REMOVE(mkdir, md_mkdirs); 11359 dap = mkdir->md_diradd; 11360 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 11361 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 11362 dap->da_state |= DEPCOMPLETE; 11363 complete_diradd(dap); 11364 } 11365 WORKITEM_FREE(mkdir, D_MKDIR); 11366 } 11367 11368 /* 11369 * Handle the completion of a mkdir dependency. 
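 * The state bit passed in must match the one this mkdir tracks; the
 * mkdir is marked COMPLETE and complete_mkdir() above clears the
 * corresponding MKDIR_* bit in its diradd.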
11370 */ 11371 static void 11372 handle_written_mkdir(mkdir, type) 11373 struct mkdir *mkdir; 11374 int type; 11375 { 11376 11377 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 11378 panic("handle_written_mkdir: bad type"); 11379 mkdir->md_state |= COMPLETE; 11380 complete_mkdir(mkdir); 11381 } 11382 11383 static int 11384 free_pagedep(pagedep) 11385 struct pagedep *pagedep; 11386 { 11387 int i; 11388 11389 if (pagedep->pd_state & NEWBLOCK) 11390 return (0); 11391 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 11392 return (0); 11393 for (i = 0; i < DAHASHSZ; i++) 11394 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 11395 return (0); 11396 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 11397 return (0); 11398 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 11399 return (0); 11400 if (pagedep->pd_state & ONWORKLIST) 11401 WORKLIST_REMOVE(&pagedep->pd_list); 11402 LIST_REMOVE(pagedep, pd_hash); 11403 WORKITEM_FREE(pagedep, D_PAGEDEP); 11404 11405 return (1); 11406 } 11407 11408 /* 11409 * Called from within softdep_disk_write_complete above. 11410 * A write operation was just completed. Removed inodes can 11411 * now be freed and associated block pointers may be committed. 11412 * Note that this routine is always called from interrupt level 11413 * with further splbio interrupts blocked. 11414 */ 11415 static int 11416 handle_written_filepage(pagedep, bp) 11417 struct pagedep *pagedep; 11418 struct buf *bp; /* buffer containing the written page */ 11419 { 11420 struct dirrem *dirrem; 11421 struct diradd *dap, *nextdap; 11422 struct direct *ep; 11423 int i, chgs; 11424 11425 if ((pagedep->pd_state & IOSTARTED) == 0) 11426 panic("handle_written_filepage: not started"); 11427 pagedep->pd_state &= ~IOSTARTED; 11428 /* 11429 * Process any directory removals that have been committed. 11430 */ 11431 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 11432 LIST_REMOVE(dirrem, dm_next); 11433 dirrem->dm_state |= COMPLETE; 11434 dirrem->dm_dirinum = pagedep->pd_ino; 11435 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 11436 ("handle_written_filepage: Journal entries not written.")); 11437 add_to_worklist(&dirrem->dm_list, 0); 11438 } 11439 /* 11440 * Free any directory additions that have been committed. 11441 * If it is a newly allocated block, we have to wait until 11442 * the on-disk directory inode claims the new block. 11443 */ 11444 if ((pagedep->pd_state & NEWBLOCK) == 0) 11445 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 11446 free_diradd(dap, NULL); 11447 /* 11448 * Uncommitted directory entries must be restored. 11449 */ 11450 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 11451 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 11452 dap = nextdap) { 11453 nextdap = LIST_NEXT(dap, da_pdlist); 11454 if (dap->da_state & ATTACHED) 11455 panic("handle_written_filepage: attached"); 11456 ep = (struct direct *) 11457 ((char *)bp->b_data + dap->da_offset); 11458 ep->d_ino = dap->da_newinum; 11459 dap->da_state &= ~UNDONE; 11460 dap->da_state |= ATTACHED; 11461 chgs = 1; 11462 /* 11463 * If the inode referenced by the directory has 11464 * been written out, then the dependency can be 11465 * moved to the pending list. 11466 */ 11467 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 11468 LIST_REMOVE(dap, da_pdlist); 11469 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 11470 da_pdlist); 11471 } 11472 } 11473 } 11474 /* 11475 * If there were any rollbacks in the directory, then it must be 11476 * marked dirty so that its will eventually get written back in 11477 * its correct form. 
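 * A return value of one keeps the pagedep attached to the buffer so
 * the restored entries are written out on the next pass.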
11478 */ 11479 if (chgs) { 11480 if ((bp->b_flags & B_DELWRI) == 0) 11481 stat_dir_entry++; 11482 bdirty(bp); 11483 return (1); 11484 } 11485 /* 11486 * If we are not waiting for a new directory block to be 11487 * claimed by its inode, then the pagedep will be freed. 11488 * Otherwise it will remain to track any new entries on 11489 * the page in case they are fsync'ed. 11490 */ 11491 free_pagedep(pagedep); 11492 return (0); 11493 } 11494 11495 /* 11496 * Writing back in-core inode structures. 11497 * 11498 * The filesystem only accesses an inode's contents when it occupies an 11499 * "in-core" inode structure. These "in-core" structures are separate from 11500 * the page frames used to cache inode blocks. Only the latter are 11501 * transferred to/from the disk. So, when the updated contents of the 11502 * "in-core" inode structure are copied to the corresponding in-memory inode 11503 * block, the dependencies are also transferred. The following procedure is 11504 * called when copying a dirty "in-core" inode to a cached inode block. 11505 */ 11506 11507 /* 11508 * Called when an inode is loaded from disk. If the effective link count 11509 * differed from the actual link count when it was last flushed, then we 11510 * need to ensure that the correct effective link count is put back. 11511 */ 11512 void 11513 softdep_load_inodeblock(ip) 11514 struct inode *ip; /* the "in_core" copy of the inode */ 11515 { 11516 struct inodedep *inodedep; 11517 11518 /* 11519 * Check for alternate nlink count. 11520 */ 11521 ip->i_effnlink = ip->i_nlink; 11522 ACQUIRE_LOCK(&lk); 11523 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 11524 &inodedep) == 0) { 11525 FREE_LOCK(&lk); 11526 return; 11527 } 11528 ip->i_effnlink -= inodedep->id_nlinkdelta; 11529 FREE_LOCK(&lk); 11530 } 11531 11532 /* 11533 * This routine is called just before the "in-core" inode 11534 * information is to be copied to the in-memory inode block. 11535 * Recall that an inode block contains several inodes. If 11536 * the force flag is set, then the dependencies will be 11537 * cleared so that the update can always be made. Note that 11538 * the buffer is locked when this routine is called, so we 11539 * will never be in the middle of writing the inode block 11540 * to disk. 11541 */ 11542 void 11543 softdep_update_inodeblock(ip, bp, waitfor) 11544 struct inode *ip; /* the "in_core" copy of the inode */ 11545 struct buf *bp; /* the buffer containing the inode block */ 11546 int waitfor; /* nonzero => update must be allowed */ 11547 { 11548 struct inodedep *inodedep; 11549 struct inoref *inoref; 11550 struct worklist *wk; 11551 struct mount *mp; 11552 struct buf *ibp; 11553 struct fs *fs; 11554 int error; 11555 11556 mp = UFSTOVFS(ip->i_ump); 11557 fs = ip->i_fs; 11558 /* 11559 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 11560 * does not have access to the in-core ip so must write directly into 11561 * the inode block buffer when setting freelink. 11562 */ 11563 if (fs->fs_magic == FS_UFS1_MAGIC) 11564 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 11565 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11566 else 11567 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 11568 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11569 /* 11570 * If the effective link count is not equal to the actual link 11571 * count, then we must track the difference in an inodedep while 11572 * the inode is (potentially) tossed out of the cache. 
Otherwise, 11573 * if there is no existing inodedep, then there are no dependencies 11574 * to track. 11575 */ 11576 ACQUIRE_LOCK(&lk); 11577 again: 11578 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11579 FREE_LOCK(&lk); 11580 if (ip->i_effnlink != ip->i_nlink) 11581 panic("softdep_update_inodeblock: bad link count"); 11582 return; 11583 } 11584 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) 11585 panic("softdep_update_inodeblock: bad delta"); 11586 /* 11587 * If we're flushing all dependencies we must also move any waiting 11588 * for journal writes onto the bufwait list prior to I/O. 11589 */ 11590 if (waitfor) { 11591 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11592 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11593 == DEPCOMPLETE) { 11594 jwait(&inoref->if_list, MNT_WAIT); 11595 goto again; 11596 } 11597 } 11598 } 11599 /* 11600 * Changes have been initiated. Anything depending on these 11601 * changes cannot occur until this inode has been written. 11602 */ 11603 inodedep->id_state &= ~COMPLETE; 11604 if ((inodedep->id_state & ONWORKLIST) == 0) 11605 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); 11606 /* 11607 * Any new dependencies associated with the incore inode must 11608 * now be moved to the list associated with the buffer holding 11609 * the in-memory copy of the inode. Once merged process any 11610 * allocdirects that are completed by the merger. 11611 */ 11612 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); 11613 if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) 11614 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), 11615 NULL); 11616 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); 11617 if (!TAILQ_EMPTY(&inodedep->id_extupdt)) 11618 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), 11619 NULL); 11620 /* 11621 * Now that the inode has been pushed into the buffer, the 11622 * operations dependent on the inode being written to disk 11623 * can be moved to the id_bufwait so that they will be 11624 * processed when the buffer I/O completes. 11625 */ 11626 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { 11627 WORKLIST_REMOVE(wk); 11628 WORKLIST_INSERT(&inodedep->id_bufwait, wk); 11629 } 11630 /* 11631 * Newly allocated inodes cannot be written until the bitmap 11632 * that allocates them have been written (indicated by 11633 * DEPCOMPLETE being set in id_state). If we are doing a 11634 * forced sync (e.g., an fsync on a file), we force the bitmap 11635 * to be written so that the update can be done. 11636 */ 11637 if (waitfor == 0) { 11638 FREE_LOCK(&lk); 11639 return; 11640 } 11641 retry: 11642 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { 11643 FREE_LOCK(&lk); 11644 return; 11645 } 11646 ibp = inodedep->id_bmsafemap->sm_buf; 11647 ibp = getdirtybuf(ibp, &lk, MNT_WAIT); 11648 if (ibp == NULL) { 11649 /* 11650 * If ibp came back as NULL, the dependency could have been 11651 * freed while we slept. Look it up again, and check to see 11652 * that it has completed. 11653 */ 11654 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 11655 goto retry; 11656 FREE_LOCK(&lk); 11657 return; 11658 } 11659 FREE_LOCK(&lk); 11660 if ((error = bwrite(ibp)) != 0) 11661 softdep_error("softdep_update_inodeblock: bwrite", error); 11662 } 11663 11664 /* 11665 * Merge the a new inode dependency list (such as id_newinoupdt) into an 11666 * old inode dependency list (such as id_inoupdt). This routine must be 11667 * called with splbio interrupts blocked. 
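 * Both lists are kept sorted by logical block offset; a new entry
 * whose offset matches one already on the old list is collapsed via
 * allocdirect_merge().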
11668 */ 11669 static void 11670 merge_inode_lists(newlisthead, oldlisthead) 11671 struct allocdirectlst *newlisthead; 11672 struct allocdirectlst *oldlisthead; 11673 { 11674 struct allocdirect *listadp, *newadp; 11675 11676 newadp = TAILQ_FIRST(newlisthead); 11677 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 11678 if (listadp->ad_offset < newadp->ad_offset) { 11679 listadp = TAILQ_NEXT(listadp, ad_next); 11680 continue; 11681 } 11682 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11683 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 11684 if (listadp->ad_offset == newadp->ad_offset) { 11685 allocdirect_merge(oldlisthead, newadp, 11686 listadp); 11687 listadp = newadp; 11688 } 11689 newadp = TAILQ_FIRST(newlisthead); 11690 } 11691 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 11692 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11693 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 11694 } 11695 } 11696 11697 /* 11698 * If we are doing an fsync, then we must ensure that any directory 11699 * entries for the inode have been written after the inode gets to disk. 11700 */ 11701 int 11702 softdep_fsync(vp) 11703 struct vnode *vp; /* the "in_core" copy of the inode */ 11704 { 11705 struct inodedep *inodedep; 11706 struct pagedep *pagedep; 11707 struct inoref *inoref; 11708 struct worklist *wk; 11709 struct diradd *dap; 11710 struct mount *mp; 11711 struct vnode *pvp; 11712 struct inode *ip; 11713 struct buf *bp; 11714 struct fs *fs; 11715 struct thread *td = curthread; 11716 int error, flushparent, pagedep_new_block; 11717 ino_t parentino; 11718 ufs_lbn_t lbn; 11719 11720 ip = VTOI(vp); 11721 fs = ip->i_fs; 11722 mp = vp->v_mount; 11723 ACQUIRE_LOCK(&lk); 11724 restart: 11725 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11726 FREE_LOCK(&lk); 11727 return (0); 11728 } 11729 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11730 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11731 == DEPCOMPLETE) { 11732 jwait(&inoref->if_list, MNT_WAIT); 11733 goto restart; 11734 } 11735 } 11736 if (!LIST_EMPTY(&inodedep->id_inowait) || 11737 !TAILQ_EMPTY(&inodedep->id_extupdt) || 11738 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 11739 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 11740 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 11741 panic("softdep_fsync: pending ops %p", inodedep); 11742 for (error = 0, flushparent = 0; ; ) { 11743 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 11744 break; 11745 if (wk->wk_type != D_DIRADD) 11746 panic("softdep_fsync: Unexpected type %s", 11747 TYPENAME(wk->wk_type)); 11748 dap = WK_DIRADD(wk); 11749 /* 11750 * Flush our parent if this directory entry has a MKDIR_PARENT 11751 * dependency or is contained in a newly allocated block. 11752 */ 11753 if (dap->da_state & DIRCHG) 11754 pagedep = dap->da_previous->dm_pagedep; 11755 else 11756 pagedep = dap->da_pagedep; 11757 parentino = pagedep->pd_ino; 11758 lbn = pagedep->pd_lbn; 11759 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 11760 panic("softdep_fsync: dirty"); 11761 if ((dap->da_state & MKDIR_PARENT) || 11762 (pagedep->pd_state & NEWBLOCK)) 11763 flushparent = 1; 11764 else 11765 flushparent = 0; 11766 /* 11767 * If we are being fsync'ed as part of vgone'ing this vnode, 11768 * then we will not be able to release and recover the 11769 * vnode below, so we just have to give up on writing its 11770 * directory entry out. It will eventually be written, just 11771 * not now, but then the user was not asking to have it 11772 * written, so we are not breaking any promises. 
11773 */ 11774 if (vp->v_iflag & VI_DOOMED) 11775 break; 11776 /* 11777 * We prevent deadlock by always fetching inodes from the 11778 * root, moving down the directory tree. Thus, when fetching 11779 * our parent directory, we first try to get the lock. If 11780 * that fails, we must unlock ourselves before requesting 11781 * the lock on our parent. See the comment in ufs_lookup 11782 * for details on possible races. 11783 */ 11784 FREE_LOCK(&lk); 11785 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 11786 FFSV_FORCEINSMQ)) { 11787 error = vfs_busy(mp, MBF_NOWAIT); 11788 if (error != 0) { 11789 vfs_ref(mp); 11790 VOP_UNLOCK(vp, 0); 11791 error = vfs_busy(mp, 0); 11792 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11793 vfs_rel(mp); 11794 if (error != 0) 11795 return (ENOENT); 11796 if (vp->v_iflag & VI_DOOMED) { 11797 vfs_unbusy(mp); 11798 return (ENOENT); 11799 } 11800 } 11801 VOP_UNLOCK(vp, 0); 11802 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 11803 &pvp, FFSV_FORCEINSMQ); 11804 vfs_unbusy(mp); 11805 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11806 if (vp->v_iflag & VI_DOOMED) { 11807 if (error == 0) 11808 vput(pvp); 11809 error = ENOENT; 11810 } 11811 if (error != 0) 11812 return (error); 11813 } 11814 /* 11815 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 11816 * that are contained in direct blocks will be resolved by 11817 * doing a ffs_update. Pagedeps contained in indirect blocks 11818 * may require a complete sync'ing of the directory. So, we 11819 * try the cheap and fast ffs_update first, and if that fails, 11820 * then we do the slower ffs_syncvnode of the directory. 11821 */ 11822 if (flushparent) { 11823 int locked; 11824 11825 if ((error = ffs_update(pvp, 1)) != 0) { 11826 vput(pvp); 11827 return (error); 11828 } 11829 ACQUIRE_LOCK(&lk); 11830 locked = 1; 11831 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 11832 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 11833 if (wk->wk_type != D_DIRADD) 11834 panic("softdep_fsync: Unexpected type %s", 11835 TYPENAME(wk->wk_type)); 11836 dap = WK_DIRADD(wk); 11837 if (dap->da_state & DIRCHG) 11838 pagedep = dap->da_previous->dm_pagedep; 11839 else 11840 pagedep = dap->da_pagedep; 11841 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 11842 FREE_LOCK(&lk); 11843 locked = 0; 11844 if (pagedep_new_block && 11845 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 11846 vput(pvp); 11847 return (error); 11848 } 11849 } 11850 } 11851 if (locked) 11852 FREE_LOCK(&lk); 11853 } 11854 /* 11855 * Flush directory page containing the inode's name. 11856 */ 11857 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 11858 &bp); 11859 if (error == 0) 11860 error = bwrite(bp); 11861 else 11862 brelse(bp); 11863 vput(pvp); 11864 if (error != 0) 11865 return (error); 11866 ACQUIRE_LOCK(&lk); 11867 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 11868 break; 11869 } 11870 FREE_LOCK(&lk); 11871 return (0); 11872 } 11873 11874 /* 11875 * Flush all the dirty bitmaps associated with the block device 11876 * before flushing the rest of the dirty blocks so as to reduce 11877 * the number of dependencies that will have to be rolled back. 11878 * 11879 * XXX Unused? 
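 * Only buffers that carry a bmsafemap dependency are written here;
 * everything else is left to the normal device vnode sync.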
11880 */ 11881 void 11882 softdep_fsync_mountdev(vp) 11883 struct vnode *vp; 11884 { 11885 struct buf *bp, *nbp; 11886 struct worklist *wk; 11887 struct bufobj *bo; 11888 11889 if (!vn_isdisk(vp, NULL)) 11890 panic("softdep_fsync_mountdev: vnode not a disk"); 11891 bo = &vp->v_bufobj; 11892 restart: 11893 BO_LOCK(bo); 11894 ACQUIRE_LOCK(&lk); 11895 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 11896 /* 11897 * If it is already scheduled, skip to the next buffer. 11898 */ 11899 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 11900 continue; 11901 11902 if ((bp->b_flags & B_DELWRI) == 0) 11903 panic("softdep_fsync_mountdev: not dirty"); 11904 /* 11905 * We are only interested in bitmaps with outstanding 11906 * dependencies. 11907 */ 11908 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 11909 wk->wk_type != D_BMSAFEMAP || 11910 (bp->b_vflags & BV_BKGRDINPROG)) { 11911 BUF_UNLOCK(bp); 11912 continue; 11913 } 11914 FREE_LOCK(&lk); 11915 BO_UNLOCK(bo); 11916 bremfree(bp); 11917 (void) bawrite(bp); 11918 goto restart; 11919 } 11920 FREE_LOCK(&lk); 11921 drain_output(vp); 11922 BO_UNLOCK(bo); 11923 } 11924 11925 /* 11926 * Sync all cylinder groups that were dirty at the time this function is 11927 * called. Newly dirtied cgs will be inserted before the sintenel. This 11928 * is used to flush freedep activity that may be holding up writes to a 11929 * indirect block. 11930 */ 11931 static int 11932 sync_cgs(mp, waitfor) 11933 struct mount *mp; 11934 int waitfor; 11935 { 11936 struct bmsafemap *bmsafemap; 11937 struct bmsafemap *sintenel; 11938 struct ufsmount *ump; 11939 struct buf *bp; 11940 int error; 11941 11942 sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 11943 sintenel->sm_cg = -1; 11944 ump = VFSTOUFS(mp); 11945 error = 0; 11946 ACQUIRE_LOCK(&lk); 11947 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next); 11948 for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL; 11949 bmsafemap = LIST_NEXT(sintenel, sm_next)) { 11950 /* Skip sintenels and cgs with no work to release. */ 11951 if (bmsafemap->sm_cg == -1 || 11952 (LIST_EMPTY(&bmsafemap->sm_freehd) && 11953 LIST_EMPTY(&bmsafemap->sm_freewr))) { 11954 LIST_REMOVE(sintenel, sm_next); 11955 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11956 continue; 11957 } 11958 /* 11959 * If we don't get the lock and we're waiting try again, if 11960 * not move on to the next buf and try to sync it. 11961 */ 11962 bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor); 11963 if (bp == NULL && waitfor == MNT_WAIT) 11964 continue; 11965 LIST_REMOVE(sintenel, sm_next); 11966 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11967 if (bp == NULL) 11968 continue; 11969 FREE_LOCK(&lk); 11970 if (waitfor == MNT_NOWAIT) 11971 bawrite(bp); 11972 else 11973 error = bwrite(bp); 11974 ACQUIRE_LOCK(&lk); 11975 if (error) 11976 break; 11977 } 11978 LIST_REMOVE(sintenel, sm_next); 11979 FREE_LOCK(&lk); 11980 free(sintenel, M_BMSAFEMAP); 11981 return (error); 11982 } 11983 11984 /* 11985 * This routine is called when we are trying to synchronously flush a 11986 * file. This routine must eliminate any filesystem metadata dependencies 11987 * so that the syncing routine can succeed. 11988 */ 11989 int 11990 softdep_sync_metadata(struct vnode *vp) 11991 { 11992 int error; 11993 11994 /* 11995 * Ensure that any direct block dependencies have been cleared, 11996 * truncations are started, and inode references are journaled. 
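 * For the device vnode the journal is flushed first so that no
 * pending records force rollbacks during the flush below.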
11997 */ 11998 ACQUIRE_LOCK(&lk); 11999 /* 12000 * Write all journal records to prevent rollbacks on devvp. 12001 */ 12002 if (vp->v_type == VCHR) 12003 softdep_flushjournal(vp->v_mount); 12004 error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number); 12005 /* 12006 * Ensure that all truncates are written so we won't find deps on 12007 * indirect blocks. 12008 */ 12009 process_truncates(vp); 12010 FREE_LOCK(&lk); 12011 12012 return (error); 12013 } 12014 12015 /* 12016 * This routine is called when we are attempting to sync a buf with 12017 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any 12018 * other IO it can but returns EBUSY if the buffer is not yet able to 12019 * be written. Dependencies which will not cause rollbacks will always 12020 * return 0. 12021 */ 12022 int 12023 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 12024 { 12025 struct indirdep *indirdep; 12026 struct pagedep *pagedep; 12027 struct allocindir *aip; 12028 struct newblk *newblk; 12029 struct buf *nbp; 12030 struct worklist *wk; 12031 int i, error; 12032 12033 /* 12034 * For VCHR we just don't want to force flush any dependencies that 12035 * will cause rollbacks. 12036 */ 12037 if (vp->v_type == VCHR) { 12038 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) 12039 return (EBUSY); 12040 return (0); 12041 } 12042 ACQUIRE_LOCK(&lk); 12043 /* 12044 * As we hold the buffer locked, none of its dependencies 12045 * will disappear. 12046 */ 12047 error = 0; 12048 top: 12049 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 12050 switch (wk->wk_type) { 12051 12052 case D_ALLOCDIRECT: 12053 case D_ALLOCINDIR: 12054 newblk = WK_NEWBLK(wk); 12055 if (newblk->nb_jnewblk != NULL) { 12056 if (waitfor == MNT_NOWAIT) { 12057 error = EBUSY; 12058 goto out_unlock; 12059 } 12060 jwait(&newblk->nb_jnewblk->jn_list, waitfor); 12061 goto top; 12062 } 12063 if (newblk->nb_state & DEPCOMPLETE || 12064 waitfor == MNT_NOWAIT) 12065 continue; 12066 nbp = newblk->nb_bmsafemap->sm_buf; 12067 nbp = getdirtybuf(nbp, &lk, waitfor); 12068 if (nbp == NULL) 12069 goto top; 12070 FREE_LOCK(&lk); 12071 if ((error = bwrite(nbp)) != 0) 12072 goto out; 12073 ACQUIRE_LOCK(&lk); 12074 continue; 12075 12076 case D_INDIRDEP: 12077 indirdep = WK_INDIRDEP(wk); 12078 if (waitfor == MNT_NOWAIT) { 12079 if (!TAILQ_EMPTY(&indirdep->ir_trunc) || 12080 !LIST_EMPTY(&indirdep->ir_deplisthd)) { 12081 error = EBUSY; 12082 goto out_unlock; 12083 } 12084 } 12085 if (!TAILQ_EMPTY(&indirdep->ir_trunc)) 12086 panic("softdep_sync_buf: truncation pending."); 12087 restart: 12088 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 12089 newblk = (struct newblk *)aip; 12090 if (newblk->nb_jnewblk != NULL) { 12091 jwait(&newblk->nb_jnewblk->jn_list, 12092 waitfor); 12093 goto restart; 12094 } 12095 if (newblk->nb_state & DEPCOMPLETE) 12096 continue; 12097 nbp = newblk->nb_bmsafemap->sm_buf; 12098 nbp = getdirtybuf(nbp, &lk, waitfor); 12099 if (nbp == NULL) 12100 goto restart; 12101 FREE_LOCK(&lk); 12102 if ((error = bwrite(nbp)) != 0) 12103 goto out; 12104 ACQUIRE_LOCK(&lk); 12105 goto restart; 12106 } 12107 continue; 12108 12109 case D_PAGEDEP: 12110 /* 12111 * Only flush directory entries in synchronous passes. 12112 */ 12113 if (waitfor != MNT_WAIT) { 12114 error = EBUSY; 12115 goto out_unlock; 12116 } 12117 /* 12118 * While syncing snapshots, we must allow recursive 12119 * lookups. 
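 * BUF_AREC()/BUF_NOREC() below enable and disable recursive locking
 * on the directory buffer while its diradd lists are flushed.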
12120 */ 12121 BUF_AREC(bp); 12122 /* 12123 * We are trying to sync a directory that may 12124 * have dependencies on both its own metadata 12125 * and/or dependencies on the inodes of any 12126 * recently allocated files. We walk its diradd 12127 * lists pushing out the associated inode. 12128 */ 12129 pagedep = WK_PAGEDEP(wk); 12130 for (i = 0; i < DAHASHSZ; i++) { 12131 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 12132 continue; 12133 if ((error = flush_pagedep_deps(vp, wk->wk_mp, 12134 &pagedep->pd_diraddhd[i]))) { 12135 BUF_NOREC(bp); 12136 goto out_unlock; 12137 } 12138 } 12139 BUF_NOREC(bp); 12140 continue; 12141 12142 case D_FREEWORK: 12143 case D_FREEDEP: 12144 case D_JSEGDEP: 12145 case D_JNEWBLK: 12146 continue; 12147 12148 default: 12149 panic("softdep_sync_buf: Unknown type %s", 12150 TYPENAME(wk->wk_type)); 12151 /* NOTREACHED */ 12152 } 12153 } 12154 out_unlock: 12155 FREE_LOCK(&lk); 12156 out: 12157 return (error); 12158 } 12159 12160 /* 12161 * Flush the dependencies associated with an inodedep. 12162 * Called with splbio blocked. 12163 */ 12164 static int 12165 flush_inodedep_deps(vp, mp, ino) 12166 struct vnode *vp; 12167 struct mount *mp; 12168 ino_t ino; 12169 { 12170 struct inodedep *inodedep; 12171 struct inoref *inoref; 12172 int error, waitfor; 12173 12174 /* 12175 * This work is done in two passes. The first pass grabs most 12176 * of the buffers and begins asynchronously writing them. The 12177 * only way to wait for these asynchronous writes is to sleep 12178 * on the filesystem vnode which may stay busy for a long time 12179 * if the filesystem is active. So, instead, we make a second 12180 * pass over the dependencies blocking on each write. In the 12181 * usual case we will be blocking against a write that we 12182 * initiated, so when it is done the dependency will have been 12183 * resolved. Thus the second pass is expected to end quickly. 12184 * We give a brief window at the top of the loop to allow 12185 * any pending I/O to complete. 12186 */ 12187 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 12188 if (error) 12189 return (error); 12190 FREE_LOCK(&lk); 12191 ACQUIRE_LOCK(&lk); 12192 restart: 12193 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12194 return (0); 12195 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12196 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12197 == DEPCOMPLETE) { 12198 jwait(&inoref->if_list, MNT_WAIT); 12199 goto restart; 12200 } 12201 } 12202 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 12203 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 12204 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 12205 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 12206 continue; 12207 /* 12208 * If pass2, we are done, otherwise do pass 2. 12209 */ 12210 if (waitfor == MNT_WAIT) 12211 break; 12212 waitfor = MNT_WAIT; 12213 } 12214 /* 12215 * Try freeing inodedep in case all dependencies have been removed. 12216 */ 12217 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 12218 (void) free_inodedep(inodedep); 12219 return (0); 12220 } 12221 12222 /* 12223 * Flush an inode dependency list. 12224 * Called with splbio blocked. 
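 * Returns non-zero when a journal wait or bitmap write was initiated
 * so the caller rescans the list; zero means no further work was
 * started.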
12225 */ 12226 static int 12227 flush_deplist(listhead, waitfor, errorp) 12228 struct allocdirectlst *listhead; 12229 int waitfor; 12230 int *errorp; 12231 { 12232 struct allocdirect *adp; 12233 struct newblk *newblk; 12234 struct buf *bp; 12235 12236 mtx_assert(&lk, MA_OWNED); 12237 TAILQ_FOREACH(adp, listhead, ad_next) { 12238 newblk = (struct newblk *)adp; 12239 if (newblk->nb_jnewblk != NULL) { 12240 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12241 return (1); 12242 } 12243 if (newblk->nb_state & DEPCOMPLETE) 12244 continue; 12245 bp = newblk->nb_bmsafemap->sm_buf; 12246 bp = getdirtybuf(bp, &lk, waitfor); 12247 if (bp == NULL) { 12248 if (waitfor == MNT_NOWAIT) 12249 continue; 12250 return (1); 12251 } 12252 FREE_LOCK(&lk); 12253 if (waitfor == MNT_NOWAIT) 12254 bawrite(bp); 12255 else 12256 *errorp = bwrite(bp); 12257 ACQUIRE_LOCK(&lk); 12258 return (1); 12259 } 12260 return (0); 12261 } 12262 12263 /* 12264 * Flush dependencies associated with an allocdirect block. 12265 */ 12266 static int 12267 flush_newblk_dep(vp, mp, lbn) 12268 struct vnode *vp; 12269 struct mount *mp; 12270 ufs_lbn_t lbn; 12271 { 12272 struct newblk *newblk; 12273 struct bufobj *bo; 12274 struct inode *ip; 12275 struct buf *bp; 12276 ufs2_daddr_t blkno; 12277 int error; 12278 12279 error = 0; 12280 bo = &vp->v_bufobj; 12281 ip = VTOI(vp); 12282 blkno = DIP(ip, i_db[lbn]); 12283 if (blkno == 0) 12284 panic("flush_newblk_dep: Missing block"); 12285 ACQUIRE_LOCK(&lk); 12286 /* 12287 * Loop until all dependencies related to this block are satisfied. 12288 * We must be careful to restart after each sleep in case a write 12289 * completes some part of this process for us. 12290 */ 12291 for (;;) { 12292 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 12293 FREE_LOCK(&lk); 12294 break; 12295 } 12296 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 12297 panic("flush_newblk_deps: Bad newblk %p", newblk); 12298 /* 12299 * Flush the journal. 12300 */ 12301 if (newblk->nb_jnewblk != NULL) { 12302 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12303 continue; 12304 } 12305 /* 12306 * Write the bitmap dependency. 12307 */ 12308 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 12309 bp = newblk->nb_bmsafemap->sm_buf; 12310 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12311 if (bp == NULL) 12312 continue; 12313 FREE_LOCK(&lk); 12314 error = bwrite(bp); 12315 if (error) 12316 break; 12317 ACQUIRE_LOCK(&lk); 12318 continue; 12319 } 12320 /* 12321 * Write the buffer. 12322 */ 12323 FREE_LOCK(&lk); 12324 BO_LOCK(bo); 12325 bp = gbincore(bo, lbn); 12326 if (bp != NULL) { 12327 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 12328 LK_INTERLOCK, BO_MTX(bo)); 12329 if (error == ENOLCK) { 12330 ACQUIRE_LOCK(&lk); 12331 continue; /* Slept, retry */ 12332 } 12333 if (error != 0) 12334 break; /* Failed */ 12335 if (bp->b_flags & B_DELWRI) { 12336 bremfree(bp); 12337 error = bwrite(bp); 12338 if (error) 12339 break; 12340 } else 12341 BUF_UNLOCK(bp); 12342 } else 12343 BO_UNLOCK(bo); 12344 /* 12345 * We have to wait for the direct pointers to 12346 * point at the newdirblk before the dependency 12347 * will go away. 12348 */ 12349 error = ffs_update(vp, MNT_WAIT); 12350 if (error) 12351 break; 12352 ACQUIRE_LOCK(&lk); 12353 } 12354 return (error); 12355 } 12356 12357 /* 12358 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 12359 * Called with splbio blocked. 
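 * Each diradd is resolved in turn: the parent is updated for
 * MKDIR_PARENT, the new directory's first block is flushed for
 * MKDIR_BODY, and finally the inode named by the entry is pushed to
 * disk.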
12360 */ 12361 static int 12362 flush_pagedep_deps(pvp, mp, diraddhdp) 12363 struct vnode *pvp; 12364 struct mount *mp; 12365 struct diraddhd *diraddhdp; 12366 { 12367 struct inodedep *inodedep; 12368 struct inoref *inoref; 12369 struct ufsmount *ump; 12370 struct diradd *dap; 12371 struct vnode *vp; 12372 int error = 0; 12373 struct buf *bp; 12374 ino_t inum; 12375 12376 ump = VFSTOUFS(mp); 12377 restart: 12378 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 12379 /* 12380 * Flush ourselves if this directory entry 12381 * has a MKDIR_PARENT dependency. 12382 */ 12383 if (dap->da_state & MKDIR_PARENT) { 12384 FREE_LOCK(&lk); 12385 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 12386 break; 12387 ACQUIRE_LOCK(&lk); 12388 /* 12389 * If that cleared dependencies, go on to next. 12390 */ 12391 if (dap != LIST_FIRST(diraddhdp)) 12392 continue; 12393 if (dap->da_state & MKDIR_PARENT) 12394 panic("flush_pagedep_deps: MKDIR_PARENT"); 12395 } 12396 /* 12397 * A newly allocated directory must have its "." and 12398 * ".." entries written out before its name can be 12399 * committed in its parent. 12400 */ 12401 inum = dap->da_newinum; 12402 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12403 panic("flush_pagedep_deps: lost inode1"); 12404 /* 12405 * Wait for any pending journal adds to complete so we don't 12406 * cause rollbacks while syncing. 12407 */ 12408 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12409 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12410 == DEPCOMPLETE) { 12411 jwait(&inoref->if_list, MNT_WAIT); 12412 goto restart; 12413 } 12414 } 12415 if (dap->da_state & MKDIR_BODY) { 12416 FREE_LOCK(&lk); 12417 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12418 FFSV_FORCEINSMQ))) 12419 break; 12420 error = flush_newblk_dep(vp, mp, 0); 12421 /* 12422 * If we still have the dependency we might need to 12423 * update the vnode to sync the new link count to 12424 * disk. 12425 */ 12426 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 12427 error = ffs_update(vp, MNT_WAIT); 12428 vput(vp); 12429 if (error != 0) 12430 break; 12431 ACQUIRE_LOCK(&lk); 12432 /* 12433 * If that cleared dependencies, go on to next. 12434 */ 12435 if (dap != LIST_FIRST(diraddhdp)) 12436 continue; 12437 if (dap->da_state & MKDIR_BODY) { 12438 inodedep_lookup(UFSTOVFS(ump), inum, 0, 12439 &inodedep); 12440 panic("flush_pagedep_deps: MKDIR_BODY " 12441 "inodedep %p dap %p vp %p", 12442 inodedep, dap, vp); 12443 } 12444 } 12445 /* 12446 * Flush the inode on which the directory entry depends. 12447 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 12448 * the only remaining dependency is that the updated inode 12449 * count must get pushed to disk. The inode has already 12450 * been pushed into its inode buffer (via VOP_UPDATE) at 12451 * the time of the reference count change. So we need only 12452 * locate that buffer, ensure that there will be no rollback 12453 * caused by a bitmap dependency, then write the inode buffer. 12454 */ 12455 retry: 12456 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12457 panic("flush_pagedep_deps: lost inode"); 12458 /* 12459 * If the inode still has bitmap dependencies, 12460 * push them to disk. 
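 * The cylinder group buffer is fetched under the lock and written
 * synchronously; if it has gone away we recheck the inodedep from
 * the retry label.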
12461 */ 12462 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 12463 bp = inodedep->id_bmsafemap->sm_buf; 12464 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12465 if (bp == NULL) 12466 goto retry; 12467 FREE_LOCK(&lk); 12468 if ((error = bwrite(bp)) != 0) 12469 break; 12470 ACQUIRE_LOCK(&lk); 12471 if (dap != LIST_FIRST(diraddhdp)) 12472 continue; 12473 } 12474 /* 12475 * If the inode is still sitting in a buffer waiting 12476 * to be written or waiting for the link count to be 12477 * adjusted update it here to flush it to disk. 12478 */ 12479 if (dap == LIST_FIRST(diraddhdp)) { 12480 FREE_LOCK(&lk); 12481 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12482 FFSV_FORCEINSMQ))) 12483 break; 12484 error = ffs_update(vp, MNT_WAIT); 12485 vput(vp); 12486 if (error) 12487 break; 12488 ACQUIRE_LOCK(&lk); 12489 } 12490 /* 12491 * If we have failed to get rid of all the dependencies 12492 * then something is seriously wrong. 12493 */ 12494 if (dap == LIST_FIRST(diraddhdp)) { 12495 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 12496 panic("flush_pagedep_deps: failed to flush " 12497 "inodedep %p ino %d dap %p", inodedep, inum, dap); 12498 } 12499 } 12500 if (error) 12501 ACQUIRE_LOCK(&lk); 12502 return (error); 12503 } 12504 12505 /* 12506 * A large burst of file addition or deletion activity can drive the 12507 * memory load excessively high. First attempt to slow things down 12508 * using the techniques below. If that fails, this routine requests 12509 * the offending operations to fall back to running synchronously 12510 * until the memory load returns to a reasonable level. 12511 */ 12512 int 12513 softdep_slowdown(vp) 12514 struct vnode *vp; 12515 { 12516 struct ufsmount *ump; 12517 int jlow; 12518 int max_softdeps_hard; 12519 12520 ACQUIRE_LOCK(&lk); 12521 jlow = 0; 12522 /* 12523 * Check for journal space if needed. 12524 */ 12525 if (DOINGSUJ(vp)) { 12526 ump = VFSTOUFS(vp->v_mount); 12527 if (journal_space(ump, 0) == 0) 12528 jlow = 1; 12529 } 12530 max_softdeps_hard = max_softdeps * 11 / 10; 12531 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 12532 dep_current[D_INODEDEP] < max_softdeps_hard && 12533 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 12534 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) { 12535 FREE_LOCK(&lk); 12536 return (0); 12537 } 12538 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 12539 softdep_speedup(); 12540 stat_sync_limit_hit += 1; 12541 FREE_LOCK(&lk); 12542 if (DOINGSUJ(vp)) 12543 return (0); 12544 return (1); 12545 } 12546 12547 /* 12548 * Called by the allocation routines when they are about to fail 12549 * in the hope that we can free up the requested resource (inodes 12550 * or disk space). 12551 * 12552 * First check to see if the work list has anything on it. If it has, 12553 * clean up entries until we successfully free the requested resource. 12554 * Because this process holds inodes locked, we cannot handle any remove 12555 * requests that might block on a locked inode as that could lead to 12556 * deadlock. If the worklist yields none of the requested resource, 12557 * start syncing out vnodes to free up the needed space. 
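 * The caller holds the ufsmount mutex; it is dropped while the
 * cleanup runs and reacquired before returning.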
 */
int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *lvp, *mvp;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to process any
	 * worklist items as we will recurse into the copyonwrite
	 * routine.  This will result in an incoherent snapshot.
	 * If the vnode that we hold is a snapshot, we must avoid
	 * handling other resources that could cause deadlock.
	 */
	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
		return (0);

	if (resource == FLUSH_BLOCKS_WAIT)
		stat_cleanup_blkrequests += 1;
	else
		stat_cleanup_inorequests += 1;

	mp = vp->v_mount;
	ump = VFSTOUFS(mp);
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	UFS_UNLOCK(ump);
	error = ffs_update(vp, 1);
	if (error != 0) {
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * If we are in need of resources, consider pausing for
	 * tickdelay to give ourselves some breathing room.
	 */
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	process_truncates(vp);
	request_cleanup(UFSTOVFS(ump), resource);
	FREE_LOCK(&lk);
	/*
	 * Now clean up at least as many resources as we will need.
	 *
	 * When requested to clean up inodes, the number that are needed
	 * is set by the number of simultaneous writers (mnt_writeopcount)
	 * plus a bit of slop (2) in case some more writers show up while
	 * we are cleaning.
	 *
	 * When requested to free up space, the amount of space that
	 * we need is enough blocks to allocate a full-sized segment
	 * (fs_contigsumsize). The number of such segments that will
	 * be needed is set by the number of simultaneous writers
	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
	 * writers show up while we are cleaning.
	 *
	 * Additionally, if we are unprivileged and allocating space,
	 * we need to ensure that we clean up enough blocks to get the
	 * needed number of blocks over the threshold of the minimum
	 * number of blocks required to be kept free by the filesystem
	 * (fs_minfree).
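	 *
	 * A purely illustrative example of the arithmetic: with six
	 * simultaneous writers and fs_contigsumsize of 16, an inode
	 * cleanup request asks for 6 + 2 = 8 free inodes, while a block
	 * cleanup request asks for (6 + 2) * 16 = 128 free blocks, plus,
	 * for unprivileged callers, whatever is needed to bring the free
	 * block count back above the fs_minfree reserve.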
	 */
	if (resource == FLUSH_INODES_WAIT) {
		needed = vp->v_mount->mnt_writeopcount + 2;
	} else if (resource == FLUSH_BLOCKS_WAIT) {
		needed = (vp->v_mount->mnt_writeopcount + 2) *
		    fs->fs_contigsumsize;
		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
			needed += fragstoblks(fs,
			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
	} else {
		UFS_LOCK(ump);
		printf("softdep_request_cleanup: Unknown resource type %d\n",
		    resource);
		return (0);
	}
	starttime = time_second;
retry:
	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		ACQUIRE_LOCK(&lk);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump),
		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
			stat_worklist_push += 1;
		FREE_LOCK(&lk);
	}
	/*
	 * If we still need resources and there are no more worklist
	 * entries to process to obtain them, we have to start flushing
	 * the dirty vnodes to force the release of additional requests
	 * to the worklist that we can then process to reap additional
	 * resources.  We walk the vnodes associated with the mount point
	 * until we get the needed worklist requests that we can reap.
	 */
	if ((resource == FLUSH_BLOCKS_WAIT &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		MNT_ILOCK(mp);
		MNT_VNODE_FOREACH(lvp, mp, mvp) {
			VI_LOCK(lvp);
			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
				VI_UNLOCK(lvp);
				continue;
			}
			MNT_IUNLOCK(mp);
			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
			    curthread)) {
				MNT_ILOCK(mp);
				continue;
			}
			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
				vput(lvp);
				MNT_ILOCK(mp);
				continue;
			}
			(void) ffs_syncvnode(lvp, MNT_NOWAIT);
			vput(lvp);
			MNT_ILOCK(mp);
		}
		MNT_IUNLOCK(mp);
		lvp = ump->um_devvp;
		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
			VOP_UNLOCK(lvp, 0);
		}
		if (ump->softdep_on_worklist > 0) {
			stat_cleanup_retries += 1;
			goto retry;
		}
		stat_cleanup_failures += 1;
	}
	if (time_second - starttime > stat_cleanup_high_delay)
		stat_cleanup_high_delay = time_second - starttime;
	UFS_LOCK(ump);
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
	 * Because this process may hold inodes locked, we cannot
	 * handle any remove requests that might block on a locked
	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
	 * to avoid recursively processing the worklist.
	 */
	if (ump->softdep_on_worklist > max_softdeps / 10) {
		td->td_pflags |= TDP_SOFTDEP;
		process_worklist_item(mp, 2, LK_NOWAIT);
		td->td_pflags &= ~TDP_SOFTDEP;
		stat_worklist_push += 2;
		return(1);
	}
	/*
	 * Next, we attempt to speed up the syncer process. If that
	 * is successful, then we allow the process to continue.
	 */
	if (softdep_speedup() &&
	    resource != FLUSH_BLOCKS_WAIT &&
	    resource != FLUSH_INODES_WAIT)
		return(0);
	/*
	 * If we are resource constrained on inode dependencies, try
	 * flushing some dirty inodes. Otherwise, we are constrained
	 * by file deletions, so try accelerating flushes of directories
	 * with removal dependencies. We would like to do the cleanup
	 * here, but we probably hold an inode locked at this point and
	 * that might deadlock against one that we try to clean. So,
	 * the best that we can do is request the syncer daemon to do
	 * the cleanup for us.
	 */
	switch (resource) {

	case FLUSH_INODES:
	case FLUSH_INODES_WAIT:
		stat_ino_limit_push += 1;
		req_clear_inodedeps += 1;
		stat_countp = &stat_ino_limit_hit;
		break;

	case FLUSH_BLOCKS:
	case FLUSH_BLOCKS_WAIT:
		stat_blk_limit_push += 1;
		req_clear_remove += 1;
		stat_countp = &stat_blk_limit_hit;
		break;

	default:
		panic("request_cleanup: unknown type");
	}
	/*
	 * Hopefully the syncer daemon will catch up and awaken us.
	 * We wait at most tickdelay before proceeding in any case.
	 */
	proc_waiting += 1;
	if (callout_pending(&softdep_callout) == FALSE)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);

	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
	proc_waiting -= 1;
	return (1);
}

/*
 * Awaken processes pausing in request_cleanup and clear proc_waiting
 * to indicate that there is no longer a timer running.
 */
static void
pause_timer(arg)
	void *arg;
{

	/*
	 * The callout_ API has acquired mtx and will hold it around this
	 * function call.
	 */
	*stat_countp += 1;
	wakeup_one(&proc_waiting);
	if (proc_waiting > 0)
		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
		    pause_timer, 0);
}

/*
 * Flush out a directory with at least one removal dependency in an effort to
 * reduce the number of dirrem, freefile, and freeblks dependency structures.
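 * This normally runs on behalf of the syncer: request_cleanup() bumps
 * req_clear_remove and the softdep flush thread is then expected to call
 * clear_remove() from its work loop; treat that call path as a summary
 * rather than a precise description.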
 */
static void
clear_remove(td)
	struct thread *td;
{
	struct pagedep_hashhead *pagedephd;
	struct pagedep *pagedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct bufobj *bo;
	int error, cnt;
	ino_t ino;

	mtx_assert(&lk, MA_OWNED);

	for (cnt = 0; cnt < pagedep_hash; cnt++) {
		pagedephd = &pagedep_hashtbl[next++];
		if (next >= pagedep_hash)
			next = 0;
		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
			if (LIST_EMPTY(&pagedep->pd_dirremhd))
				continue;
			mp = pagedep->pd_list.wk_mp;
			ino = pagedep->pd_ino;
			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
				continue;
			FREE_LOCK(&lk);

			/*
			 * Let unmount clear deps
			 */
			error = vfs_busy(mp, MBF_NOWAIT);
			if (error != 0)
				goto finish_write;
			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
			    FFSV_FORCEINSMQ);
			vfs_unbusy(mp);
			if (error != 0) {
				softdep_error("clear_remove: vget", error);
				goto finish_write;
			}
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_remove: fsync", error);
			bo = &vp->v_bufobj;
			BO_LOCK(bo);
			drain_output(vp);
			BO_UNLOCK(bo);
			vput(vp);
		finish_write:
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
	}
}

/*
 * Clear out a block of dirty inodes in an effort to reduce
 * the number of inodedep dependency structures.
 */
static void
clear_inodedeps(td)
	struct thread *td;
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	static int next = 0;
	struct mount *mp;
	struct vnode *vp;
	struct fs *fs;
	int error, cnt;
	ino_t firstino, lastino, ino;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * Pick a random inode dependency to be cleared.
	 * We will then gather up all the inodes in its block
	 * that have dependencies and flush them out.
	 */
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[next++];
		if (next >= inodedep_hash)
			next = 0;
		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
			break;
	}
	if (inodedep == NULL)
		return;
	fs = inodedep->id_fs;
	mp = inodedep->id_list.wk_mp;
	/*
	 * Find the last inode in the block with dependencies.
	 */
	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
			break;
	/*
	 * Asynchronously push all but the last inode with dependencies.
	 * Synchronously push the last inode with dependencies to ensure
	 * that the inode block gets written to free up the inodedeps.
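	 * For example (with illustrative numbers only): if INOPB(fs) is
	 * 64 and the chosen inodedep is for inode 200, then firstino is
	 * 200 & ~63 == 192, lastino scans down from 255 to the last inode
	 * in that block that still has an inodedep, the earlier inodes
	 * are flushed with MNT_NOWAIT, and lastino itself with MNT_WAIT.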
	 */
	for (ino = firstino; ino <= lastino; ino++) {
		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
			continue;
		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
			continue;
		FREE_LOCK(&lk);
		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
		if (error != 0) {
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
		    FFSV_FORCEINSMQ)) != 0) {
			softdep_error("clear_inodedeps: vget", error);
			vfs_unbusy(mp);
			vn_finished_write(mp);
			ACQUIRE_LOCK(&lk);
			return;
		}
		vfs_unbusy(mp);
		if (ino == lastino) {
			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
				softdep_error("clear_inodedeps: fsync1", error);
		} else {
			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
				softdep_error("clear_inodedeps: fsync2", error);
			BO_LOCK(&vp->v_bufobj);
			drain_output(vp);
			BO_UNLOCK(&vp->v_bufobj);
		}
		vput(vp);
		vn_finished_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

void
softdep_buf_append(bp, wkhd)
	struct buf *bp;
	struct workhead *wkhd;
{
	struct worklist *wk;

	ACQUIRE_LOCK(&lk);
	while ((wk = LIST_FIRST(wkhd)) != NULL) {
		WORKLIST_REMOVE(wk);
		WORKLIST_INSERT(&bp->b_dep, wk);
	}
	FREE_LOCK(&lk);
}

void
softdep_inode_append(ip, cred, wkhd)
	struct inode *ip;
	struct ucred *cred;
	struct workhead *wkhd;
{
	struct buf *bp;
	struct fs *fs;
	int error;

	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, cred, &bp);
	if (error) {
		softdep_freework(wkhd);
		return;
	}
	softdep_buf_append(bp, wkhd);
	bqrelse(bp);
}

void
softdep_freework(wkhd)
	struct workhead *wkhd;
{

	ACQUIRE_LOCK(&lk);
	handle_jwork(wkhd);
	FREE_LOCK(&lk);
}

/*
 * Function to determine if the buffer has outstanding dependencies
 * that will cause a roll-back if the buffer is written. If wantcount
 * is set, return number of dependencies, otherwise just yes or no.
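 * (The buffer cache consults this through the bio_ops io_countdeps hook,
 * e.g. via buf_countdeps(), when judging whether writing a buffer now
 * would simply be rolled back; treat that call path as illustrative.)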
 */
static int
softdep_count_dependencies(bp, wantcount)
	struct buf *bp;
	int wantcount;
{
	struct worklist *wk;
	struct bmsafemap *bmsafemap;
	struct freework *freework;
	struct inodedep *inodedep;
	struct indirdep *indirdep;
	struct freeblks *freeblks;
	struct allocindir *aip;
	struct pagedep *pagedep;
	struct dirrem *dirrem;
	struct newblk *newblk;
	struct mkdir *mkdir;
	struct diradd *dap;
	int i, retval;

	retval = 0;
	ACQUIRE_LOCK(&lk);
	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
		switch (wk->wk_type) {

		case D_INODEDEP:
			inodedep = WK_INODEDEP(wk);
			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
				/* bitmap allocation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
				/* direct block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_INDIRDEP:
			indirdep = WK_INDIRDEP(wk);

			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
				/* indirect truncation dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}

			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
				/* indirect block pointer dependency */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_PAGEDEP:
			pagedep = WK_PAGEDEP(wk);
			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
					/* Journal remove ref dependency. */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			for (i = 0; i < DAHASHSZ; i++) {
				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
					/* directory entry dependency */
					retval += 1;
					if (!wantcount)
						goto out;
				}
			}
			continue;

		case D_BMSAFEMAP:
			bmsafemap = WK_BMSAFEMAP(wk);
			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
				/* Add reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
				/* Allocate block dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEBLKS:
			freeblks = WK_FREEBLKS(wk);
			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
				/* Freeblk journal dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_ALLOCDIRECT:
		case D_ALLOCINDIR:
			newblk = WK_NEWBLK(wk);
			if (newblk->nb_jnewblk) {
				/* Journal allocate dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_MKDIR:
			mkdir = WK_MKDIR(wk);
			if (mkdir->md_jaddref) {
				/* Journal reference dependency. */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}

/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			    MNT_MTX(mp),
			    (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
	 * - Softdep activity occurred after start of vnode sync loop
	 * - Secondary writes occurred after start of vnode sync loop
	 */
	error = 0;
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    softdep_deps != 0 ||
	    ump->softdep_deps != 0 ||
	    softdep_accdeps != ump->softdep_accdeps ||
	    secondary_writes != 0 ||
	    mp->mnt_secondary_writes != 0 ||
	    secondary_accwrites != mp->mnt_secondary_accwrites)
		error = EAGAIN;
	FREE_LOCK(&lk);
	BO_UNLOCK(bo);
	return (error);
}

/*
 * Get the number of dependency structures for the file system, both
 * the current number and the total number allocated. These will
 * later be used to detect that softdep processing has occurred.
 */
void
softdep_get_depcounts(struct mount *mp,
		      int *softdep_depsp,
		      int *softdep_accdepsp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(&lk);
	*softdep_depsp = ump->softdep_deps;
	*softdep_accdepsp = ump->softdep_accdeps;
	FREE_LOCK(&lk);
}

/*
 * Wait for pending output on a vnode to complete.
 * Must be called with vnode lock and interlock locked.
 *
 * XXX: Should just be a call to bufobj_wwait().
 */
static void
drain_output(vp)
	struct vnode *vp;
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_VOP_LOCKED(vp, "drain_output");
	ASSERT_BO_LOCKED(bo);

	while (bo->bo_numoutput) {
		bo->bo_flag |= BO_WWAIT;
		msleep((caddr_t)&bo->bo_numoutput,
		    BO_MTX(bo), PRIBIO + 1, "drainvp", 0);
	}
}

/*
 * Called whenever a buffer that is being invalidated or reallocated
 * contains dependencies. This should only happen if an I/O error has
 * occurred. The routine is called with the buffer locked.
 */
static void
softdep_deallocate_dependencies(bp)
	struct buf *bp;
{

	if ((bp->b_ioflags & BIO_ERROR) == 0)
		panic("softdep_deallocate_dependencies: dangling deps");
	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
	panic("softdep_deallocate_dependencies: unrecovered I/O error");
}

/*
 * Function to handle asynchronous write errors in the filesystem.
 */
static void
softdep_error(func, error)
	char *func;
	int error;
{

	/* XXX should do something better! */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}

DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ? (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			inodedep_print(inodedep, 0);
		}
	}
}

DB_SHOW_COMMAND(worklist, db_show_worklist)
{
	struct worklist *wk;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wk = (struct worklist *)addr;
	db_printf("worklist: %p type %s state 0x%X\n",
	    wk, TYPENAME(wk->wk_type), wk->wk_state);
}

DB_SHOW_COMMAND(workhead, db_show_workhead)
{
	struct workhead *wkhd;
	struct worklist *wk;
	int i;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wkhd = (struct workhead *)addr;
	wk = LIST_FIRST(wkhd);
	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
		db_printf("worklist: %p type %s state 0x%X",
		    wk, TYPENAME(wk->wk_type), wk->wk_state);
	if (i == 100)
		db_printf("workhead overflow");
	db_printf("\n");
}

DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
{
	struct jaddref *jaddref;
	struct diradd *diradd;
	struct mkdir *mkdir;

	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
		diradd = mkdir->md_diradd;
		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
		    mkdir, mkdir->md_state, diradd, diradd->da_state);
		if ((jaddref = mkdir->md_jaddref) != NULL)
			db_printf(" jaddref %p jaddref state 0x%X",
			    jaddref, jaddref->ja_state);
		db_printf("\n");
	}
}

#endif /* DDB */

#endif /* SOFTUPDATES */