1 /*- 2 * Copyright 1998, 2000 Marshall Kirk McKusick. 3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org> 4 * All rights reserved. 5 * 6 * The soft updates code is derived from the appendix of a University 7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, 8 * "Soft Updates: A Solution to the Metadata Update Problem in File 9 * Systems", CSE-TR-254-95, August 1995). 10 * 11 * Further information about soft updates can be obtained from: 12 * 13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 14 * 1614 Oxford Street mckusick@mckusick.com 15 * Berkeley, CA 94709-1608 +1-510-843-9542 16 * USA 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, 32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 * 39 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 40 */ 41 42 #include <sys/cdefs.h> 43 __FBSDID("$FreeBSD$"); 44 45 #include "opt_ffs.h" 46 #include "opt_ddb.h" 47 48 /* 49 * For now we want the safety net that the DEBUG flag provides. 
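 * Defining DEBUG here compiles in the checked worklist_insert()/
 * worklist_remove() routines and the extra consistency panics in
 * workitem_free() below.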
50 */ 51 #ifndef DEBUG 52 #define DEBUG 53 #endif 54 55 #include <sys/param.h> 56 #include <sys/kernel.h> 57 #include <sys/systm.h> 58 #include <sys/bio.h> 59 #include <sys/buf.h> 60 #include <sys/kdb.h> 61 #include <sys/kthread.h> 62 #include <sys/limits.h> 63 #include <sys/lock.h> 64 #include <sys/malloc.h> 65 #include <sys/mount.h> 66 #include <sys/mutex.h> 67 #include <sys/namei.h> 68 #include <sys/priv.h> 69 #include <sys/proc.h> 70 #include <sys/stat.h> 71 #include <sys/sysctl.h> 72 #include <sys/syslog.h> 73 #include <sys/vnode.h> 74 #include <sys/conf.h> 75 76 #include <ufs/ufs/dir.h> 77 #include <ufs/ufs/extattr.h> 78 #include <ufs/ufs/quota.h> 79 #include <ufs/ufs/inode.h> 80 #include <ufs/ufs/ufsmount.h> 81 #include <ufs/ffs/fs.h> 82 #include <ufs/ffs/softdep.h> 83 #include <ufs/ffs/ffs_extern.h> 84 #include <ufs/ufs/ufs_extern.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_extern.h> 88 #include <vm/vm_object.h> 89 90 #include <ddb/ddb.h> 91 92 #ifndef SOFTUPDATES 93 94 int 95 softdep_flushfiles(oldmnt, flags, td) 96 struct mount *oldmnt; 97 int flags; 98 struct thread *td; 99 { 100 101 panic("softdep_flushfiles called"); 102 } 103 104 int 105 softdep_mount(devvp, mp, fs, cred) 106 struct vnode *devvp; 107 struct mount *mp; 108 struct fs *fs; 109 struct ucred *cred; 110 { 111 112 return (0); 113 } 114 115 void 116 softdep_initialize() 117 { 118 119 return; 120 } 121 122 void 123 softdep_uninitialize() 124 { 125 126 return; 127 } 128 129 void 130 softdep_unmount(mp) 131 struct mount *mp; 132 { 133 134 } 135 136 void 137 softdep_setup_sbupdate(ump, fs, bp) 138 struct ufsmount *ump; 139 struct fs *fs; 140 struct buf *bp; 141 { 142 } 143 144 void 145 softdep_setup_inomapdep(bp, ip, newinum, mode) 146 struct buf *bp; 147 struct inode *ip; 148 ino_t newinum; 149 int mode; 150 { 151 152 panic("softdep_setup_inomapdep called"); 153 } 154 155 void 156 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 157 struct buf *bp; 158 struct mount *mp; 159 ufs2_daddr_t newblkno; 160 int frags; 161 int oldfrags; 162 { 163 164 panic("softdep_setup_blkmapdep called"); 165 } 166 167 void 168 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 169 struct inode *ip; 170 ufs_lbn_t lbn; 171 ufs2_daddr_t newblkno; 172 ufs2_daddr_t oldblkno; 173 long newsize; 174 long oldsize; 175 struct buf *bp; 176 { 177 178 panic("softdep_setup_allocdirect called"); 179 } 180 181 void 182 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) 183 struct inode *ip; 184 ufs_lbn_t lbn; 185 ufs2_daddr_t newblkno; 186 ufs2_daddr_t oldblkno; 187 long newsize; 188 long oldsize; 189 struct buf *bp; 190 { 191 192 panic("softdep_setup_allocext called"); 193 } 194 195 void 196 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 197 struct inode *ip; 198 ufs_lbn_t lbn; 199 struct buf *bp; 200 int ptrno; 201 ufs2_daddr_t newblkno; 202 ufs2_daddr_t oldblkno; 203 struct buf *nbp; 204 { 205 206 panic("softdep_setup_allocindir_page called"); 207 } 208 209 void 210 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 211 struct buf *nbp; 212 struct inode *ip; 213 struct buf *bp; 214 int ptrno; 215 ufs2_daddr_t newblkno; 216 { 217 218 panic("softdep_setup_allocindir_meta called"); 219 } 220 221 void 222 softdep_journal_freeblocks(ip, cred, length, flags) 223 struct inode *ip; 224 struct ucred *cred; 225 off_t length; 226 int flags; 227 { 228 229 panic("softdep_journal_freeblocks called"); 230 } 231 232 void 233 softdep_journal_fsync(ip) 234 struct inode *ip; 
235 { 236 237 panic("softdep_journal_fsync called"); 238 } 239 240 void 241 softdep_setup_freeblocks(ip, length, flags) 242 struct inode *ip; 243 off_t length; 244 int flags; 245 { 246 247 panic("softdep_setup_freeblocks called"); 248 } 249 250 void 251 softdep_freefile(pvp, ino, mode) 252 struct vnode *pvp; 253 ino_t ino; 254 int mode; 255 { 256 257 panic("softdep_freefile called"); 258 } 259 260 int 261 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 262 struct buf *bp; 263 struct inode *dp; 264 off_t diroffset; 265 ino_t newinum; 266 struct buf *newdirbp; 267 int isnewblk; 268 { 269 270 panic("softdep_setup_directory_add called"); 271 } 272 273 void 274 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 275 struct buf *bp; 276 struct inode *dp; 277 caddr_t base; 278 caddr_t oldloc; 279 caddr_t newloc; 280 int entrysize; 281 { 282 283 panic("softdep_change_directoryentry_offset called"); 284 } 285 286 void 287 softdep_setup_remove(bp, dp, ip, isrmdir) 288 struct buf *bp; 289 struct inode *dp; 290 struct inode *ip; 291 int isrmdir; 292 { 293 294 panic("softdep_setup_remove called"); 295 } 296 297 void 298 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 299 struct buf *bp; 300 struct inode *dp; 301 struct inode *ip; 302 ino_t newinum; 303 int isrmdir; 304 { 305 306 panic("softdep_setup_directory_change called"); 307 } 308 309 void 310 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 311 struct mount *mp; 312 struct buf *bp; 313 ufs2_daddr_t blkno; 314 int frags; 315 struct workhead *wkhd; 316 { 317 318 panic("%s called", __FUNCTION__); 319 } 320 321 void 322 softdep_setup_inofree(mp, bp, ino, wkhd) 323 struct mount *mp; 324 struct buf *bp; 325 ino_t ino; 326 struct workhead *wkhd; 327 { 328 329 panic("%s called", __FUNCTION__); 330 } 331 332 void 333 softdep_setup_unlink(dp, ip) 334 struct inode *dp; 335 struct inode *ip; 336 { 337 338 panic("%s called", __FUNCTION__); 339 } 340 341 void 342 softdep_setup_link(dp, ip) 343 struct inode *dp; 344 struct inode *ip; 345 { 346 347 panic("%s called", __FUNCTION__); 348 } 349 350 void 351 softdep_revert_link(dp, ip) 352 struct inode *dp; 353 struct inode *ip; 354 { 355 356 panic("%s called", __FUNCTION__); 357 } 358 359 void 360 softdep_setup_rmdir(dp, ip) 361 struct inode *dp; 362 struct inode *ip; 363 { 364 365 panic("%s called", __FUNCTION__); 366 } 367 368 void 369 softdep_revert_rmdir(dp, ip) 370 struct inode *dp; 371 struct inode *ip; 372 { 373 374 panic("%s called", __FUNCTION__); 375 } 376 377 void 378 softdep_setup_create(dp, ip) 379 struct inode *dp; 380 struct inode *ip; 381 { 382 383 panic("%s called", __FUNCTION__); 384 } 385 386 void 387 softdep_revert_create(dp, ip) 388 struct inode *dp; 389 struct inode *ip; 390 { 391 392 panic("%s called", __FUNCTION__); 393 } 394 395 void 396 softdep_setup_mkdir(dp, ip) 397 struct inode *dp; 398 struct inode *ip; 399 { 400 401 panic("%s called", __FUNCTION__); 402 } 403 404 void 405 softdep_revert_mkdir(dp, ip) 406 struct inode *dp; 407 struct inode *ip; 408 { 409 410 panic("%s called", __FUNCTION__); 411 } 412 413 void 414 softdep_setup_dotdot_link(dp, ip) 415 struct inode *dp; 416 struct inode *ip; 417 { 418 419 panic("%s called", __FUNCTION__); 420 } 421 422 int 423 softdep_prealloc(vp, waitok) 424 struct vnode *vp; 425 int waitok; 426 { 427 428 panic("%s called", __FUNCTION__); 429 430 return (0); 431 } 432 433 int 434 softdep_journal_lookup(mp, vpp) 435 struct mount *mp; 436 struct vnode **vpp; 437 { 438 439 return 
(ENOENT); 440 } 441 442 void 443 softdep_change_linkcnt(ip) 444 struct inode *ip; 445 { 446 447 panic("softdep_change_linkcnt called"); 448 } 449 450 void 451 softdep_load_inodeblock(ip) 452 struct inode *ip; 453 { 454 455 panic("softdep_load_inodeblock called"); 456 } 457 458 void 459 softdep_update_inodeblock(ip, bp, waitfor) 460 struct inode *ip; 461 struct buf *bp; 462 int waitfor; 463 { 464 465 panic("softdep_update_inodeblock called"); 466 } 467 468 int 469 softdep_fsync(vp) 470 struct vnode *vp; /* the "in_core" copy of the inode */ 471 { 472 473 return (0); 474 } 475 476 void 477 softdep_fsync_mountdev(vp) 478 struct vnode *vp; 479 { 480 481 return; 482 } 483 484 int 485 softdep_flushworklist(oldmnt, countp, td) 486 struct mount *oldmnt; 487 int *countp; 488 struct thread *td; 489 { 490 491 *countp = 0; 492 return (0); 493 } 494 495 int 496 softdep_sync_metadata(struct vnode *vp) 497 { 498 499 return (0); 500 } 501 502 int 503 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 504 { 505 506 return (0); 507 } 508 509 int 510 softdep_slowdown(vp) 511 struct vnode *vp; 512 { 513 514 panic("softdep_slowdown called"); 515 } 516 517 void 518 softdep_releasefile(ip) 519 struct inode *ip; /* inode with the zero effective link count */ 520 { 521 522 panic("softdep_releasefile called"); 523 } 524 525 int 526 softdep_request_cleanup(fs, vp, cred, resource) 527 struct fs *fs; 528 struct vnode *vp; 529 struct ucred *cred; 530 int resource; 531 { 532 533 return (0); 534 } 535 536 int 537 softdep_check_suspend(struct mount *mp, 538 struct vnode *devvp, 539 int softdep_deps, 540 int softdep_accdeps, 541 int secondary_writes, 542 int secondary_accwrites) 543 { 544 struct bufobj *bo; 545 int error; 546 547 (void) softdep_deps, 548 (void) softdep_accdeps; 549 550 bo = &devvp->v_bufobj; 551 ASSERT_BO_LOCKED(bo); 552 553 MNT_ILOCK(mp); 554 while (mp->mnt_secondary_writes != 0) { 555 BO_UNLOCK(bo); 556 msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), 557 (PUSER - 1) | PDROP, "secwr", 0); 558 BO_LOCK(bo); 559 MNT_ILOCK(mp); 560 } 561 562 /* 563 * Reasons for needing more work before suspend: 564 * - Dirty buffers on devvp. 565 * - Secondary writes occurred after start of vnode sync loop 566 */ 567 error = 0; 568 if (bo->bo_numoutput > 0 || 569 bo->bo_dirty.bv_cnt > 0 || 570 secondary_writes != 0 || 571 mp->mnt_secondary_writes != 0 || 572 secondary_accwrites != mp->mnt_secondary_accwrites) 573 error = EAGAIN; 574 BO_UNLOCK(bo); 575 return (error); 576 } 577 578 void 579 softdep_get_depcounts(struct mount *mp, 580 int *softdepactivep, 581 int *softdepactiveaccp) 582 { 583 (void) mp; 584 *softdepactivep = 0; 585 *softdepactiveaccp = 0; 586 } 587 588 void 589 softdep_buf_append(bp, wkhd) 590 struct buf *bp; 591 struct workhead *wkhd; 592 { 593 594 panic("softdep_buf_appendwork called"); 595 } 596 597 void 598 softdep_inode_append(ip, cred, wkhd) 599 struct inode *ip; 600 struct ucred *cred; 601 struct workhead *wkhd; 602 { 603 604 panic("softdep_inode_appendwork called"); 605 } 606 607 void 608 softdep_freework(wkhd) 609 struct workhead *wkhd; 610 { 611 612 panic("softdep_freework called"); 613 } 614 615 #else 616 617 FEATURE(softupdates, "FFS soft-updates support"); 618 619 /* 620 * These definitions need to be adapted to the system to which 621 * this file is being ported. 
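 * The D_* constants below number the dependency types.  They index the
 * dep_current, dep_total and dep_write statistics arrays and must stay
 * in step with the memtype[] table defined further down (for example,
 * memtype[D_PAGEDEP] must be M_PAGEDEP).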
622 */ 623 624 #define M_SOFTDEP_FLAGS (M_WAITOK) 625 626 #define D_PAGEDEP 0 627 #define D_INODEDEP 1 628 #define D_BMSAFEMAP 2 629 #define D_NEWBLK 3 630 #define D_ALLOCDIRECT 4 631 #define D_INDIRDEP 5 632 #define D_ALLOCINDIR 6 633 #define D_FREEFRAG 7 634 #define D_FREEBLKS 8 635 #define D_FREEFILE 9 636 #define D_DIRADD 10 637 #define D_MKDIR 11 638 #define D_DIRREM 12 639 #define D_NEWDIRBLK 13 640 #define D_FREEWORK 14 641 #define D_FREEDEP 15 642 #define D_JADDREF 16 643 #define D_JREMREF 17 644 #define D_JMVREF 18 645 #define D_JNEWBLK 19 646 #define D_JFREEBLK 20 647 #define D_JFREEFRAG 21 648 #define D_JSEG 22 649 #define D_JSEGDEP 23 650 #define D_SBDEP 24 651 #define D_JTRUNC 25 652 #define D_JFSYNC 26 653 #define D_SENTINAL 27 654 #define D_LAST D_SENTINAL 655 656 unsigned long dep_current[D_LAST + 1]; 657 unsigned long dep_total[D_LAST + 1]; 658 unsigned long dep_write[D_LAST + 1]; 659 660 661 static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, 662 "soft updates stats"); 663 static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, 664 "total dependencies allocated"); 665 static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, 666 "current dependencies allocated"); 667 static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, 668 "current dependencies written"); 669 670 #define SOFTDEP_TYPE(type, str, long) \ 671 static MALLOC_DEFINE(M_ ## type, #str, long); \ 672 SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ 673 &dep_total[D_ ## type], 0, ""); \ 674 SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ 675 &dep_current[D_ ## type], 0, ""); \ 676 SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ 677 &dep_write[D_ ## type], 0, ""); 678 679 SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); 680 SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); 681 SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, 682 "Block or frag allocated from cyl group map"); 683 SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); 684 SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); 685 SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); 686 SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); 687 SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); 688 SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); 689 SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); 690 SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); 691 SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); 692 SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); 693 SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); 694 SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); 695 SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); 696 SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); 697 SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); 698 SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); 699 SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); 700 SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); 701 SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); 702 SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); 703 SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); 704 SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); 705 SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); 706 SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); 707 708 static MALLOC_DEFINE(M_SAVEDINO, 
"savedino", "Saved inodes"); 709 static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); 710 711 /* 712 * translate from workitem type to memory type 713 * MUST match the defines above, such that memtype[D_XXX] == M_XXX 714 */ 715 static struct malloc_type *memtype[] = { 716 M_PAGEDEP, 717 M_INODEDEP, 718 M_BMSAFEMAP, 719 M_NEWBLK, 720 M_ALLOCDIRECT, 721 M_INDIRDEP, 722 M_ALLOCINDIR, 723 M_FREEFRAG, 724 M_FREEBLKS, 725 M_FREEFILE, 726 M_DIRADD, 727 M_MKDIR, 728 M_DIRREM, 729 M_NEWDIRBLK, 730 M_FREEWORK, 731 M_FREEDEP, 732 M_JADDREF, 733 M_JREMREF, 734 M_JMVREF, 735 M_JNEWBLK, 736 M_JFREEBLK, 737 M_JFREEFRAG, 738 M_JSEG, 739 M_JSEGDEP, 740 M_SBDEP, 741 M_JTRUNC, 742 M_JFSYNC 743 }; 744 745 static LIST_HEAD(mkdirlist, mkdir) mkdirlisthd; 746 747 #define DtoM(type) (memtype[type]) 748 749 /* 750 * Names of malloc types. 751 */ 752 #define TYPENAME(type) \ 753 ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") 754 /* 755 * End system adaptation definitions. 756 */ 757 758 #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) 759 #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) 760 761 /* 762 * Forward declarations. 763 */ 764 struct inodedep_hashhead; 765 struct newblk_hashhead; 766 struct pagedep_hashhead; 767 struct bmsafemap_hashhead; 768 769 /* 770 * Internal function prototypes. 771 */ 772 static void softdep_error(char *, int); 773 static void drain_output(struct vnode *); 774 static struct buf *getdirtybuf(struct buf *, struct mtx *, int); 775 static void clear_remove(struct thread *); 776 static void clear_inodedeps(struct thread *); 777 static void unlinked_inodedep(struct mount *, struct inodedep *); 778 static void clear_unlinked_inodedep(struct inodedep *); 779 static struct inodedep *first_unlinked_inodedep(struct ufsmount *); 780 static int flush_pagedep_deps(struct vnode *, struct mount *, 781 struct diraddhd *); 782 static int free_pagedep(struct pagedep *); 783 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); 784 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); 785 static int flush_deplist(struct allocdirectlst *, int, int *); 786 static int sync_cgs(struct mount *, int); 787 static int handle_written_filepage(struct pagedep *, struct buf *); 788 static int handle_written_sbdep(struct sbdep *, struct buf *); 789 static void initiate_write_sbdep(struct sbdep *); 790 static void diradd_inode_written(struct diradd *, struct inodedep *); 791 static int handle_written_indirdep(struct indirdep *, struct buf *, 792 struct buf**); 793 static int handle_written_inodeblock(struct inodedep *, struct buf *); 794 static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, 795 uint8_t *); 796 static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); 797 static void handle_written_jaddref(struct jaddref *); 798 static void handle_written_jremref(struct jremref *); 799 static void handle_written_jseg(struct jseg *, struct buf *); 800 static void handle_written_jnewblk(struct jnewblk *); 801 static void handle_written_jblkdep(struct jblkdep *); 802 static void handle_written_jfreefrag(struct jfreefrag *); 803 static void complete_jseg(struct jseg *); 804 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); 805 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); 806 static void jremref_write(struct jremref *, struct jseg *, uint8_t *); 807 static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); 808 static void 
jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); 809 static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); 810 static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); 811 static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); 812 static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); 813 static inline void inoref_write(struct inoref *, struct jseg *, 814 struct jrefrec *); 815 static void handle_allocdirect_partdone(struct allocdirect *, 816 struct workhead *); 817 static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, 818 struct workhead *); 819 static void indirdep_complete(struct indirdep *); 820 static int indirblk_lookup(struct mount *, ufs2_daddr_t); 821 static void indirblk_insert(struct freework *); 822 static void indirblk_remove(struct freework *); 823 static void handle_allocindir_partdone(struct allocindir *); 824 static void initiate_write_filepage(struct pagedep *, struct buf *); 825 static void initiate_write_indirdep(struct indirdep*, struct buf *); 826 static void handle_written_mkdir(struct mkdir *, int); 827 static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, 828 uint8_t *); 829 static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); 830 static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); 831 static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); 832 static void handle_workitem_freefile(struct freefile *); 833 static int handle_workitem_remove(struct dirrem *, int); 834 static struct dirrem *newdirrem(struct buf *, struct inode *, 835 struct inode *, int, struct dirrem **); 836 static struct indirdep *indirdep_lookup(struct mount *, struct inode *, 837 struct buf *); 838 static void cancel_indirdep(struct indirdep *, struct buf *, 839 struct freeblks *); 840 static void free_indirdep(struct indirdep *); 841 static void free_diradd(struct diradd *, struct workhead *); 842 static void merge_diradd(struct inodedep *, struct diradd *); 843 static void complete_diradd(struct diradd *); 844 static struct diradd *diradd_lookup(struct pagedep *, int); 845 static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, 846 struct jremref *); 847 static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, 848 struct jremref *); 849 static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, 850 struct jremref *, struct jremref *); 851 static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, 852 struct jremref *); 853 static void cancel_allocindir(struct allocindir *, struct buf *bp, 854 struct freeblks *, int); 855 static int setup_trunc_indir(struct freeblks *, struct inode *, 856 ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); 857 static void complete_trunc_indir(struct freework *); 858 static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, 859 int); 860 static void complete_mkdir(struct mkdir *); 861 static void free_newdirblk(struct newdirblk *); 862 static void free_jremref(struct jremref *); 863 static void free_jaddref(struct jaddref *); 864 static void free_jsegdep(struct jsegdep *); 865 static void free_jsegs(struct jblocks *); 866 static void rele_jseg(struct jseg *); 867 static void free_jseg(struct jseg *, struct jblocks *); 868 static void free_jnewblk(struct jnewblk *); 869 static void free_jblkdep(struct jblkdep *); 870 static void free_jfreefrag(struct jfreefrag *); 871 static void free_freedep(struct 
freedep *); 872 static void journal_jremref(struct dirrem *, struct jremref *, 873 struct inodedep *); 874 static void cancel_jnewblk(struct jnewblk *, struct workhead *); 875 static int cancel_jaddref(struct jaddref *, struct inodedep *, 876 struct workhead *); 877 static void cancel_jfreefrag(struct jfreefrag *); 878 static inline void setup_freedirect(struct freeblks *, struct inode *, 879 int, int); 880 static inline void setup_freeext(struct freeblks *, struct inode *, int, int); 881 static inline void setup_freeindir(struct freeblks *, struct inode *, int, 882 ufs_lbn_t, int); 883 static inline struct freeblks *newfreeblks(struct mount *, struct inode *); 884 static void freeblks_free(struct ufsmount *, struct freeblks *, int); 885 static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); 886 ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); 887 static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); 888 static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, 889 int, int); 890 static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); 891 static int cancel_pagedep(struct pagedep *, struct freeblks *, int); 892 static int deallocate_dependencies(struct buf *, struct freeblks *, int); 893 static void newblk_freefrag(struct newblk*); 894 static void free_newblk(struct newblk *); 895 static void cancel_allocdirect(struct allocdirectlst *, 896 struct allocdirect *, struct freeblks *); 897 static int check_inode_unwritten(struct inodedep *); 898 static int free_inodedep(struct inodedep *); 899 static void freework_freeblock(struct freework *); 900 static void freework_enqueue(struct freework *); 901 static int handle_workitem_freeblocks(struct freeblks *, int); 902 static int handle_complete_freeblocks(struct freeblks *, int); 903 static void handle_workitem_indirblk(struct freework *); 904 static void handle_written_freework(struct freework *); 905 static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); 906 static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, 907 struct workhead *); 908 static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, 909 struct inodedep *, struct allocindir *, ufs_lbn_t); 910 static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, 911 ufs2_daddr_t, ufs_lbn_t); 912 static void handle_workitem_freefrag(struct freefrag *); 913 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, 914 ufs_lbn_t); 915 static void allocdirect_merge(struct allocdirectlst *, 916 struct allocdirect *, struct allocdirect *); 917 static struct freefrag *allocindir_merge(struct allocindir *, 918 struct allocindir *); 919 static int bmsafemap_find(struct bmsafemap_hashhead *, struct mount *, int, 920 struct bmsafemap **); 921 static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, 922 int cg); 923 static int newblk_find(struct newblk_hashhead *, struct mount *, ufs2_daddr_t, 924 int, struct newblk **); 925 static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); 926 static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, 927 struct inodedep **); 928 static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); 929 static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, 930 int, struct pagedep **); 931 static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, 932 struct mount *mp, int, struct pagedep **); 933 static void 
pause_timer(void *); 934 static int request_cleanup(struct mount *, int); 935 static int process_worklist_item(struct mount *, int, int); 936 static void process_removes(struct vnode *); 937 static void process_truncates(struct vnode *); 938 static void jwork_move(struct workhead *, struct workhead *); 939 static void jwork_insert(struct workhead *, struct jsegdep *); 940 static void add_to_worklist(struct worklist *, int); 941 static void wake_worklist(struct worklist *); 942 static void wait_worklist(struct worklist *, char *); 943 static void remove_from_worklist(struct worklist *); 944 static void softdep_flush(void); 945 static void softdep_flushjournal(struct mount *); 946 static int softdep_speedup(void); 947 static void worklist_speedup(void); 948 static int journal_mount(struct mount *, struct fs *, struct ucred *); 949 static void journal_unmount(struct mount *); 950 static int journal_space(struct ufsmount *, int); 951 static void journal_suspend(struct ufsmount *); 952 static int journal_unsuspend(struct ufsmount *ump); 953 static void softdep_prelink(struct vnode *, struct vnode *); 954 static void add_to_journal(struct worklist *); 955 static void remove_from_journal(struct worklist *); 956 static void softdep_process_journal(struct mount *, struct worklist *, int); 957 static struct jremref *newjremref(struct dirrem *, struct inode *, 958 struct inode *ip, off_t, nlink_t); 959 static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, 960 uint16_t); 961 static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, 962 uint16_t); 963 static inline struct jsegdep *inoref_jseg(struct inoref *); 964 static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); 965 static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, 966 ufs2_daddr_t, int); 967 static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); 968 static void move_newblock_dep(struct jaddref *, struct inodedep *); 969 static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); 970 static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, 971 ufs2_daddr_t, long, ufs_lbn_t); 972 static struct freework *newfreework(struct ufsmount *, struct freeblks *, 973 struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); 974 static int jwait(struct worklist *, int); 975 static struct inodedep *inodedep_lookup_ip(struct inode *); 976 static int bmsafemap_rollbacks(struct bmsafemap *); 977 static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); 978 static void handle_jwork(struct workhead *); 979 static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, 980 struct mkdir **); 981 static struct jblocks *jblocks_create(void); 982 static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); 983 static void jblocks_free(struct jblocks *, struct mount *, int); 984 static void jblocks_destroy(struct jblocks *); 985 static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); 986 987 /* 988 * Exported softdep operations. 
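 * These four routines are the entry points through which the buffer
 * cache notifies soft updates that I/O on a dependency-tracked buffer
 * is being initiated or has completed, that such a buffer is being
 * deallocated, or that its outstanding dependencies must be counted.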
989 */ 990 static void softdep_disk_io_initiation(struct buf *); 991 static void softdep_disk_write_complete(struct buf *); 992 static void softdep_deallocate_dependencies(struct buf *); 993 static int softdep_count_dependencies(struct buf *bp, int); 994 995 static struct mtx lk; 996 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); 997 998 #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) 999 #define ACQUIRE_LOCK(lk) mtx_lock(lk) 1000 #define FREE_LOCK(lk) mtx_unlock(lk) 1001 1002 #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) 1003 #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) 1004 1005 /* 1006 * Worklist queue management. 1007 * These routines require that the lock be held. 1008 */ 1009 #ifndef /* NOT */ DEBUG 1010 #define WORKLIST_INSERT(head, item) do { \ 1011 (item)->wk_state |= ONWORKLIST; \ 1012 LIST_INSERT_HEAD(head, item, wk_list); \ 1013 } while (0) 1014 #define WORKLIST_REMOVE(item) do { \ 1015 (item)->wk_state &= ~ONWORKLIST; \ 1016 LIST_REMOVE(item, wk_list); \ 1017 } while (0) 1018 #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT 1019 #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE 1020 1021 #else /* DEBUG */ 1022 static void worklist_insert(struct workhead *, struct worklist *, int); 1023 static void worklist_remove(struct worklist *, int); 1024 1025 #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) 1026 #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) 1027 #define WORKLIST_REMOVE(item) worklist_remove(item, 1) 1028 #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) 1029 1030 static void 1031 worklist_insert(head, item, locked) 1032 struct workhead *head; 1033 struct worklist *item; 1034 int locked; 1035 { 1036 1037 if (locked) 1038 mtx_assert(&lk, MA_OWNED); 1039 if (item->wk_state & ONWORKLIST) 1040 panic("worklist_insert: %p %s(0x%X) already on list", 1041 item, TYPENAME(item->wk_type), item->wk_state); 1042 item->wk_state |= ONWORKLIST; 1043 LIST_INSERT_HEAD(head, item, wk_list); 1044 } 1045 1046 static void 1047 worklist_remove(item, locked) 1048 struct worklist *item; 1049 int locked; 1050 { 1051 1052 if (locked) 1053 mtx_assert(&lk, MA_OWNED); 1054 if ((item->wk_state & ONWORKLIST) == 0) 1055 panic("worklist_remove: %p %s(0x%X) not on list", 1056 item, TYPENAME(item->wk_type), item->wk_state); 1057 item->wk_state &= ~ONWORKLIST; 1058 LIST_REMOVE(item, wk_list); 1059 } 1060 #endif /* DEBUG */ 1061 1062 /* 1063 * Merge two jsegdeps keeping only the oldest one as newer references 1064 * can't be discarded until after older references. 1065 */ 1066 static inline struct jsegdep * 1067 jsegdep_merge(struct jsegdep *one, struct jsegdep *two) 1068 { 1069 struct jsegdep *swp; 1070 1071 if (two == NULL) 1072 return (one); 1073 1074 if (one->jd_seg->js_seq > two->jd_seg->js_seq) { 1075 swp = one; 1076 one = two; 1077 two = swp; 1078 } 1079 WORKLIST_REMOVE(&two->jd_list); 1080 free_jsegdep(two); 1081 1082 return (one); 1083 } 1084 1085 /* 1086 * If two freedeps are compatible free one to reduce list size. 1087 */ 1088 static inline struct freedep * 1089 freedep_merge(struct freedep *one, struct freedep *two) 1090 { 1091 if (two == NULL) 1092 return (one); 1093 1094 if (one->fd_freework == two->fd_freework) { 1095 WORKLIST_REMOVE(&two->fd_list); 1096 free_freedep(two); 1097 } 1098 return (one); 1099 } 1100 1101 /* 1102 * Move journal work from one list to another. Duplicate freedeps and 1103 * jsegdeps are coalesced to keep the lists as small as possible. 
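 * Only the oldest jsegdep is retained (see jsegdep_merge() above) and
 * freedeps referring to the same freework are collapsed into one
 * (see freedep_merge()).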
1104 */ 1105 static void 1106 jwork_move(dst, src) 1107 struct workhead *dst; 1108 struct workhead *src; 1109 { 1110 struct freedep *freedep; 1111 struct jsegdep *jsegdep; 1112 struct worklist *wkn; 1113 struct worklist *wk; 1114 1115 KASSERT(dst != src, 1116 ("jwork_move: dst == src")); 1117 freedep = NULL; 1118 jsegdep = NULL; 1119 LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { 1120 if (wk->wk_type == D_JSEGDEP) 1121 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1122 if (wk->wk_type == D_FREEDEP) 1123 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1124 } 1125 1126 mtx_assert(&lk, MA_OWNED); 1127 while ((wk = LIST_FIRST(src)) != NULL) { 1128 WORKLIST_REMOVE(wk); 1129 WORKLIST_INSERT(dst, wk); 1130 if (wk->wk_type == D_JSEGDEP) { 1131 jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); 1132 continue; 1133 } 1134 if (wk->wk_type == D_FREEDEP) 1135 freedep = freedep_merge(WK_FREEDEP(wk), freedep); 1136 } 1137 } 1138 1139 static void 1140 jwork_insert(dst, jsegdep) 1141 struct workhead *dst; 1142 struct jsegdep *jsegdep; 1143 { 1144 struct jsegdep *jsegdepn; 1145 struct worklist *wk; 1146 1147 LIST_FOREACH(wk, dst, wk_list) 1148 if (wk->wk_type == D_JSEGDEP) 1149 break; 1150 if (wk == NULL) { 1151 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1152 return; 1153 } 1154 jsegdepn = WK_JSEGDEP(wk); 1155 if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { 1156 WORKLIST_REMOVE(wk); 1157 free_jsegdep(jsegdepn); 1158 WORKLIST_INSERT(dst, &jsegdep->jd_list); 1159 } else 1160 free_jsegdep(jsegdep); 1161 } 1162 1163 /* 1164 * Routines for tracking and managing workitems. 1165 */ 1166 static void workitem_free(struct worklist *, int); 1167 static void workitem_alloc(struct worklist *, int, struct mount *); 1168 1169 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) 1170 1171 static void 1172 workitem_free(item, type) 1173 struct worklist *item; 1174 int type; 1175 { 1176 struct ufsmount *ump; 1177 mtx_assert(&lk, MA_OWNED); 1178 1179 #ifdef DEBUG 1180 if (item->wk_state & ONWORKLIST) 1181 panic("workitem_free: %s(0x%X) still on list", 1182 TYPENAME(item->wk_type), item->wk_state); 1183 if (item->wk_type != type) 1184 panic("workitem_free: type mismatch %s != %s", 1185 TYPENAME(item->wk_type), TYPENAME(type)); 1186 #endif 1187 if (item->wk_state & IOWAITING) 1188 wakeup(item); 1189 ump = VFSTOUFS(item->wk_mp); 1190 if (--ump->softdep_deps == 0 && ump->softdep_req) 1191 wakeup(&ump->softdep_deps); 1192 dep_current[type]--; 1193 free(item, DtoM(type)); 1194 } 1195 1196 static void 1197 workitem_alloc(item, type, mp) 1198 struct worklist *item; 1199 int type; 1200 struct mount *mp; 1201 { 1202 struct ufsmount *ump; 1203 1204 item->wk_type = type; 1205 item->wk_mp = mp; 1206 item->wk_state = 0; 1207 1208 ump = VFSTOUFS(mp); 1209 ACQUIRE_LOCK(&lk); 1210 dep_current[type]++; 1211 dep_total[type]++; 1212 ump->softdep_deps++; 1213 ump->softdep_accdeps++; 1214 FREE_LOCK(&lk); 1215 } 1216 1217 /* 1218 * Workitem queue management 1219 */ 1220 static int max_softdeps; /* maximum number of structs before slowdown */ 1221 static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ 1222 static int tickdelay = 2; /* number of ticks to pause during slowdown */ 1223 static int proc_waiting; /* tracks whether we have a timeout posted */ 1224 static int *stat_countp; /* statistic to count in proc_waiting timeout */ 1225 static struct callout softdep_callout; 1226 static int req_pending; 1227 static int req_clear_inodedeps; /* syncer process flush some inodedeps */ 1228 static 
int req_clear_remove; /* syncer process flush some freeblks */ 1229 1230 /* 1231 * runtime statistics 1232 */ 1233 static int stat_worklist_push; /* number of worklist cleanups */ 1234 static int stat_blk_limit_push; /* number of times block limit neared */ 1235 static int stat_ino_limit_push; /* number of times inode limit neared */ 1236 static int stat_blk_limit_hit; /* number of times block slowdown imposed */ 1237 static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ 1238 static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ 1239 static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ 1240 static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ 1241 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ 1242 static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ 1243 static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ 1244 static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ 1245 static int stat_journal_min; /* Times hit journal min threshold */ 1246 static int stat_journal_low; /* Times hit journal low threshold */ 1247 static int stat_journal_wait; /* Times blocked in jwait(). */ 1248 static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ 1249 static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ 1250 static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ 1251 static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ 1252 static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ 1253 static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ 1254 static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ 1255 static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ 1256 static int stat_cleanup_failures; /* Number of cleanup requests that failed */ 1257 1258 SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, 1259 &max_softdeps, 0, ""); 1260 SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, 1261 &tickdelay, 0, ""); 1262 SYSCTL_INT(_debug_softdep, OID_AUTO, maxindirdeps, CTLFLAG_RW, 1263 &maxindirdeps, 0, ""); 1264 SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, 1265 &stat_worklist_push, 0,""); 1266 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, 1267 &stat_blk_limit_push, 0,""); 1268 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, 1269 &stat_ino_limit_push, 0,""); 1270 SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, 1271 &stat_blk_limit_hit, 0, ""); 1272 SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, 1273 &stat_ino_limit_hit, 0, ""); 1274 SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, 1275 &stat_sync_limit_hit, 0, ""); 1276 SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, 1277 &stat_indir_blk_ptrs, 0, ""); 1278 SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, 1279 &stat_inode_bitmap, 0, ""); 1280 SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, 1281 &stat_direct_blk_ptrs, 0, ""); 1282 SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, 1283 &stat_dir_entry, 0, ""); 1284 SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, 1285 &stat_jaddref, 0, ""); 1286 SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, 1287 &stat_jnewblk, 0, ""); 1288 
SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, 1289 &stat_journal_low, 0, ""); 1290 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, 1291 &stat_journal_min, 0, ""); 1292 SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, 1293 &stat_journal_wait, 0, ""); 1294 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, 1295 &stat_jwait_filepage, 0, ""); 1296 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, 1297 &stat_jwait_freeblks, 0, ""); 1298 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, 1299 &stat_jwait_inode, 0, ""); 1300 SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, 1301 &stat_jwait_newblk, 0, ""); 1302 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, 1303 &stat_cleanup_blkrequests, 0, ""); 1304 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, 1305 &stat_cleanup_inorequests, 0, ""); 1306 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, 1307 &stat_cleanup_high_delay, 0, ""); 1308 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, 1309 &stat_cleanup_retries, 0, ""); 1310 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, 1311 &stat_cleanup_failures, 0, ""); 1312 1313 SYSCTL_DECL(_vfs_ffs); 1314 1315 LIST_HEAD(bmsafemap_hashhead, bmsafemap) *bmsafemap_hashtbl; 1316 static u_long bmsafemap_hash; /* size of hash table - 1 */ 1317 1318 static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ 1319 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, 1320 &compute_summary_at_mount, 0, "Recompute summary at mount"); 1321 1322 static struct proc *softdepproc; 1323 static struct kproc_desc softdep_kp = { 1324 "softdepflush", 1325 softdep_flush, 1326 &softdepproc 1327 }; 1328 SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, 1329 &softdep_kp); 1330 1331 static void 1332 softdep_flush(void) 1333 { 1334 struct mount *nmp; 1335 struct mount *mp; 1336 struct ufsmount *ump; 1337 struct thread *td; 1338 int remaining; 1339 int progress; 1340 int vfslocked; 1341 1342 td = curthread; 1343 td->td_pflags |= TDP_NORUNNINGBUF; 1344 1345 for (;;) { 1346 kproc_suspend_check(softdepproc); 1347 vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); 1348 ACQUIRE_LOCK(&lk); 1349 /* 1350 * If requested, try removing inode or removal dependencies. 
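 * The req_clear_inodedeps and req_clear_remove requests are posted when
 * the dependency limits are hit (see request_cleanup()); the
 * wakeup_one() calls below release a thread sleeping on proc_waiting
 * once the requested flush has been done.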
1351 */ 1352 if (req_clear_inodedeps) { 1353 clear_inodedeps(td); 1354 req_clear_inodedeps -= 1; 1355 wakeup_one(&proc_waiting); 1356 } 1357 if (req_clear_remove) { 1358 clear_remove(td); 1359 req_clear_remove -= 1; 1360 wakeup_one(&proc_waiting); 1361 } 1362 FREE_LOCK(&lk); 1363 VFS_UNLOCK_GIANT(vfslocked); 1364 remaining = progress = 0; 1365 mtx_lock(&mountlist_mtx); 1366 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1367 nmp = TAILQ_NEXT(mp, mnt_list); 1368 if (MOUNTEDSOFTDEP(mp) == 0) 1369 continue; 1370 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 1371 continue; 1372 vfslocked = VFS_LOCK_GIANT(mp); 1373 progress += softdep_process_worklist(mp, 0); 1374 ump = VFSTOUFS(mp); 1375 remaining += ump->softdep_on_worklist; 1376 VFS_UNLOCK_GIANT(vfslocked); 1377 mtx_lock(&mountlist_mtx); 1378 nmp = TAILQ_NEXT(mp, mnt_list); 1379 vfs_unbusy(mp); 1380 } 1381 mtx_unlock(&mountlist_mtx); 1382 if (remaining && progress) 1383 continue; 1384 ACQUIRE_LOCK(&lk); 1385 if (!req_pending) 1386 msleep(&req_pending, &lk, PVM, "sdflush", hz); 1387 req_pending = 0; 1388 FREE_LOCK(&lk); 1389 } 1390 } 1391 1392 static void 1393 worklist_speedup(void) 1394 { 1395 mtx_assert(&lk, MA_OWNED); 1396 if (req_pending == 0) { 1397 req_pending = 1; 1398 wakeup(&req_pending); 1399 } 1400 } 1401 1402 static int 1403 softdep_speedup(void) 1404 { 1405 1406 worklist_speedup(); 1407 bd_speedup(); 1408 return speedup_syncer(); 1409 } 1410 1411 /* 1412 * Add an item to the end of the work queue. 1413 * This routine requires that the lock be held. 1414 * This is the only routine that adds items to the list. 1415 * The following routine is the only one that removes items 1416 * and does so in order from first to last. 1417 */ 1418 1419 #define WK_HEAD 0x0001 /* Add to HEAD. */ 1420 #define WK_NODELAY 0x0002 /* Process immediately. */ 1421 1422 static void 1423 add_to_worklist(wk, flags) 1424 struct worklist *wk; 1425 int flags; 1426 { 1427 struct ufsmount *ump; 1428 1429 mtx_assert(&lk, MA_OWNED); 1430 ump = VFSTOUFS(wk->wk_mp); 1431 if (wk->wk_state & ONWORKLIST) 1432 panic("add_to_worklist: %s(0x%X) already on list", 1433 TYPENAME(wk->wk_type), wk->wk_state); 1434 wk->wk_state |= ONWORKLIST; 1435 if (ump->softdep_on_worklist == 0) { 1436 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1437 ump->softdep_worklist_tail = wk; 1438 } else if (flags & WK_HEAD) { 1439 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); 1440 } else { 1441 LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); 1442 ump->softdep_worklist_tail = wk; 1443 } 1444 ump->softdep_on_worklist += 1; 1445 if (flags & WK_NODELAY) 1446 worklist_speedup(); 1447 } 1448 1449 /* 1450 * Remove the item to be processed. If we are removing the last 1451 * item on the list, we need to recalculate the tail pointer. 
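 * The new tail is recovered by casting le_prev, which points at the
 * previous entry's linkage, back to a worklist pointer; this relies on
 * wk_list being the first member of struct worklist.  When the queue
 * becomes empty the stored tail is stale, but add_to_worklist() resets
 * it before it is used again.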
1452 */ 1453 static void 1454 remove_from_worklist(wk) 1455 struct worklist *wk; 1456 { 1457 struct ufsmount *ump; 1458 1459 ump = VFSTOUFS(wk->wk_mp); 1460 WORKLIST_REMOVE(wk); 1461 if (ump->softdep_worklist_tail == wk) 1462 ump->softdep_worklist_tail = 1463 (struct worklist *)wk->wk_list.le_prev; 1464 ump->softdep_on_worklist -= 1; 1465 } 1466 1467 static void 1468 wake_worklist(wk) 1469 struct worklist *wk; 1470 { 1471 if (wk->wk_state & IOWAITING) { 1472 wk->wk_state &= ~IOWAITING; 1473 wakeup(wk); 1474 } 1475 } 1476 1477 static void 1478 wait_worklist(wk, wmesg) 1479 struct worklist *wk; 1480 char *wmesg; 1481 { 1482 1483 wk->wk_state |= IOWAITING; 1484 msleep(wk, &lk, PVM, wmesg, 0); 1485 } 1486 1487 /* 1488 * Process that runs once per second to handle items in the background queue. 1489 * 1490 * Note that we ensure that everything is done in the order in which they 1491 * appear in the queue. The code below depends on this property to ensure 1492 * that blocks of a file are freed before the inode itself is freed. This 1493 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated 1494 * until all the old ones have been purged from the dependency lists. 1495 */ 1496 int 1497 softdep_process_worklist(mp, full) 1498 struct mount *mp; 1499 int full; 1500 { 1501 struct thread *td = curthread; 1502 int cnt, matchcnt; 1503 struct ufsmount *ump; 1504 long starttime; 1505 1506 KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); 1507 /* 1508 * Record the process identifier of our caller so that we can give 1509 * this process preferential treatment in request_cleanup below. 1510 */ 1511 matchcnt = 0; 1512 ump = VFSTOUFS(mp); 1513 ACQUIRE_LOCK(&lk); 1514 starttime = time_second; 1515 softdep_process_journal(mp, NULL, full?MNT_WAIT:0); 1516 while (ump->softdep_on_worklist > 0) { 1517 if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) 1518 break; 1519 else 1520 matchcnt += cnt; 1521 /* 1522 * If requested, try removing inode or removal dependencies. 1523 */ 1524 if (req_clear_inodedeps) { 1525 clear_inodedeps(td); 1526 req_clear_inodedeps -= 1; 1527 wakeup_one(&proc_waiting); 1528 } 1529 if (req_clear_remove) { 1530 clear_remove(td); 1531 req_clear_remove -= 1; 1532 wakeup_one(&proc_waiting); 1533 } 1534 /* 1535 * We do not generally want to stop for buffer space, but if 1536 * we are really being a buffer hog, we will stop and wait. 1537 */ 1538 if (should_yield()) { 1539 FREE_LOCK(&lk); 1540 kern_yield(PRI_UNCHANGED); 1541 bwillwrite(); 1542 ACQUIRE_LOCK(&lk); 1543 } 1544 /* 1545 * Never allow processing to run for more than one 1546 * second. Otherwise the other mountpoints may get 1547 * excessively backlogged. 1548 */ 1549 if (!full && starttime != time_second) 1550 break; 1551 } 1552 if (full == 0) 1553 journal_unsuspend(ump); 1554 FREE_LOCK(&lk); 1555 return (matchcnt); 1556 } 1557 1558 /* 1559 * Process all removes associated with a vnode if we are running out of 1560 * journal space. Any other process which attempts to flush these will 1561 * be unable as we have the vnodes locked. 
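 * Dirrems found in the INPROGRESS state are already being handled by
 * another thread, so we wait for them to finish and then rescan the
 * list from the top.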
 */
static void
process_removes(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct dirrem *dirrem;
	struct mount *mp;
	ino_t inum;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
top:
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
			/*
			 * If another thread is trying to lock this vnode
			 * it will fail but we must wait for it to do so
			 * before we can proceed.
			 */
			if (dirrem->dm_state & INPROGRESS) {
				wait_worklist(&dirrem->dm_list, "pwrwait");
				goto top;
			}
			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
			    (COMPLETE | ONWORKLIST))
				break;
		}
		if (dirrem == NULL)
			return;
		remove_from_worklist(&dirrem->dm_list);
		FREE_LOCK(&lk);
		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
			panic("process_removes: suspended filesystem");
		handle_workitem_remove(dirrem, 0);
		vn_finished_secondary_write(mp);
		ACQUIRE_LOCK(&lk);
	}
}

/*
 * Process all truncations associated with a vnode if we are running out
 * of journal space.  This is called when the vnode lock is already held
 * and no other process can clear the truncation.  It loops until no
 * pending truncations remain on which progress can be made.
 */
static void
process_truncates(vp)
	struct vnode *vp;
{
	struct inodedep *inodedep;
	struct freeblks *freeblks;
	struct mount *mp;
	ino_t inum;
	int cgwait;

	mtx_assert(&lk, MA_OWNED);

	mp = vp->v_mount;
	inum = VTOI(vp)->i_number;
	for (;;) {
		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
			return;
		cgwait = 0;
		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
			/* Journal entries not yet written. */
			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
				jwait(&LIST_FIRST(
				    &freeblks->fb_jblkdephd)->jb_list,
				    MNT_WAIT);
				break;
			}
			/* Another thread is executing this item. */
			if (freeblks->fb_state & INPROGRESS) {
				wait_worklist(&freeblks->fb_list, "ptrwait");
				break;
			}
			/* Freeblks is waiting on an inode write. */
			if ((freeblks->fb_state & COMPLETE) == 0) {
				FREE_LOCK(&lk);
				ffs_update(vp, 1);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
			    (ALLCOMPLETE | ONWORKLIST)) {
				remove_from_worklist(&freeblks->fb_list);
				freeblks->fb_state |= INPROGRESS;
				FREE_LOCK(&lk);
				if (vn_start_secondary_write(NULL, &mp,
				    V_NOWAIT))
					panic("process_truncates: "
					    "suspended filesystem");
				handle_workitem_freeblocks(freeblks, 0);
				vn_finished_secondary_write(mp);
				ACQUIRE_LOCK(&lk);
				break;
			}
			if (freeblks->fb_cgwait)
				cgwait++;
		}
		if (cgwait) {
			FREE_LOCK(&lk);
			sync_cgs(mp, MNT_WAIT);
			ffs_sync_snap(mp, MNT_WAIT);
			ACQUIRE_LOCK(&lk);
			continue;
		}
		if (freeblks == NULL)
			break;
	}
	return;
}

/*
 * Process one item on the worklist.
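 * A sentinel work item is inserted into the per-mount queue so that the
 * softdep lock can be released while an item is handled without losing
 * our place in the list; sentinels belonging to other threads are
 * simply skipped over.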
1682 */ 1683 static int 1684 process_worklist_item(mp, target, flags) 1685 struct mount *mp; 1686 int target; 1687 int flags; 1688 { 1689 struct worklist sintenel; 1690 struct worklist *wk; 1691 struct ufsmount *ump; 1692 int matchcnt; 1693 int error; 1694 1695 mtx_assert(&lk, MA_OWNED); 1696 KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); 1697 /* 1698 * If we are being called because of a process doing a 1699 * copy-on-write, then it is not safe to write as we may 1700 * recurse into the copy-on-write routine. 1701 */ 1702 if (curthread->td_pflags & TDP_COWINPROGRESS) 1703 return (-1); 1704 PHOLD(curproc); /* Don't let the stack go away. */ 1705 ump = VFSTOUFS(mp); 1706 matchcnt = 0; 1707 sintenel.wk_mp = NULL; 1708 sintenel.wk_type = D_SENTINAL; 1709 LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sintenel, wk_list); 1710 for (wk = LIST_NEXT(&sintenel, wk_list); wk != NULL; 1711 wk = LIST_NEXT(&sintenel, wk_list)) { 1712 if (wk->wk_type == D_SENTINAL) { 1713 LIST_REMOVE(&sintenel, wk_list); 1714 LIST_INSERT_AFTER(wk, &sintenel, wk_list); 1715 continue; 1716 } 1717 if (wk->wk_state & INPROGRESS) 1718 panic("process_worklist_item: %p already in progress.", 1719 wk); 1720 wk->wk_state |= INPROGRESS; 1721 remove_from_worklist(wk); 1722 FREE_LOCK(&lk); 1723 if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) 1724 panic("process_worklist_item: suspended filesystem"); 1725 switch (wk->wk_type) { 1726 case D_DIRREM: 1727 /* removal of a directory entry */ 1728 error = handle_workitem_remove(WK_DIRREM(wk), flags); 1729 break; 1730 1731 case D_FREEBLKS: 1732 /* releasing blocks and/or fragments from a file */ 1733 error = handle_workitem_freeblocks(WK_FREEBLKS(wk), 1734 flags); 1735 break; 1736 1737 case D_FREEFRAG: 1738 /* releasing a fragment when replaced as a file grows */ 1739 handle_workitem_freefrag(WK_FREEFRAG(wk)); 1740 error = 0; 1741 break; 1742 1743 case D_FREEFILE: 1744 /* releasing an inode when its link count drops to 0 */ 1745 handle_workitem_freefile(WK_FREEFILE(wk)); 1746 error = 0; 1747 break; 1748 1749 default: 1750 panic("%s_process_worklist: Unknown type %s", 1751 "softdep", TYPENAME(wk->wk_type)); 1752 /* NOTREACHED */ 1753 } 1754 vn_finished_secondary_write(mp); 1755 ACQUIRE_LOCK(&lk); 1756 if (error == 0) { 1757 if (++matchcnt == target) 1758 break; 1759 continue; 1760 } 1761 /* 1762 * We have to retry the worklist item later. Wake up any 1763 * waiters who may be able to complete it immediately and 1764 * add the item back to the head so we don't try to execute 1765 * it again. 1766 */ 1767 wk->wk_state &= ~INPROGRESS; 1768 wake_worklist(wk); 1769 add_to_worklist(wk, WK_HEAD); 1770 } 1771 LIST_REMOVE(&sintenel, wk_list); 1772 /* Sentinal could've become the tail from remove_from_worklist. */ 1773 if (ump->softdep_worklist_tail == &sintenel) 1774 ump->softdep_worklist_tail = 1775 (struct worklist *)sintenel.wk_list.le_prev; 1776 PRELE(curproc); 1777 return (matchcnt); 1778 } 1779 1780 /* 1781 * Move dependencies from one buffer to another. 
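 * Returns non-zero if the destination buffer must be marked dirty,
 * which is the case when a bmsafemap with pending rollbacks is moved
 * onto it.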
1782 */ 1783 int 1784 softdep_move_dependencies(oldbp, newbp) 1785 struct buf *oldbp; 1786 struct buf *newbp; 1787 { 1788 struct worklist *wk, *wktail; 1789 int dirty; 1790 1791 dirty = 0; 1792 wktail = NULL; 1793 ACQUIRE_LOCK(&lk); 1794 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { 1795 LIST_REMOVE(wk, wk_list); 1796 if (wk->wk_type == D_BMSAFEMAP && 1797 bmsafemap_rollbacks(WK_BMSAFEMAP(wk))) 1798 dirty = 1; 1799 if (wktail == 0) 1800 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); 1801 else 1802 LIST_INSERT_AFTER(wktail, wk, wk_list); 1803 wktail = wk; 1804 } 1805 FREE_LOCK(&lk); 1806 1807 return (dirty); 1808 } 1809 1810 /* 1811 * Purge the work list of all items associated with a particular mount point. 1812 */ 1813 int 1814 softdep_flushworklist(oldmnt, countp, td) 1815 struct mount *oldmnt; 1816 int *countp; 1817 struct thread *td; 1818 { 1819 struct vnode *devvp; 1820 int count, error = 0; 1821 struct ufsmount *ump; 1822 1823 /* 1824 * Alternately flush the block device associated with the mount 1825 * point and process any dependencies that the flushing 1826 * creates. We continue until no more worklist dependencies 1827 * are found. 1828 */ 1829 *countp = 0; 1830 ump = VFSTOUFS(oldmnt); 1831 devvp = ump->um_devvp; 1832 while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { 1833 *countp += count; 1834 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); 1835 error = VOP_FSYNC(devvp, MNT_WAIT, td); 1836 VOP_UNLOCK(devvp, 0); 1837 if (error) 1838 break; 1839 } 1840 return (error); 1841 } 1842 1843 int 1844 softdep_waitidle(struct mount *mp) 1845 { 1846 struct ufsmount *ump; 1847 int error; 1848 int i; 1849 1850 ump = VFSTOUFS(mp); 1851 ACQUIRE_LOCK(&lk); 1852 for (i = 0; i < 10 && ump->softdep_deps; i++) { 1853 ump->softdep_req = 1; 1854 if (ump->softdep_on_worklist) 1855 panic("softdep_waitidle: work added after flush."); 1856 msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); 1857 } 1858 ump->softdep_req = 0; 1859 FREE_LOCK(&lk); 1860 error = 0; 1861 if (i == 10) { 1862 error = EBUSY; 1863 printf("softdep_waitidle: Failed to flush worklist for %p\n", 1864 mp); 1865 } 1866 1867 return (error); 1868 } 1869 1870 /* 1871 * Flush all vnodes and worklist items associated with a specified mount point. 1872 */ 1873 int 1874 softdep_flushfiles(oldmnt, flags, td) 1875 struct mount *oldmnt; 1876 int flags; 1877 struct thread *td; 1878 { 1879 int error, depcount, loopcnt, retry_flush_count, retry; 1880 1881 loopcnt = 10; 1882 retry_flush_count = 3; 1883 retry_flush: 1884 error = 0; 1885 1886 /* 1887 * Alternately flush the vnodes associated with the mount 1888 * point and process any dependencies that the flushing 1889 * creates. In theory, this loop can happen at most twice, 1890 * but we give it a few extra just to be sure. 1891 */ 1892 for (; loopcnt > 0; loopcnt--) { 1893 /* 1894 * Do another flush in case any vnodes were brought in 1895 * as part of the cleanup operations. 1896 */ 1897 if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) 1898 break; 1899 if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || 1900 depcount == 0) 1901 break; 1902 } 1903 /* 1904 * If we are unmounting then it is an error to fail. If we 1905 * are simply trying to downgrade to read-only, then filesystem 1906 * activity can keep us busy forever, so we just fail with EBUSY. 
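 * During an unmount the flush is retried a limited number of times
 * (retry_flush_count) when new vnodes keep appearing on the mount's
 * vnode list while we are flushing.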
1907 */ 1908 if (loopcnt == 0) { 1909 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) 1910 panic("softdep_flushfiles: looping"); 1911 error = EBUSY; 1912 } 1913 if (!error) 1914 error = softdep_waitidle(oldmnt); 1915 if (!error) { 1916 if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { 1917 retry = 0; 1918 MNT_ILOCK(oldmnt); 1919 KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, 1920 ("softdep_flushfiles: !MNTK_NOINSMNTQ")); 1921 if (oldmnt->mnt_nvnodelistsize > 0) { 1922 if (--retry_flush_count > 0) { 1923 retry = 1; 1924 loopcnt = 3; 1925 } else 1926 error = EBUSY; 1927 } 1928 MNT_IUNLOCK(oldmnt); 1929 if (retry) 1930 goto retry_flush; 1931 } 1932 } 1933 return (error); 1934 } 1935 1936 /* 1937 * Structure hashing. 1938 * 1939 * There are three types of structures that can be looked up: 1940 * 1) pagedep structures identified by mount point, inode number, 1941 * and logical block. 1942 * 2) inodedep structures identified by mount point and inode number. 1943 * 3) newblk structures identified by mount point and 1944 * physical block number. 1945 * 1946 * The "pagedep" and "inodedep" dependency structures are hashed 1947 * separately from the file blocks and inodes to which they correspond. 1948 * This separation helps when the in-memory copy of an inode or 1949 * file block must be replaced. It also obviates the need to access 1950 * an inode or file page when simply updating (or de-allocating) 1951 * dependency structures. Lookup of newblk structures is needed to 1952 * find newly allocated blocks when trying to associate them with 1953 * their allocdirect or allocindir structure. 1954 * 1955 * The lookup routines optionally create and hash a new instance when 1956 * an existing entry is not found. 1957 */ 1958 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ 1959 #define NODELAY 0x0002 /* cannot do background work */ 1960 1961 /* 1962 * Structures and routines associated with pagedep caching. 1963 */ 1964 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; 1965 u_long pagedep_hash; /* size of hash table - 1 */ 1966 #define PAGEDEP_HASH(mp, inum, lbn) \ 1967 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ 1968 pagedep_hash]) 1969 1970 static int 1971 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) 1972 struct pagedep_hashhead *pagedephd; 1973 ino_t ino; 1974 ufs_lbn_t lbn; 1975 struct mount *mp; 1976 int flags; 1977 struct pagedep **pagedeppp; 1978 { 1979 struct pagedep *pagedep; 1980 1981 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 1982 if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && 1983 mp == pagedep->pd_list.wk_mp) { 1984 *pagedeppp = pagedep; 1985 return (1); 1986 } 1987 } 1988 *pagedeppp = NULL; 1989 return (0); 1990 } 1991 /* 1992 * Look up a pagedep. Return 1 if found, 0 otherwise. 1993 * If not found, allocate if DEPALLOC flag is passed. 1994 * Found or allocated entry is returned in pagedeppp. 1995 * This routine must be called with splbio interrupts blocked. 
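 * As an illustrative sketch only (hypothetical variables, not a quote of a
 * specific call site), a caller creating a new dependency would typically do
 *	(void) pagedep_lookup(mp, bp, ip->i_number, lbn, DEPALLOC, &pagedep);
 * so that a missing pagedep is allocated, hashed, and hooked onto bp->b_dep.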
1996 */ 1997 static int 1998 pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) 1999 struct mount *mp; 2000 struct buf *bp; 2001 ino_t ino; 2002 ufs_lbn_t lbn; 2003 int flags; 2004 struct pagedep **pagedeppp; 2005 { 2006 struct pagedep *pagedep; 2007 struct pagedep_hashhead *pagedephd; 2008 struct worklist *wk; 2009 int ret; 2010 int i; 2011 2012 mtx_assert(&lk, MA_OWNED); 2013 if (bp) { 2014 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 2015 if (wk->wk_type == D_PAGEDEP) { 2016 *pagedeppp = WK_PAGEDEP(wk); 2017 return (1); 2018 } 2019 } 2020 } 2021 pagedephd = PAGEDEP_HASH(mp, ino, lbn); 2022 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2023 if (ret) { 2024 if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) 2025 WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); 2026 return (1); 2027 } 2028 if ((flags & DEPALLOC) == 0) 2029 return (0); 2030 FREE_LOCK(&lk); 2031 pagedep = malloc(sizeof(struct pagedep), 2032 M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); 2033 workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); 2034 ACQUIRE_LOCK(&lk); 2035 ret = pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp); 2036 if (*pagedeppp) { 2037 /* 2038 * This should never happen since we only create pagedeps 2039 * with the vnode lock held. Could be an assert. 2040 */ 2041 WORKITEM_FREE(pagedep, D_PAGEDEP); 2042 return (ret); 2043 } 2044 pagedep->pd_ino = ino; 2045 pagedep->pd_lbn = lbn; 2046 LIST_INIT(&pagedep->pd_dirremhd); 2047 LIST_INIT(&pagedep->pd_pendinghd); 2048 for (i = 0; i < DAHASHSZ; i++) 2049 LIST_INIT(&pagedep->pd_diraddhd[i]); 2050 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); 2051 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); 2052 *pagedeppp = pagedep; 2053 return (0); 2054 } 2055 2056 /* 2057 * Structures and routines associated with inodedep caching. 2058 */ 2059 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; 2060 static u_long inodedep_hash; /* size of hash table - 1 */ 2061 #define INODEDEP_HASH(fs, inum) \ 2062 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) 2063 2064 static int 2065 inodedep_find(inodedephd, fs, inum, inodedeppp) 2066 struct inodedep_hashhead *inodedephd; 2067 struct fs *fs; 2068 ino_t inum; 2069 struct inodedep **inodedeppp; 2070 { 2071 struct inodedep *inodedep; 2072 2073 LIST_FOREACH(inodedep, inodedephd, id_hash) 2074 if (inum == inodedep->id_ino && fs == inodedep->id_fs) 2075 break; 2076 if (inodedep) { 2077 *inodedeppp = inodedep; 2078 return (1); 2079 } 2080 *inodedeppp = NULL; 2081 2082 return (0); 2083 } 2084 /* 2085 * Look up an inodedep. Return 1 if found, 0 if not found. 2086 * If not found, allocate if DEPALLOC flag is passed. 2087 * Found or allocated entry is returned in inodedeppp. 2088 * This routine must be called with splbio interrupts blocked. 2089 */ 2090 static int 2091 inodedep_lookup(mp, inum, flags, inodedeppp) 2092 struct mount *mp; 2093 ino_t inum; 2094 int flags; 2095 struct inodedep **inodedeppp; 2096 { 2097 struct inodedep *inodedep; 2098 struct inodedep_hashhead *inodedephd; 2099 struct fs *fs; 2100 2101 mtx_assert(&lk, MA_OWNED); 2102 fs = VFSTOUFS(mp)->um_fs; 2103 inodedephd = INODEDEP_HASH(fs, inum); 2104 2105 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) 2106 return (1); 2107 if ((flags & DEPALLOC) == 0) 2108 return (0); 2109 /* 2110 * If we are over our limit, try to improve the situation. 
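 * As an illustrative figure only: softdep_initialize() sets max_softdeps to
 * desiredvnodes * 4, so with desiredvnodes of 100000 the cleanup request
 * below fires once more than 400000 inodedep structures exist, unless the
 * caller passed NODELAY.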
2111 */ 2112 if (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0) 2113 request_cleanup(mp, FLUSH_INODES); 2114 FREE_LOCK(&lk); 2115 inodedep = malloc(sizeof(struct inodedep), 2116 M_INODEDEP, M_SOFTDEP_FLAGS); 2117 workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); 2118 ACQUIRE_LOCK(&lk); 2119 if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { 2120 WORKITEM_FREE(inodedep, D_INODEDEP); 2121 return (1); 2122 } 2123 inodedep->id_fs = fs; 2124 inodedep->id_ino = inum; 2125 inodedep->id_state = ALLCOMPLETE; 2126 inodedep->id_nlinkdelta = 0; 2127 inodedep->id_savedino1 = NULL; 2128 inodedep->id_savedsize = -1; 2129 inodedep->id_savedextsize = -1; 2130 inodedep->id_savednlink = -1; 2131 inodedep->id_bmsafemap = NULL; 2132 inodedep->id_mkdiradd = NULL; 2133 LIST_INIT(&inodedep->id_dirremhd); 2134 LIST_INIT(&inodedep->id_pendinghd); 2135 LIST_INIT(&inodedep->id_inowait); 2136 LIST_INIT(&inodedep->id_bufwait); 2137 TAILQ_INIT(&inodedep->id_inoreflst); 2138 TAILQ_INIT(&inodedep->id_inoupdt); 2139 TAILQ_INIT(&inodedep->id_newinoupdt); 2140 TAILQ_INIT(&inodedep->id_extupdt); 2141 TAILQ_INIT(&inodedep->id_newextupdt); 2142 TAILQ_INIT(&inodedep->id_freeblklst); 2143 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); 2144 *inodedeppp = inodedep; 2145 return (0); 2146 } 2147 2148 /* 2149 * Structures and routines associated with newblk caching. 2150 */ 2151 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; 2152 u_long newblk_hash; /* size of hash table - 1 */ 2153 #define NEWBLK_HASH(fs, inum) \ 2154 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) 2155 2156 static int 2157 newblk_find(newblkhd, mp, newblkno, flags, newblkpp) 2158 struct newblk_hashhead *newblkhd; 2159 struct mount *mp; 2160 ufs2_daddr_t newblkno; 2161 int flags; 2162 struct newblk **newblkpp; 2163 { 2164 struct newblk *newblk; 2165 2166 LIST_FOREACH(newblk, newblkhd, nb_hash) { 2167 if (newblkno != newblk->nb_newblkno) 2168 continue; 2169 if (mp != newblk->nb_list.wk_mp) 2170 continue; 2171 /* 2172 * If we're creating a new dependency don't match those that 2173 * have already been converted to allocdirects. This is for 2174 * a frag extend. 2175 */ 2176 if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) 2177 continue; 2178 break; 2179 } 2180 if (newblk) { 2181 *newblkpp = newblk; 2182 return (1); 2183 } 2184 *newblkpp = NULL; 2185 return (0); 2186 } 2187 2188 /* 2189 * Look up a newblk. Return 1 if found, 0 if not found. 2190 * If not found, allocate if DEPALLOC flag is passed. 2191 * Found or allocated entry is returned in newblkpp. 
2192 */ 2193 static int 2194 newblk_lookup(mp, newblkno, flags, newblkpp) 2195 struct mount *mp; 2196 ufs2_daddr_t newblkno; 2197 int flags; 2198 struct newblk **newblkpp; 2199 { 2200 struct newblk *newblk; 2201 struct newblk_hashhead *newblkhd; 2202 2203 newblkhd = NEWBLK_HASH(VFSTOUFS(mp)->um_fs, newblkno); 2204 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) 2205 return (1); 2206 if ((flags & DEPALLOC) == 0) 2207 return (0); 2208 FREE_LOCK(&lk); 2209 newblk = malloc(sizeof(union allblk), M_NEWBLK, 2210 M_SOFTDEP_FLAGS | M_ZERO); 2211 workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); 2212 ACQUIRE_LOCK(&lk); 2213 if (newblk_find(newblkhd, mp, newblkno, flags, newblkpp)) { 2214 WORKITEM_FREE(newblk, D_NEWBLK); 2215 return (1); 2216 } 2217 newblk->nb_freefrag = NULL; 2218 LIST_INIT(&newblk->nb_indirdeps); 2219 LIST_INIT(&newblk->nb_newdirblk); 2220 LIST_INIT(&newblk->nb_jwork); 2221 newblk->nb_state = ATTACHED; 2222 newblk->nb_newblkno = newblkno; 2223 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); 2224 *newblkpp = newblk; 2225 return (0); 2226 } 2227 2228 /* 2229 * Structures and routines associated with freed indirect block caching. 2230 */ 2231 struct freeworklst *indir_hashtbl; 2232 u_long indir_hash; /* size of hash table - 1 */ 2233 #define INDIR_HASH(mp, blkno) \ 2234 (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash]) 2235 2236 /* 2237 * Lookup an indirect block in the indir hash table. The freework is 2238 * removed and potentially freed. The caller must do a blocking journal 2239 * write before writing to the blkno. 2240 */ 2241 static int 2242 indirblk_lookup(mp, blkno) 2243 struct mount *mp; 2244 ufs2_daddr_t blkno; 2245 { 2246 struct freework *freework; 2247 struct freeworklst *wkhd; 2248 2249 wkhd = INDIR_HASH(mp, blkno); 2250 TAILQ_FOREACH(freework, wkhd, fw_next) { 2251 if (freework->fw_blkno != blkno) 2252 continue; 2253 if (freework->fw_list.wk_mp != mp) 2254 continue; 2255 indirblk_remove(freework); 2256 return (1); 2257 } 2258 return (0); 2259 } 2260 2261 /* 2262 * Insert an indirect block represented by freework into the indirblk 2263 * hash table so that it may prevent the block from being re-used prior 2264 * to the journal being written. 2265 */ 2266 static void 2267 indirblk_insert(freework) 2268 struct freework *freework; 2269 { 2270 struct freeblks *freeblks; 2271 struct jsegdep *jsegdep; 2272 struct worklist *wk; 2273 2274 freeblks = freework->fw_freeblks; 2275 LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list) 2276 if (wk->wk_type == D_JSEGDEP) 2277 break; 2278 if (wk == NULL) 2279 return; 2280 2281 jsegdep = WK_JSEGDEP(wk); 2282 LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs); 2283 TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp, 2284 freework->fw_blkno), freework, fw_next); 2285 freework->fw_state &= ~DEPCOMPLETE; 2286 } 2287 2288 static void 2289 indirblk_remove(freework) 2290 struct freework *freework; 2291 { 2292 2293 LIST_REMOVE(freework, fw_segs); 2294 TAILQ_REMOVE(INDIR_HASH(freework->fw_list.wk_mp, 2295 freework->fw_blkno), freework, fw_next); 2296 freework->fw_state |= DEPCOMPLETE; 2297 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 2298 WORKITEM_FREE(freework, D_FREEWORK); 2299 } 2300 2301 /* 2302 * Executed during filesystem system initialization before 2303 * mounting any filesystems. 
2304 */ 2305 void 2306 softdep_initialize() 2307 { 2308 int i; 2309 2310 LIST_INIT(&mkdirlisthd); 2311 max_softdeps = desiredvnodes * 4; 2312 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); 2313 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); 2314 newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash); 2315 bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash); 2316 i = 1 << (ffs(desiredvnodes / 10) - 1); 2317 indir_hashtbl = malloc(i * sizeof(indir_hashtbl[0]), M_FREEWORK, 2318 M_WAITOK); 2319 indir_hash = i - 1; 2320 for (i = 0; i <= indir_hash; i++) 2321 TAILQ_INIT(&indir_hashtbl[i]); 2322 2323 /* initialise bioops hack */ 2324 bioops.io_start = softdep_disk_io_initiation; 2325 bioops.io_complete = softdep_disk_write_complete; 2326 bioops.io_deallocate = softdep_deallocate_dependencies; 2327 bioops.io_countdeps = softdep_count_dependencies; 2328 2329 /* Initialize the callout with an mtx. */ 2330 callout_init_mtx(&softdep_callout, &lk, 0); 2331 } 2332 2333 /* 2334 * Executed after all filesystems have been unmounted during 2335 * filesystem module unload. 2336 */ 2337 void 2338 softdep_uninitialize() 2339 { 2340 2341 callout_drain(&softdep_callout); 2342 hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); 2343 hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); 2344 hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); 2345 hashdestroy(bmsafemap_hashtbl, M_BMSAFEMAP, bmsafemap_hash); 2346 free(indir_hashtbl, M_FREEWORK); 2347 } 2348 2349 /* 2350 * Called at mount time to notify the dependency code that a 2351 * filesystem wishes to use it. 2352 */ 2353 int 2354 softdep_mount(devvp, mp, fs, cred) 2355 struct vnode *devvp; 2356 struct mount *mp; 2357 struct fs *fs; 2358 struct ucred *cred; 2359 { 2360 struct csum_total cstotal; 2361 struct ufsmount *ump; 2362 struct cg *cgp; 2363 struct buf *bp; 2364 int error, cyl; 2365 2366 MNT_ILOCK(mp); 2367 mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; 2368 if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { 2369 mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | 2370 MNTK_SOFTDEP; 2371 mp->mnt_noasync++; 2372 } 2373 MNT_IUNLOCK(mp); 2374 ump = VFSTOUFS(mp); 2375 LIST_INIT(&ump->softdep_workitem_pending); 2376 LIST_INIT(&ump->softdep_journal_pending); 2377 TAILQ_INIT(&ump->softdep_unlinked); 2378 LIST_INIT(&ump->softdep_dirtycg); 2379 ump->softdep_worklist_tail = NULL; 2380 ump->softdep_on_worklist = 0; 2381 ump->softdep_deps = 0; 2382 if ((fs->fs_flags & FS_SUJ) && 2383 (error = journal_mount(mp, fs, cred)) != 0) { 2384 printf("Failed to start journal: %d\n", error); 2385 return (error); 2386 } 2387 /* 2388 * When doing soft updates, the counters in the 2389 * superblock may have gotten out of sync. Recomputation 2390 * can take a long time and can be deferred for background 2391 * fsck. However, the old behavior of scanning the cylinder 2392 * groups and recalculating them at mount time is available 2393 * by setting vfs.ffs.compute_summary_at_mount to one. 
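 * As an illustrative usage note (an administration detail outside this
 * file): the scan can be requested with e.g.
 * `sysctl vfs.ffs.compute_summary_at_mount=1` issued before the filesystem
 * is mounted.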
2394 */ 2395 if (compute_summary_at_mount == 0 || fs->fs_clean != 0) 2396 return (0); 2397 bzero(&cstotal, sizeof cstotal); 2398 for (cyl = 0; cyl < fs->fs_ncg; cyl++) { 2399 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), 2400 fs->fs_cgsize, cred, &bp)) != 0) { 2401 brelse(bp); 2402 return (error); 2403 } 2404 cgp = (struct cg *)bp->b_data; 2405 cstotal.cs_nffree += cgp->cg_cs.cs_nffree; 2406 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; 2407 cstotal.cs_nifree += cgp->cg_cs.cs_nifree; 2408 cstotal.cs_ndir += cgp->cg_cs.cs_ndir; 2409 fs->fs_cs(fs, cyl) = cgp->cg_cs; 2410 brelse(bp); 2411 } 2412 #ifdef DEBUG 2413 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) 2414 printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); 2415 #endif 2416 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); 2417 return (0); 2418 } 2419 2420 void 2421 softdep_unmount(mp) 2422 struct mount *mp; 2423 { 2424 2425 MNT_ILOCK(mp); 2426 mp->mnt_flag &= ~MNT_SOFTDEP; 2427 if (MOUNTEDSUJ(mp) == 0) { 2428 MNT_IUNLOCK(mp); 2429 return; 2430 } 2431 mp->mnt_flag &= ~MNT_SUJ; 2432 MNT_IUNLOCK(mp); 2433 journal_unmount(mp); 2434 } 2435 2436 struct jblocks { 2437 struct jseglst jb_segs; /* TAILQ of current segments. */ 2438 struct jseg *jb_writeseg; /* Next write to complete. */ 2439 struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ 2440 struct jextent *jb_extent; /* Extent array. */ 2441 uint64_t jb_nextseq; /* Next sequence number. */ 2442 uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ 2443 uint8_t jb_needseg; /* Need a forced segment. */ 2444 uint8_t jb_suspended; /* Did journal suspend writes? */ 2445 int jb_avail; /* Available extents. */ 2446 int jb_used; /* Last used extent. */ 2447 int jb_head; /* Allocator head. */ 2448 int jb_off; /* Allocator extent offset. */ 2449 int jb_blocks; /* Total disk blocks covered. */ 2450 int jb_free; /* Total disk blocks free. */ 2451 int jb_min; /* Minimum free space. */ 2452 int jb_low; /* Low on space. */ 2453 int jb_age; /* Insertion time of oldest rec. */ 2454 }; 2455 2456 struct jextent { 2457 ufs2_daddr_t je_daddr; /* Disk block address. */ 2458 int je_blocks; /* Disk block count. 
*/ 2459 }; 2460 2461 static struct jblocks * 2462 jblocks_create(void) 2463 { 2464 struct jblocks *jblocks; 2465 2466 jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); 2467 TAILQ_INIT(&jblocks->jb_segs); 2468 jblocks->jb_avail = 10; 2469 jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2470 M_JBLOCKS, M_WAITOK | M_ZERO); 2471 2472 return (jblocks); 2473 } 2474 2475 static ufs2_daddr_t 2476 jblocks_alloc(jblocks, bytes, actual) 2477 struct jblocks *jblocks; 2478 int bytes; 2479 int *actual; 2480 { 2481 ufs2_daddr_t daddr; 2482 struct jextent *jext; 2483 int freecnt; 2484 int blocks; 2485 2486 blocks = bytes / DEV_BSIZE; 2487 jext = &jblocks->jb_extent[jblocks->jb_head]; 2488 freecnt = jext->je_blocks - jblocks->jb_off; 2489 if (freecnt == 0) { 2490 jblocks->jb_off = 0; 2491 if (++jblocks->jb_head > jblocks->jb_used) 2492 jblocks->jb_head = 0; 2493 jext = &jblocks->jb_extent[jblocks->jb_head]; 2494 freecnt = jext->je_blocks; 2495 } 2496 if (freecnt > blocks) 2497 freecnt = blocks; 2498 *actual = freecnt * DEV_BSIZE; 2499 daddr = jext->je_daddr + jblocks->jb_off; 2500 jblocks->jb_off += freecnt; 2501 jblocks->jb_free -= freecnt; 2502 2503 return (daddr); 2504 } 2505 2506 static void 2507 jblocks_free(jblocks, mp, bytes) 2508 struct jblocks *jblocks; 2509 struct mount *mp; 2510 int bytes; 2511 { 2512 2513 jblocks->jb_free += bytes / DEV_BSIZE; 2514 if (jblocks->jb_suspended) 2515 worklist_speedup(); 2516 wakeup(jblocks); 2517 } 2518 2519 static void 2520 jblocks_destroy(jblocks) 2521 struct jblocks *jblocks; 2522 { 2523 2524 if (jblocks->jb_extent) 2525 free(jblocks->jb_extent, M_JBLOCKS); 2526 free(jblocks, M_JBLOCKS); 2527 } 2528 2529 static void 2530 jblocks_add(jblocks, daddr, blocks) 2531 struct jblocks *jblocks; 2532 ufs2_daddr_t daddr; 2533 int blocks; 2534 { 2535 struct jextent *jext; 2536 2537 jblocks->jb_blocks += blocks; 2538 jblocks->jb_free += blocks; 2539 jext = &jblocks->jb_extent[jblocks->jb_used]; 2540 /* Adding the first block. */ 2541 if (jext->je_daddr == 0) { 2542 jext->je_daddr = daddr; 2543 jext->je_blocks = blocks; 2544 return; 2545 } 2546 /* Extending the last extent. */ 2547 if (jext->je_daddr + jext->je_blocks == daddr) { 2548 jext->je_blocks += blocks; 2549 return; 2550 } 2551 /* Adding a new extent. 
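 * A worked illustration with made-up numbers: after an extent covering
 * daddr 1000 for 16 blocks, adding daddr 1016 is absorbed by the extension
 * case above, while a discontiguous daddr such as 4096 falls through to
 * here, starting a new jextent and doubling the extent array if all
 * jb_avail slots are already in use.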
*/ 2552 if (++jblocks->jb_used == jblocks->jb_avail) { 2553 jblocks->jb_avail *= 2; 2554 jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, 2555 M_JBLOCKS, M_WAITOK | M_ZERO); 2556 memcpy(jext, jblocks->jb_extent, 2557 sizeof(struct jextent) * jblocks->jb_used); 2558 free(jblocks->jb_extent, M_JBLOCKS); 2559 jblocks->jb_extent = jext; 2560 } 2561 jext = &jblocks->jb_extent[jblocks->jb_used]; 2562 jext->je_daddr = daddr; 2563 jext->je_blocks = blocks; 2564 return; 2565 } 2566 2567 int 2568 softdep_journal_lookup(mp, vpp) 2569 struct mount *mp; 2570 struct vnode **vpp; 2571 { 2572 struct componentname cnp; 2573 struct vnode *dvp; 2574 ino_t sujournal; 2575 int error; 2576 2577 error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); 2578 if (error) 2579 return (error); 2580 bzero(&cnp, sizeof(cnp)); 2581 cnp.cn_nameiop = LOOKUP; 2582 cnp.cn_flags = ISLASTCN; 2583 cnp.cn_thread = curthread; 2584 cnp.cn_cred = curthread->td_ucred; 2585 cnp.cn_pnbuf = SUJ_FILE; 2586 cnp.cn_nameptr = SUJ_FILE; 2587 cnp.cn_namelen = strlen(SUJ_FILE); 2588 error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); 2589 vput(dvp); 2590 if (error != 0) 2591 return (error); 2592 error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); 2593 return (error); 2594 } 2595 2596 /* 2597 * Open and verify the journal file. 2598 */ 2599 static int 2600 journal_mount(mp, fs, cred) 2601 struct mount *mp; 2602 struct fs *fs; 2603 struct ucred *cred; 2604 { 2605 struct jblocks *jblocks; 2606 struct vnode *vp; 2607 struct inode *ip; 2608 ufs2_daddr_t blkno; 2609 int bcount; 2610 int error; 2611 int i; 2612 2613 error = softdep_journal_lookup(mp, &vp); 2614 if (error != 0) { 2615 printf("Failed to find journal. Use tunefs to create one\n"); 2616 return (error); 2617 } 2618 ip = VTOI(vp); 2619 if (ip->i_size < SUJ_MIN) { 2620 error = ENOSPC; 2621 goto out; 2622 } 2623 bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ 2624 jblocks = jblocks_create(); 2625 for (i = 0; i < bcount; i++) { 2626 error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); 2627 if (error) 2628 break; 2629 jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); 2630 } 2631 if (error) { 2632 jblocks_destroy(jblocks); 2633 goto out; 2634 } 2635 jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ 2636 jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ 2637 VFSTOUFS(mp)->softdep_jblocks = jblocks; 2638 out: 2639 if (error == 0) { 2640 MNT_ILOCK(mp); 2641 mp->mnt_flag |= MNT_SUJ; 2642 mp->mnt_flag &= ~MNT_SOFTDEP; 2643 MNT_IUNLOCK(mp); 2644 /* 2645 * Only validate the journal contents if the 2646 * filesystem is clean, otherwise we write the logs 2647 * but they'll never be used. If the filesystem was 2648 * still dirty when we mounted it the journal is 2649 * invalid and a new journal can only be valid if it 2650 * starts from a clean mount. 2651 */ 2652 if (fs->fs_clean) { 2653 DIP_SET(ip, i_modrev, fs->fs_mtime); 2654 ip->i_flags |= IN_MODIFIED; 2655 ffs_update(vp, 1); 2656 } 2657 } 2658 vput(vp); 2659 return (error); 2660 } 2661 2662 static void 2663 journal_unmount(mp) 2664 struct mount *mp; 2665 { 2666 struct ufsmount *ump; 2667 2668 ump = VFSTOUFS(mp); 2669 if (ump->softdep_jblocks) 2670 jblocks_destroy(ump->softdep_jblocks); 2671 ump->softdep_jblocks = NULL; 2672 } 2673 2674 /* 2675 * Called when a journal record is ready to be written. Space is allocated 2676 * and the journal entry is created when the journal is flushed to stable 2677 * store. 
2678 */ 2679 static void 2680 add_to_journal(wk) 2681 struct worklist *wk; 2682 { 2683 struct ufsmount *ump; 2684 2685 mtx_assert(&lk, MA_OWNED); 2686 ump = VFSTOUFS(wk->wk_mp); 2687 if (wk->wk_state & ONWORKLIST) 2688 panic("add_to_journal: %s(0x%X) already on list", 2689 TYPENAME(wk->wk_type), wk->wk_state); 2690 wk->wk_state |= ONWORKLIST | DEPCOMPLETE; 2691 if (LIST_EMPTY(&ump->softdep_journal_pending)) { 2692 ump->softdep_jblocks->jb_age = ticks; 2693 LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); 2694 } else 2695 LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); 2696 ump->softdep_journal_tail = wk; 2697 ump->softdep_on_journal += 1; 2698 } 2699 2700 /* 2701 * Remove an arbitrary item from the journal worklist while maintaining 2702 * the tail pointer. This happens when a new operation obviates the need 2703 * to journal an old operation. 2704 */ 2705 static void 2706 remove_from_journal(wk) 2707 struct worklist *wk; 2708 { 2709 struct ufsmount *ump; 2710 2711 mtx_assert(&lk, MA_OWNED); 2712 ump = VFSTOUFS(wk->wk_mp); 2713 #ifdef SUJ_DEBUG 2714 { 2715 struct worklist *wkn; 2716 2717 LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) 2718 if (wkn == wk) 2719 break; 2720 if (wkn == NULL) 2721 panic("remove_from_journal: %p is not in journal", wk); 2722 } 2723 #endif 2724 /* 2725 * We emulate a TAILQ to save space in most structures which do not 2726 * require TAILQ semantics. Here we must update the tail position 2727 * when removing the tail which is not the final entry. This works 2728 * only if the worklist linkage is at the beginning of the structure. 2729 */ 2730 if (ump->softdep_journal_tail == wk) 2731 ump->softdep_journal_tail = 2732 (struct worklist *)wk->wk_list.le_prev; 2733 2734 WORKLIST_REMOVE(wk); 2735 ump->softdep_on_journal -= 1; 2736 } 2737 2738 /* 2739 * Check for journal space as well as dependency limits so the prelink 2740 * code can throttle both journaled and non-journaled filesystems. 2741 * Threshold is 0 for low and 1 for min. 2742 */ 2743 static int 2744 journal_space(ump, thresh) 2745 struct ufsmount *ump; 2746 int thresh; 2747 { 2748 struct jblocks *jblocks; 2749 int avail; 2750 2751 jblocks = ump->softdep_jblocks; 2752 if (jblocks == NULL) 2753 return (1); 2754 /* 2755 * We use a tighter restriction here to prevent request_cleanup(), 2756 * running in other threads, from running into locks we currently hold.
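 * As a worked example of the space computation below, with assumed
 * constants (the 32-byte JREC_SIZE used for journal records and a 512-byte
 * DEV_BSIZE): 1000 pending journal records consume (1000 * 32) / 512 = 62
 * device blocks, so with jb_free at 100 blocks space is reported only while
 * 100 - 62 = 38 exceeds the requested threshold.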
2757 */ 2758 if (dep_current[D_INODEDEP] > (max_softdeps / 10) * 9) 2759 return (0); 2760 if (thresh) 2761 thresh = jblocks->jb_min; 2762 else 2763 thresh = jblocks->jb_low; 2764 avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; 2765 avail = jblocks->jb_free - avail; 2766 2767 return (avail > thresh); 2768 } 2769 2770 static void 2771 journal_suspend(ump) 2772 struct ufsmount *ump; 2773 { 2774 struct jblocks *jblocks; 2775 struct mount *mp; 2776 2777 mp = UFSTOVFS(ump); 2778 jblocks = ump->softdep_jblocks; 2779 MNT_ILOCK(mp); 2780 if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 2781 stat_journal_min++; 2782 mp->mnt_kern_flag |= MNTK_SUSPEND; 2783 mp->mnt_susp_owner = FIRST_THREAD_IN_PROC(softdepproc); 2784 } 2785 jblocks->jb_suspended = 1; 2786 MNT_IUNLOCK(mp); 2787 } 2788 2789 static int 2790 journal_unsuspend(struct ufsmount *ump) 2791 { 2792 struct jblocks *jblocks; 2793 struct mount *mp; 2794 2795 mp = UFSTOVFS(ump); 2796 jblocks = ump->softdep_jblocks; 2797 2798 if (jblocks != NULL && jblocks->jb_suspended && 2799 journal_space(ump, jblocks->jb_min)) { 2800 jblocks->jb_suspended = 0; 2801 FREE_LOCK(&lk); 2802 mp->mnt_susp_owner = curthread; 2803 vfs_write_resume(mp); 2804 ACQUIRE_LOCK(&lk); 2805 return (1); 2806 } 2807 return (0); 2808 } 2809 2810 /* 2811 * Called before any allocation function to be certain that there is 2812 * sufficient space in the journal prior to creating any new records. 2813 * Since in the case of block allocation we may have multiple locked 2814 * buffers at the time of the actual allocation we can not block 2815 * when the journal records are created. Doing so would create a deadlock 2816 * if any of these buffers needed to be flushed to reclaim space. Instead 2817 * we require a sufficiently large amount of available space such that 2818 * each thread in the system could have passed this allocation check and 2819 * still have sufficient free space. With 20% of a minimum journal size 2820 * of 1MB we have 6553 records available. 2821 */ 2822 int 2823 softdep_prealloc(vp, waitok) 2824 struct vnode *vp; 2825 int waitok; 2826 { 2827 struct ufsmount *ump; 2828 2829 if (DOINGSUJ(vp) == 0) 2830 return (0); 2831 ump = VFSTOUFS(vp->v_mount); 2832 ACQUIRE_LOCK(&lk); 2833 if (journal_space(ump, 0)) { 2834 FREE_LOCK(&lk); 2835 return (0); 2836 } 2837 stat_journal_low++; 2838 FREE_LOCK(&lk); 2839 if (waitok == MNT_NOWAIT) 2840 return (ENOSPC); 2841 /* 2842 * Attempt to sync this vnode once to flush any journal 2843 * work attached to it. 2844 */ 2845 if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) 2846 ffs_syncvnode(vp, waitok); 2847 ACQUIRE_LOCK(&lk); 2848 process_removes(vp); 2849 process_truncates(vp); 2850 if (journal_space(ump, 0) == 0) { 2851 softdep_speedup(); 2852 if (journal_space(ump, 1) == 0) 2853 journal_suspend(ump); 2854 } 2855 FREE_LOCK(&lk); 2856 2857 return (0); 2858 } 2859 2860 /* 2861 * Before adjusting a link count on a vnode verify that we have sufficient 2862 * journal space. If not, process operations that depend on the currently 2863 * locked pair of vnodes to try to flush space as the syncer, buf daemon, 2864 * and softdep flush threads can not acquire these locks to reclaim space. 
2865 */ 2866 static void 2867 softdep_prelink(dvp, vp) 2868 struct vnode *dvp; 2869 struct vnode *vp; 2870 { 2871 struct ufsmount *ump; 2872 2873 ump = VFSTOUFS(dvp->v_mount); 2874 mtx_assert(&lk, MA_OWNED); 2875 if (journal_space(ump, 0)) 2876 return; 2877 stat_journal_low++; 2878 FREE_LOCK(&lk); 2879 if (vp) 2880 ffs_syncvnode(vp, MNT_NOWAIT); 2881 ffs_syncvnode(dvp, MNT_WAIT); 2882 ACQUIRE_LOCK(&lk); 2883 /* Process vp before dvp as it may create .. removes. */ 2884 if (vp) { 2885 process_removes(vp); 2886 process_truncates(vp); 2887 } 2888 process_removes(dvp); 2889 process_truncates(dvp); 2890 softdep_speedup(); 2891 process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); 2892 if (journal_space(ump, 0) == 0) { 2893 softdep_speedup(); 2894 if (journal_space(ump, 1) == 0) 2895 journal_suspend(ump); 2896 } 2897 } 2898 2899 static void 2900 jseg_write(ump, jseg, data) 2901 struct ufsmount *ump; 2902 struct jseg *jseg; 2903 uint8_t *data; 2904 { 2905 struct jsegrec *rec; 2906 2907 rec = (struct jsegrec *)data; 2908 rec->jsr_seq = jseg->js_seq; 2909 rec->jsr_oldest = jseg->js_oldseq; 2910 rec->jsr_cnt = jseg->js_cnt; 2911 rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; 2912 rec->jsr_crc = 0; 2913 rec->jsr_time = ump->um_fs->fs_mtime; 2914 } 2915 2916 static inline void 2917 inoref_write(inoref, jseg, rec) 2918 struct inoref *inoref; 2919 struct jseg *jseg; 2920 struct jrefrec *rec; 2921 { 2922 2923 inoref->if_jsegdep->jd_seg = jseg; 2924 rec->jr_ino = inoref->if_ino; 2925 rec->jr_parent = inoref->if_parent; 2926 rec->jr_nlink = inoref->if_nlink; 2927 rec->jr_mode = inoref->if_mode; 2928 rec->jr_diroff = inoref->if_diroff; 2929 } 2930 2931 static void 2932 jaddref_write(jaddref, jseg, data) 2933 struct jaddref *jaddref; 2934 struct jseg *jseg; 2935 uint8_t *data; 2936 { 2937 struct jrefrec *rec; 2938 2939 rec = (struct jrefrec *)data; 2940 rec->jr_op = JOP_ADDREF; 2941 inoref_write(&jaddref->ja_ref, jseg, rec); 2942 } 2943 2944 static void 2945 jremref_write(jremref, jseg, data) 2946 struct jremref *jremref; 2947 struct jseg *jseg; 2948 uint8_t *data; 2949 { 2950 struct jrefrec *rec; 2951 2952 rec = (struct jrefrec *)data; 2953 rec->jr_op = JOP_REMREF; 2954 inoref_write(&jremref->jr_ref, jseg, rec); 2955 } 2956 2957 static void 2958 jmvref_write(jmvref, jseg, data) 2959 struct jmvref *jmvref; 2960 struct jseg *jseg; 2961 uint8_t *data; 2962 { 2963 struct jmvrec *rec; 2964 2965 rec = (struct jmvrec *)data; 2966 rec->jm_op = JOP_MVREF; 2967 rec->jm_ino = jmvref->jm_ino; 2968 rec->jm_parent = jmvref->jm_parent; 2969 rec->jm_oldoff = jmvref->jm_oldoff; 2970 rec->jm_newoff = jmvref->jm_newoff; 2971 } 2972 2973 static void 2974 jnewblk_write(jnewblk, jseg, data) 2975 struct jnewblk *jnewblk; 2976 struct jseg *jseg; 2977 uint8_t *data; 2978 { 2979 struct jblkrec *rec; 2980 2981 jnewblk->jn_jsegdep->jd_seg = jseg; 2982 rec = (struct jblkrec *)data; 2983 rec->jb_op = JOP_NEWBLK; 2984 rec->jb_ino = jnewblk->jn_ino; 2985 rec->jb_blkno = jnewblk->jn_blkno; 2986 rec->jb_lbn = jnewblk->jn_lbn; 2987 rec->jb_frags = jnewblk->jn_frags; 2988 rec->jb_oldfrags = jnewblk->jn_oldfrags; 2989 } 2990 2991 static void 2992 jfreeblk_write(jfreeblk, jseg, data) 2993 struct jfreeblk *jfreeblk; 2994 struct jseg *jseg; 2995 uint8_t *data; 2996 { 2997 struct jblkrec *rec; 2998 2999 jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; 3000 rec = (struct jblkrec *)data; 3001 rec->jb_op = JOP_FREEBLK; 3002 rec->jb_ino = jfreeblk->jf_ino; 3003 rec->jb_blkno = jfreeblk->jf_blkno; 3004 rec->jb_lbn = jfreeblk->jf_lbn; 3005 
rec->jb_frags = jfreeblk->jf_frags; 3006 rec->jb_oldfrags = 0; 3007 } 3008 3009 static void 3010 jfreefrag_write(jfreefrag, jseg, data) 3011 struct jfreefrag *jfreefrag; 3012 struct jseg *jseg; 3013 uint8_t *data; 3014 { 3015 struct jblkrec *rec; 3016 3017 jfreefrag->fr_jsegdep->jd_seg = jseg; 3018 rec = (struct jblkrec *)data; 3019 rec->jb_op = JOP_FREEBLK; 3020 rec->jb_ino = jfreefrag->fr_ino; 3021 rec->jb_blkno = jfreefrag->fr_blkno; 3022 rec->jb_lbn = jfreefrag->fr_lbn; 3023 rec->jb_frags = jfreefrag->fr_frags; 3024 rec->jb_oldfrags = 0; 3025 } 3026 3027 static void 3028 jtrunc_write(jtrunc, jseg, data) 3029 struct jtrunc *jtrunc; 3030 struct jseg *jseg; 3031 uint8_t *data; 3032 { 3033 struct jtrncrec *rec; 3034 3035 jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; 3036 rec = (struct jtrncrec *)data; 3037 rec->jt_op = JOP_TRUNC; 3038 rec->jt_ino = jtrunc->jt_ino; 3039 rec->jt_size = jtrunc->jt_size; 3040 rec->jt_extsize = jtrunc->jt_extsize; 3041 } 3042 3043 static void 3044 jfsync_write(jfsync, jseg, data) 3045 struct jfsync *jfsync; 3046 struct jseg *jseg; 3047 uint8_t *data; 3048 { 3049 struct jtrncrec *rec; 3050 3051 rec = (struct jtrncrec *)data; 3052 rec->jt_op = JOP_SYNC; 3053 rec->jt_ino = jfsync->jfs_ino; 3054 rec->jt_size = jfsync->jfs_size; 3055 rec->jt_extsize = jfsync->jfs_extsize; 3056 } 3057 3058 static void 3059 softdep_flushjournal(mp) 3060 struct mount *mp; 3061 { 3062 struct jblocks *jblocks; 3063 struct ufsmount *ump; 3064 3065 if (MOUNTEDSUJ(mp) == 0) 3066 return; 3067 ump = VFSTOUFS(mp); 3068 jblocks = ump->softdep_jblocks; 3069 ACQUIRE_LOCK(&lk); 3070 while (ump->softdep_on_journal) { 3071 jblocks->jb_needseg = 1; 3072 softdep_process_journal(mp, NULL, MNT_WAIT); 3073 } 3074 FREE_LOCK(&lk); 3075 } 3076 3077 /* 3078 * Flush some journal records to disk. 3079 */ 3080 static void 3081 softdep_process_journal(mp, needwk, flags) 3082 struct mount *mp; 3083 struct worklist *needwk; 3084 int flags; 3085 { 3086 struct jblocks *jblocks; 3087 struct ufsmount *ump; 3088 struct worklist *wk; 3089 struct jseg *jseg; 3090 struct buf *bp; 3091 uint8_t *data; 3092 struct fs *fs; 3093 int segwritten; 3094 int jrecmin; /* Minimum records per block. */ 3095 int jrecmax; /* Maximum records per block. */ 3096 int size; 3097 int cnt; 3098 int off; 3099 int devbsize; 3100 3101 if (MOUNTEDSUJ(mp) == 0) 3102 return; 3103 ump = VFSTOUFS(mp); 3104 fs = ump->um_fs; 3105 jblocks = ump->softdep_jblocks; 3106 devbsize = ump->um_devvp->v_bufobj.bo_bsize; 3107 /* 3108 * We write anywhere between a disk block and fs block. The upper 3109 * bound is picked to prevent buffer cache fragmentation and limit 3110 * processing time per I/O. 3111 */ 3112 jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ 3113 jrecmax = (fs->fs_bsize / devbsize) * jrecmin; 3114 segwritten = 0; 3115 for (;;) { 3116 cnt = ump->softdep_on_journal; 3117 /* 3118 * Criteria for writing a segment: 3119 * 1) We have a full block. 3120 * 2) We're called from jwait() and haven't found the 3121 * journal item yet. 3122 * 3) Always write if needseg is set. 3123 * 4) If we are called from process_worklist and have 3124 * not yet written anything we write a partial block 3125 * to enforce a 1 second maximum latency on journal 3126 * entries. 3127 */ 3128 if (cnt < (jrecmax - 1) && needwk == NULL && 3129 jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) 3130 break; 3131 cnt++; 3132 /* 3133 * Verify some free journal space. 
softdep_prealloc() should 3134 * guarantee that we don't run out so this is indicative of 3135 * a problem with the flow control. Try to recover 3136 * gracefully in any event. 3137 */ 3138 while (jblocks->jb_free == 0) { 3139 if (flags != MNT_WAIT) 3140 break; 3141 printf("softdep: Out of journal space!\n"); 3142 softdep_speedup(); 3143 msleep(jblocks, &lk, PRIBIO, "jblocks", hz); 3144 } 3145 FREE_LOCK(&lk); 3146 jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); 3147 workitem_alloc(&jseg->js_list, D_JSEG, mp); 3148 LIST_INIT(&jseg->js_entries); 3149 LIST_INIT(&jseg->js_indirs); 3150 jseg->js_state = ATTACHED; 3151 jseg->js_jblocks = jblocks; 3152 bp = geteblk(fs->fs_bsize, 0); 3153 ACQUIRE_LOCK(&lk); 3154 /* 3155 * If there was a race while we were allocating the block 3156 * and jseg the entry we care about was likely written. 3157 * We bail out in both the WAIT and NOWAIT case and assume 3158 * the caller will loop if the entry it cares about is 3159 * not written. 3160 */ 3161 cnt = ump->softdep_on_journal; 3162 if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { 3163 bp->b_flags |= B_INVAL | B_NOCACHE; 3164 WORKITEM_FREE(jseg, D_JSEG); 3165 FREE_LOCK(&lk); 3166 brelse(bp); 3167 ACQUIRE_LOCK(&lk); 3168 break; 3169 } 3170 /* 3171 * Calculate the disk block size required for the available 3172 * records rounded to the min size. 3173 */ 3174 if (cnt == 0) 3175 size = devbsize; 3176 else if (cnt < jrecmax) 3177 size = howmany(cnt, jrecmin) * devbsize; 3178 else 3179 size = fs->fs_bsize; 3180 /* 3181 * Allocate a disk block for this journal data and account 3182 * for truncation of the requested size if enough contiguous 3183 * space was not available. 3184 */ 3185 bp->b_blkno = jblocks_alloc(jblocks, size, &size); 3186 bp->b_lblkno = bp->b_blkno; 3187 bp->b_offset = bp->b_blkno * DEV_BSIZE; 3188 bp->b_bcount = size; 3189 bp->b_bufobj = &ump->um_devvp->v_bufobj; 3190 bp->b_flags &= ~B_INVAL; 3191 bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; 3192 /* 3193 * Initialize our jseg with cnt records. Assign the next 3194 * sequence number to it and link it in-order. 3195 */ 3196 cnt = MIN(cnt, (size / devbsize) * jrecmin); 3197 jseg->js_buf = bp; 3198 jseg->js_cnt = cnt; 3199 jseg->js_refs = cnt + 1; /* Self ref. */ 3200 jseg->js_size = size; 3201 jseg->js_seq = jblocks->jb_nextseq++; 3202 if (jblocks->jb_oldestseg == NULL) 3203 jblocks->jb_oldestseg = jseg; 3204 jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; 3205 TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); 3206 if (jblocks->jb_writeseg == NULL) 3207 jblocks->jb_writeseg = jseg; 3208 /* 3209 * Start filling in records from the pending list. 3210 */ 3211 data = bp->b_data; 3212 off = 0; 3213 while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) 3214 != NULL) { 3215 if (cnt == 0) 3216 break; 3217 /* Place a segment header on every device block. 
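 * An illustration with assumed sizes (512-byte device block, 32-byte
 * JREC_SIZE): each device block holds 16 records, the first of which is
 * the segment header written by jseg_write() below, leaving the jrecmin
 * (15) data records per device block computed above.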
*/ 3218 if ((off % devbsize) == 0) { 3219 jseg_write(ump, jseg, data); 3220 off += JREC_SIZE; 3221 data = bp->b_data + off; 3222 } 3223 if (wk == needwk) 3224 needwk = NULL; 3225 remove_from_journal(wk); 3226 wk->wk_state |= INPROGRESS; 3227 WORKLIST_INSERT(&jseg->js_entries, wk); 3228 switch (wk->wk_type) { 3229 case D_JADDREF: 3230 jaddref_write(WK_JADDREF(wk), jseg, data); 3231 break; 3232 case D_JREMREF: 3233 jremref_write(WK_JREMREF(wk), jseg, data); 3234 break; 3235 case D_JMVREF: 3236 jmvref_write(WK_JMVREF(wk), jseg, data); 3237 break; 3238 case D_JNEWBLK: 3239 jnewblk_write(WK_JNEWBLK(wk), jseg, data); 3240 break; 3241 case D_JFREEBLK: 3242 jfreeblk_write(WK_JFREEBLK(wk), jseg, data); 3243 break; 3244 case D_JFREEFRAG: 3245 jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); 3246 break; 3247 case D_JTRUNC: 3248 jtrunc_write(WK_JTRUNC(wk), jseg, data); 3249 break; 3250 case D_JFSYNC: 3251 jfsync_write(WK_JFSYNC(wk), jseg, data); 3252 break; 3253 default: 3254 panic("process_journal: Unknown type %s", 3255 TYPENAME(wk->wk_type)); 3256 /* NOTREACHED */ 3257 } 3258 off += JREC_SIZE; 3259 data = bp->b_data + off; 3260 cnt--; 3261 } 3262 /* 3263 * Write this one buffer and continue. 3264 */ 3265 segwritten = 1; 3266 jblocks->jb_needseg = 0; 3267 WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); 3268 FREE_LOCK(&lk); 3269 BO_LOCK(bp->b_bufobj); 3270 bgetvp(ump->um_devvp, bp); 3271 BO_UNLOCK(bp->b_bufobj); 3272 /* 3273 * We only do the blocking wait once we find the journal 3274 * entry we're looking for. 3275 */ 3276 if (needwk == NULL && flags == MNT_WAIT) 3277 bwrite(bp); 3278 else 3279 bawrite(bp); 3280 ACQUIRE_LOCK(&lk); 3281 } 3282 /* 3283 * If we've suspended the filesystem because we ran out of journal 3284 * space either try to sync it here to make some progress or 3285 * unsuspend it if we already have. 3286 */ 3287 if (flags == 0 && jblocks->jb_suspended) { 3288 if (journal_unsuspend(ump)) 3289 return; 3290 FREE_LOCK(&lk); 3291 VFS_SYNC(mp, MNT_NOWAIT); 3292 ffs_sbupdate(ump, MNT_WAIT, 0); 3293 ACQUIRE_LOCK(&lk); 3294 } 3295 } 3296 3297 /* 3298 * Complete a jseg, allowing all dependencies awaiting journal writes 3299 * to proceed. Each journal dependency also attaches a jsegdep to dependent 3300 * structures so that the journal segment can be freed to reclaim space. 3301 */ 3302 static void 3303 complete_jseg(jseg) 3304 struct jseg *jseg; 3305 { 3306 struct worklist *wk; 3307 struct jmvref *jmvref; 3308 int waiting; 3309 #ifdef INVARIANTS 3310 int i = 0; 3311 #endif 3312 3313 while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { 3314 WORKLIST_REMOVE(wk); 3315 waiting = wk->wk_state & IOWAITING; 3316 wk->wk_state &= ~(INPROGRESS | IOWAITING); 3317 wk->wk_state |= COMPLETE; 3318 KASSERT(i++ < jseg->js_cnt, 3319 ("handle_written_jseg: overflow %d >= %d", 3320 i - 1, jseg->js_cnt)); 3321 switch (wk->wk_type) { 3322 case D_JADDREF: 3323 handle_written_jaddref(WK_JADDREF(wk)); 3324 break; 3325 case D_JREMREF: 3326 handle_written_jremref(WK_JREMREF(wk)); 3327 break; 3328 case D_JMVREF: 3329 rele_jseg(jseg); /* No jsegdep. 
*/ 3330 jmvref = WK_JMVREF(wk); 3331 LIST_REMOVE(jmvref, jm_deps); 3332 if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) 3333 free_pagedep(jmvref->jm_pagedep); 3334 WORKITEM_FREE(jmvref, D_JMVREF); 3335 break; 3336 case D_JNEWBLK: 3337 handle_written_jnewblk(WK_JNEWBLK(wk)); 3338 break; 3339 case D_JFREEBLK: 3340 handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); 3341 break; 3342 case D_JTRUNC: 3343 handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); 3344 break; 3345 case D_JFSYNC: 3346 rele_jseg(jseg); /* No jsegdep. */ 3347 WORKITEM_FREE(wk, D_JFSYNC); 3348 break; 3349 case D_JFREEFRAG: 3350 handle_written_jfreefrag(WK_JFREEFRAG(wk)); 3351 break; 3352 default: 3353 panic("handle_written_jseg: Unknown type %s", 3354 TYPENAME(wk->wk_type)); 3355 /* NOTREACHED */ 3356 } 3357 if (waiting) 3358 wakeup(wk); 3359 } 3360 /* Release the self reference so the structure may be freed. */ 3361 rele_jseg(jseg); 3362 } 3363 3364 /* 3365 * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg 3366 * completions in order only. 3367 */ 3368 static void 3369 handle_written_jseg(jseg, bp) 3370 struct jseg *jseg; 3371 struct buf *bp; 3372 { 3373 struct jblocks *jblocks; 3374 struct jseg *jsegn; 3375 3376 if (jseg->js_refs == 0) 3377 panic("handle_written_jseg: No self-reference on %p", jseg); 3378 jseg->js_state |= DEPCOMPLETE; 3379 /* 3380 * We'll never need this buffer again, set flags so it will be 3381 * discarded. 3382 */ 3383 bp->b_flags |= B_INVAL | B_NOCACHE; 3384 jblocks = jseg->js_jblocks; 3385 /* 3386 * Don't allow out of order completions. If this isn't the first 3387 * block wait for it to write before we're done. 3388 */ 3389 if (jseg != jblocks->jb_writeseg) 3390 return; 3391 /* Iterate through available jsegs processing their entries. */ 3392 do { 3393 jblocks->jb_oldestwrseq = jseg->js_oldseq; 3394 jsegn = TAILQ_NEXT(jseg, js_next); 3395 complete_jseg(jseg); 3396 jseg = jsegn; 3397 } while (jseg && jseg->js_state & DEPCOMPLETE); 3398 jblocks->jb_writeseg = jseg; 3399 /* 3400 * Attempt to free jsegs now that oldestwrseq may have advanced. 3401 */ 3402 free_jsegs(jblocks); 3403 } 3404 3405 static inline struct jsegdep * 3406 inoref_jseg(inoref) 3407 struct inoref *inoref; 3408 { 3409 struct jsegdep *jsegdep; 3410 3411 jsegdep = inoref->if_jsegdep; 3412 inoref->if_jsegdep = NULL; 3413 3414 return (jsegdep); 3415 } 3416 3417 /* 3418 * Called once a jremref has made it to stable store. The jremref is marked 3419 * complete and we attempt to free it. Any pagedeps writes sleeping waiting 3420 * for the jremref to complete will be awoken by free_jremref. 3421 */ 3422 static void 3423 handle_written_jremref(jremref) 3424 struct jremref *jremref; 3425 { 3426 struct inodedep *inodedep; 3427 struct jsegdep *jsegdep; 3428 struct dirrem *dirrem; 3429 3430 /* Grab the jsegdep. */ 3431 jsegdep = inoref_jseg(&jremref->jr_ref); 3432 /* 3433 * Remove us from the inoref list. 3434 */ 3435 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 3436 0, &inodedep) == 0) 3437 panic("handle_written_jremref: Lost inodedep"); 3438 TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 3439 /* 3440 * Complete the dirrem. 
3441 */ 3442 dirrem = jremref->jr_dirrem; 3443 jremref->jr_dirrem = NULL; 3444 LIST_REMOVE(jremref, jr_deps); 3445 jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; 3446 jwork_insert(&dirrem->dm_jwork, jsegdep); 3447 if (LIST_EMPTY(&dirrem->dm_jremrefhd) && 3448 (dirrem->dm_state & COMPLETE) != 0) 3449 add_to_worklist(&dirrem->dm_list, 0); 3450 free_jremref(jremref); 3451 } 3452 3453 /* 3454 * Called once a jaddref has made it to stable store. The dependency is 3455 * marked complete and any dependent structures are added to the inode 3456 * bufwait list to be completed as soon as it is written. If a bitmap write 3457 * depends on this entry we move the inode into the inodedephd of the 3458 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. 3459 */ 3460 static void 3461 handle_written_jaddref(jaddref) 3462 struct jaddref *jaddref; 3463 { 3464 struct jsegdep *jsegdep; 3465 struct inodedep *inodedep; 3466 struct diradd *diradd; 3467 struct mkdir *mkdir; 3468 3469 /* Grab the jsegdep. */ 3470 jsegdep = inoref_jseg(&jaddref->ja_ref); 3471 mkdir = NULL; 3472 diradd = NULL; 3473 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3474 0, &inodedep) == 0) 3475 panic("handle_written_jaddref: Lost inodedep."); 3476 if (jaddref->ja_diradd == NULL) 3477 panic("handle_written_jaddref: No dependency"); 3478 if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { 3479 diradd = jaddref->ja_diradd; 3480 WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); 3481 } else if (jaddref->ja_state & MKDIR_PARENT) { 3482 mkdir = jaddref->ja_mkdir; 3483 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); 3484 } else if (jaddref->ja_state & MKDIR_BODY) 3485 mkdir = jaddref->ja_mkdir; 3486 else 3487 panic("handle_written_jaddref: Unknown dependency %p", 3488 jaddref->ja_diradd); 3489 jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ 3490 /* 3491 * Remove us from the inode list. 3492 */ 3493 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); 3494 /* 3495 * The mkdir may be waiting on the jaddref to clear before freeing. 3496 */ 3497 if (mkdir) { 3498 KASSERT(mkdir->md_list.wk_type == D_MKDIR, 3499 ("handle_written_jaddref: Incorrect type for mkdir %s", 3500 TYPENAME(mkdir->md_list.wk_type))); 3501 mkdir->md_jaddref = NULL; 3502 diradd = mkdir->md_diradd; 3503 mkdir->md_state |= DEPCOMPLETE; 3504 complete_mkdir(mkdir); 3505 } 3506 jwork_insert(&diradd->da_jwork, jsegdep); 3507 if (jaddref->ja_state & NEWBLOCK) { 3508 inodedep->id_state |= ONDEPLIST; 3509 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, 3510 inodedep, id_deps); 3511 } 3512 free_jaddref(jaddref); 3513 } 3514 3515 /* 3516 * Called once a jnewblk journal is written. The allocdirect or allocindir 3517 * is placed in the bmsafemap to await notification of a written bitmap. If 3518 * the operation was canceled we add the segdep to the appropriate 3519 * dependency to free the journal space once the canceling operation 3520 * completes. 3521 */ 3522 static void 3523 handle_written_jnewblk(jnewblk) 3524 struct jnewblk *jnewblk; 3525 { 3526 struct bmsafemap *bmsafemap; 3527 struct freefrag *freefrag; 3528 struct freework *freework; 3529 struct jsegdep *jsegdep; 3530 struct newblk *newblk; 3531 3532 /* Grab the jsegdep. 
*/ 3533 jsegdep = jnewblk->jn_jsegdep; 3534 jnewblk->jn_jsegdep = NULL; 3535 if (jnewblk->jn_dep == NULL) 3536 panic("handle_written_jnewblk: No dependency for the segdep."); 3537 switch (jnewblk->jn_dep->wk_type) { 3538 case D_NEWBLK: 3539 case D_ALLOCDIRECT: 3540 case D_ALLOCINDIR: 3541 /* 3542 * Add the written block to the bmsafemap so it can 3543 * be notified when the bitmap is on disk. 3544 */ 3545 newblk = WK_NEWBLK(jnewblk->jn_dep); 3546 newblk->nb_jnewblk = NULL; 3547 if ((newblk->nb_state & GOINGAWAY) == 0) { 3548 bmsafemap = newblk->nb_bmsafemap; 3549 newblk->nb_state |= ONDEPLIST; 3550 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, 3551 nb_deps); 3552 } 3553 jwork_insert(&newblk->nb_jwork, jsegdep); 3554 break; 3555 case D_FREEFRAG: 3556 /* 3557 * A newblock being removed by a freefrag when replaced by 3558 * frag extension. 3559 */ 3560 freefrag = WK_FREEFRAG(jnewblk->jn_dep); 3561 freefrag->ff_jdep = NULL; 3562 WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list); 3563 break; 3564 case D_FREEWORK: 3565 /* 3566 * A direct block was removed by truncate. 3567 */ 3568 freework = WK_FREEWORK(jnewblk->jn_dep); 3569 freework->fw_jnewblk = NULL; 3570 WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork, 3571 &jsegdep->jd_list); 3572 break; 3573 default: 3574 panic("handle_written_jnewblk: Unknown type %d.", 3575 jnewblk->jn_dep->wk_type); 3576 } 3577 jnewblk->jn_dep = NULL; 3578 free_jnewblk(jnewblk); 3579 } 3580 3581 /* 3582 * Cancel a jfreefrag that won't be needed, probably due to colliding with 3583 * an in-flight allocation that has not yet been committed. Divorce us 3584 * from the freefrag and mark it DEPCOMPLETE so that it may be added 3585 * to the worklist. 3586 */ 3587 static void 3588 cancel_jfreefrag(jfreefrag) 3589 struct jfreefrag *jfreefrag; 3590 { 3591 struct freefrag *freefrag; 3592 3593 if (jfreefrag->fr_jsegdep) { 3594 free_jsegdep(jfreefrag->fr_jsegdep); 3595 jfreefrag->fr_jsegdep = NULL; 3596 } 3597 freefrag = jfreefrag->fr_freefrag; 3598 jfreefrag->fr_freefrag = NULL; 3599 free_jfreefrag(jfreefrag); 3600 freefrag->ff_state |= DEPCOMPLETE; 3601 } 3602 3603 /* 3604 * Free a jfreefrag when the parent freefrag is rendered obsolete. 3605 */ 3606 static void 3607 free_jfreefrag(jfreefrag) 3608 struct jfreefrag *jfreefrag; 3609 { 3610 3611 if (jfreefrag->fr_state & INPROGRESS) 3612 WORKLIST_REMOVE(&jfreefrag->fr_list); 3613 else if (jfreefrag->fr_state & ONWORKLIST) 3614 remove_from_journal(&jfreefrag->fr_list); 3615 if (jfreefrag->fr_freefrag != NULL) 3616 panic("free_jfreefrag: Still attached to a freefrag."); 3617 WORKITEM_FREE(jfreefrag, D_JFREEFRAG); 3618 } 3619 3620 /* 3621 * Called when the journal write for a jfreefrag completes. The parent 3622 * freefrag is added to the worklist if this completes its dependencies. 3623 */ 3624 static void 3625 handle_written_jfreefrag(jfreefrag) 3626 struct jfreefrag *jfreefrag; 3627 { 3628 struct jsegdep *jsegdep; 3629 struct freefrag *freefrag; 3630 3631 /* Grab the jsegdep. 
*/ 3632 jsegdep = jfreefrag->fr_jsegdep; 3633 jfreefrag->fr_jsegdep = NULL; 3634 freefrag = jfreefrag->fr_freefrag; 3635 if (freefrag == NULL) 3636 panic("handle_written_jfreefrag: No freefrag."); 3637 freefrag->ff_state |= DEPCOMPLETE; 3638 freefrag->ff_jdep = NULL; 3639 jwork_insert(&freefrag->ff_jwork, jsegdep); 3640 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 3641 add_to_worklist(&freefrag->ff_list, 0); 3642 jfreefrag->fr_freefrag = NULL; 3643 free_jfreefrag(jfreefrag); 3644 } 3645 3646 /* 3647 * Called when the journal write for a jfreeblk completes. The jfreeblk 3648 * is removed from the freeblks list of pending journal writes and the 3649 * jsegdep is moved to the freeblks jwork to be completed when all blocks 3650 * have been reclaimed. 3651 */ 3652 static void 3653 handle_written_jblkdep(jblkdep) 3654 struct jblkdep *jblkdep; 3655 { 3656 struct freeblks *freeblks; 3657 struct jsegdep *jsegdep; 3658 3659 /* Grab the jsegdep. */ 3660 jsegdep = jblkdep->jb_jsegdep; 3661 jblkdep->jb_jsegdep = NULL; 3662 freeblks = jblkdep->jb_freeblks; 3663 LIST_REMOVE(jblkdep, jb_deps); 3664 WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list); 3665 /* 3666 * If the freeblks is all journaled, we can add it to the worklist. 3667 */ 3668 if (LIST_EMPTY(&freeblks->fb_jblkdephd) && 3669 (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 3670 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 3671 3672 free_jblkdep(jblkdep); 3673 } 3674 3675 static struct jsegdep * 3676 newjsegdep(struct worklist *wk) 3677 { 3678 struct jsegdep *jsegdep; 3679 3680 jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); 3681 workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); 3682 jsegdep->jd_seg = NULL; 3683 3684 return (jsegdep); 3685 } 3686 3687 static struct jmvref * 3688 newjmvref(dp, ino, oldoff, newoff) 3689 struct inode *dp; 3690 ino_t ino; 3691 off_t oldoff; 3692 off_t newoff; 3693 { 3694 struct jmvref *jmvref; 3695 3696 jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); 3697 workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); 3698 jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; 3699 jmvref->jm_parent = dp->i_number; 3700 jmvref->jm_ino = ino; 3701 jmvref->jm_oldoff = oldoff; 3702 jmvref->jm_newoff = newoff; 3703 3704 return (jmvref); 3705 } 3706 3707 /* 3708 * Allocate a new jremref that tracks the removal of ip from dp with the 3709 * directory entry offset of diroff. Mark the entry as ATTACHED and 3710 * DEPCOMPLETE as we have all the information required for the journal write 3711 * and the directory has already been removed from the buffer. The caller 3712 * is responsible for linking the jremref into the pagedep and adding it 3713 * to the journal to write. The MKDIR_PARENT flag is set if we're doing 3714 * a DOTDOT addition so handle_workitem_remove() can properly assign 3715 * the jsegdep when we're done. 
3716 */ 3717 static struct jremref * 3718 newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, 3719 off_t diroff, nlink_t nlink) 3720 { 3721 struct jremref *jremref; 3722 3723 jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); 3724 workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); 3725 jremref->jr_state = ATTACHED; 3726 newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, 3727 nlink, ip->i_mode); 3728 jremref->jr_dirrem = dirrem; 3729 3730 return (jremref); 3731 } 3732 3733 static inline void 3734 newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, 3735 nlink_t nlink, uint16_t mode) 3736 { 3737 3738 inoref->if_jsegdep = newjsegdep(&inoref->if_list); 3739 inoref->if_diroff = diroff; 3740 inoref->if_ino = ino; 3741 inoref->if_parent = parent; 3742 inoref->if_nlink = nlink; 3743 inoref->if_mode = mode; 3744 } 3745 3746 /* 3747 * Allocate a new jaddref to track the addition of ino to dp at diroff. The 3748 * directory offset may not be known until later. The caller is responsible 3749 * for adding the entry to the journal when this information is available. 3750 * nlink should be the link count prior to the addition and mode is only 3751 * required to have the correct FMT. 3752 */ 3753 static struct jaddref * 3754 newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, 3755 uint16_t mode) 3756 { 3757 struct jaddref *jaddref; 3758 3759 jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); 3760 workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); 3761 jaddref->ja_state = ATTACHED; 3762 jaddref->ja_mkdir = NULL; 3763 newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); 3764 3765 return (jaddref); 3766 } 3767 3768 /* 3769 * Create a new free dependency for a freework. The caller is responsible 3770 * for adjusting the reference count when it has the lock held. The freedep 3771 * will track an outstanding bitmap write that will ultimately clear the 3772 * freework to continue. 3773 */ 3774 static struct freedep * 3775 newfreedep(struct freework *freework) 3776 { 3777 struct freedep *freedep; 3778 3779 freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); 3780 workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); 3781 freedep->fd_freework = freework; 3782 3783 return (freedep); 3784 } 3785 3786 /* 3787 * Free a freedep structure once the buffer it is linked to is written. If 3788 * this is the last reference to the freework, schedule it for completion. 3789 */ 3790 static void 3791 free_freedep(freedep) 3792 struct freedep *freedep; 3793 { 3794 struct freework *freework; 3795 3796 freework = freedep->fd_freework; 3797 freework->fw_freeblks->fb_cgwait--; 3798 if (--freework->fw_ref == 0) 3799 freework_enqueue(freework); 3800 WORKITEM_FREE(freedep, D_FREEDEP); 3801 } 3802 3803 /* 3804 * Allocate a new freework structure that may be a level in an indirect 3805 * when parent is not NULL or a top level block when it is NULL. The top 3806 * level freework structures are allocated without lk held and before the 3807 * freeblks is visible outside of softdep_setup_freeblocks().
3808 */ 3809 static struct freework * 3810 newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) 3811 struct ufsmount *ump; 3812 struct freeblks *freeblks; 3813 struct freework *parent; 3814 ufs_lbn_t lbn; 3815 ufs2_daddr_t nb; 3816 int frags; 3817 int off; 3818 int journal; 3819 { 3820 struct freework *freework; 3821 3822 freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); 3823 workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); 3824 freework->fw_state = ATTACHED; 3825 freework->fw_jnewblk = NULL; 3826 freework->fw_freeblks = freeblks; 3827 freework->fw_parent = parent; 3828 freework->fw_lbn = lbn; 3829 freework->fw_blkno = nb; 3830 freework->fw_frags = frags; 3831 freework->fw_indir = NULL; 3832 freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) 3833 ? 0 : NINDIR(ump->um_fs) + 1; 3834 freework->fw_start = freework->fw_off = off; 3835 if (journal) 3836 newjfreeblk(freeblks, lbn, nb, frags); 3837 if (parent == NULL) { 3838 ACQUIRE_LOCK(&lk); 3839 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 3840 freeblks->fb_ref++; 3841 FREE_LOCK(&lk); 3842 } 3843 3844 return (freework); 3845 } 3846 3847 /* 3848 * Eliminate a jfreeblk for a block that does not need journaling. 3849 */ 3850 static void 3851 cancel_jfreeblk(freeblks, blkno) 3852 struct freeblks *freeblks; 3853 ufs2_daddr_t blkno; 3854 { 3855 struct jfreeblk *jfreeblk; 3856 struct jblkdep *jblkdep; 3857 3858 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { 3859 if (jblkdep->jb_list.wk_type != D_JFREEBLK) 3860 continue; 3861 jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); 3862 if (jfreeblk->jf_blkno == blkno) 3863 break; 3864 } 3865 if (jblkdep == NULL) 3866 return; 3867 free_jsegdep(jblkdep->jb_jsegdep); 3868 LIST_REMOVE(jblkdep, jb_deps); 3869 WORKITEM_FREE(jfreeblk, D_JFREEBLK); 3870 } 3871 3872 /* 3873 * Allocate a new jfreeblk to journal top level block pointer when truncating 3874 * a file. The caller must add this to the worklist when lk is held. 3875 */ 3876 static struct jfreeblk * 3877 newjfreeblk(freeblks, lbn, blkno, frags) 3878 struct freeblks *freeblks; 3879 ufs_lbn_t lbn; 3880 ufs2_daddr_t blkno; 3881 int frags; 3882 { 3883 struct jfreeblk *jfreeblk; 3884 3885 jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); 3886 workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, 3887 freeblks->fb_list.wk_mp); 3888 jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); 3889 jfreeblk->jf_dep.jb_freeblks = freeblks; 3890 jfreeblk->jf_ino = freeblks->fb_inum; 3891 jfreeblk->jf_lbn = lbn; 3892 jfreeblk->jf_blkno = blkno; 3893 jfreeblk->jf_frags = frags; 3894 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); 3895 3896 return (jfreeblk); 3897 } 3898 3899 /* 3900 * Allocate a new jtrunc to track a partial truncation. 
3901 */ 3902 static struct jtrunc * 3903 newjtrunc(freeblks, size, extsize) 3904 struct freeblks *freeblks; 3905 off_t size; 3906 int extsize; 3907 { 3908 struct jtrunc *jtrunc; 3909 3910 jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); 3911 workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, 3912 freeblks->fb_list.wk_mp); 3913 jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); 3914 jtrunc->jt_dep.jb_freeblks = freeblks; 3915 jtrunc->jt_ino = freeblks->fb_inum; 3916 jtrunc->jt_size = size; 3917 jtrunc->jt_extsize = extsize; 3918 LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); 3919 3920 return (jtrunc); 3921 } 3922 3923 /* 3924 * If we're canceling a new bitmap we have to search for another ref 3925 * to move into the bmsafemap dep. This might be better expressed 3926 * with another structure. 3927 */ 3928 static void 3929 move_newblock_dep(jaddref, inodedep) 3930 struct jaddref *jaddref; 3931 struct inodedep *inodedep; 3932 { 3933 struct inoref *inoref; 3934 struct jaddref *jaddrefn; 3935 3936 jaddrefn = NULL; 3937 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; 3938 inoref = TAILQ_NEXT(inoref, if_deps)) { 3939 if ((jaddref->ja_state & NEWBLOCK) && 3940 inoref->if_list.wk_type == D_JADDREF) { 3941 jaddrefn = (struct jaddref *)inoref; 3942 break; 3943 } 3944 } 3945 if (jaddrefn == NULL) 3946 return; 3947 jaddrefn->ja_state &= ~(ATTACHED | UNDONE); 3948 jaddrefn->ja_state |= jaddref->ja_state & 3949 (ATTACHED | UNDONE | NEWBLOCK); 3950 jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); 3951 jaddref->ja_state |= ATTACHED; 3952 LIST_REMOVE(jaddref, ja_bmdeps); 3953 LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, 3954 ja_bmdeps); 3955 } 3956 3957 /* 3958 * Cancel a jaddref either before it has been written or while it is being 3959 * written. This happens when a link is removed before the add reaches 3960 * the disk. The jaddref dependency is kept linked into the bmsafemap 3961 * and inode to prevent the link count or bitmap from reaching the disk 3962 * until handle_workitem_remove() re-adjusts the counts and bitmaps as 3963 * required. 3964 * 3965 * Returns 1 if the canceled addref requires journaling of the remove and 3966 * 0 otherwise. 3967 */ 3968 static int 3969 cancel_jaddref(jaddref, inodedep, wkhd) 3970 struct jaddref *jaddref; 3971 struct inodedep *inodedep; 3972 struct workhead *wkhd; 3973 { 3974 struct inoref *inoref; 3975 struct jsegdep *jsegdep; 3976 int needsj; 3977 3978 KASSERT((jaddref->ja_state & COMPLETE) == 0, 3979 ("cancel_jaddref: Canceling complete jaddref")); 3980 if (jaddref->ja_state & (INPROGRESS | COMPLETE)) 3981 needsj = 1; 3982 else 3983 needsj = 0; 3984 if (inodedep == NULL) 3985 if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 3986 0, &inodedep) == 0) 3987 panic("cancel_jaddref: Lost inodedep"); 3988 /* 3989 * We must adjust the nlink of any reference operation that follows 3990 * us so that it is consistent with the in-memory reference. This 3991 * ensures that inode nlink rollbacks always have the correct link. 
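 *
 * For example, if later references in the list carry if_nlink values of
 * 2 and 3 and this unwritten addref is canceled, they are decremented to
 * 1 and 2 so that any rollback taken from them still reflects one fewer
 * on-disk link.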
3992 */
3993 if (needsj == 0) {
3994 for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
3995 inoref = TAILQ_NEXT(inoref, if_deps)) {
3996 if (inoref->if_state & GOINGAWAY)
3997 break;
3998 inoref->if_nlink--;
3999 }
4000 }
4001 jsegdep = inoref_jseg(&jaddref->ja_ref);
4002 if (jaddref->ja_state & NEWBLOCK)
4003 move_newblock_dep(jaddref, inodedep);
4004 wake_worklist(&jaddref->ja_list);
4005 jaddref->ja_mkdir = NULL;
4006 if (jaddref->ja_state & INPROGRESS) {
4007 jaddref->ja_state &= ~INPROGRESS;
4008 WORKLIST_REMOVE(&jaddref->ja_list);
4009 jwork_insert(wkhd, jsegdep);
4010 } else {
4011 free_jsegdep(jsegdep);
4012 if (jaddref->ja_state & DEPCOMPLETE)
4013 remove_from_journal(&jaddref->ja_list);
4014 }
4015 jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4016 /*
4017 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4018 * can arrange for them to be freed with the bitmap. Otherwise we
4019 * no longer need this addref attached to the inoreflst and it
4020 * will incorrectly adjust nlink if we leave it.
4021 */
4022 if ((jaddref->ja_state & NEWBLOCK) == 0) {
4023 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4024 if_deps);
4025 jaddref->ja_state |= COMPLETE;
4026 free_jaddref(jaddref);
4027 return (needsj);
4028 }
4029 /*
4030 * Leave the head of the list for jsegdeps for fast merging.
4031 */
4032 if (LIST_FIRST(wkhd) != NULL) {
4033 jaddref->ja_state |= ONWORKLIST;
4034 LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4035 } else
4036 WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4037 
4038 return (needsj);
4039 }
4040 
4041 /*
4042 * Attempt to free a jaddref structure when some work completes. This
4043 * should only succeed once the entry is written and all dependencies have
4044 * been notified.
4045 */
4046 static void
4047 free_jaddref(jaddref)
4048 struct jaddref *jaddref;
4049 {
4050 
4051 if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4052 return;
4053 if (jaddref->ja_ref.if_jsegdep)
4054 panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4055 jaddref, jaddref->ja_state);
4056 if (jaddref->ja_state & NEWBLOCK)
4057 LIST_REMOVE(jaddref, ja_bmdeps);
4058 if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4059 panic("free_jaddref: Bad state %p(0x%X)",
4060 jaddref, jaddref->ja_state);
4061 if (jaddref->ja_mkdir != NULL)
4062 panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4063 WORKITEM_FREE(jaddref, D_JADDREF);
4064 }
4065 
4066 /*
4067 * Free a jremref structure once it has been written or discarded.
4068 */
4069 static void
4070 free_jremref(jremref)
4071 struct jremref *jremref;
4072 {
4073 
4074 if (jremref->jr_ref.if_jsegdep)
4075 free_jsegdep(jremref->jr_ref.if_jsegdep);
4076 if (jremref->jr_state & INPROGRESS)
4077 panic("free_jremref: IO still pending");
4078 WORKITEM_FREE(jremref, D_JREMREF);
4079 }
4080 
4081 /*
4082 * Free a jnewblk structure.
4083 */
4084 static void
4085 free_jnewblk(jnewblk)
4086 struct jnewblk *jnewblk;
4087 {
4088 
4089 if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4090 return;
4091 LIST_REMOVE(jnewblk, jn_deps);
4092 if (jnewblk->jn_dep != NULL)
4093 panic("free_jnewblk: Dependency still attached.");
4094 WORKITEM_FREE(jnewblk, D_JNEWBLK);
4095 }
4096 
4097 /*
4098 * Cancel a jnewblk which has been made redundant by frag extension.
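 *
 * For illustration, jnewblk_merge() below reaches this point when a
 * second allocation extends the fragments described by an earlier,
 * still unwritten record; the older record is retired roughly as
 *
 *	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
 *	cancel_jnewblk(jnewblk, wkhd);
 *
 * leaving the newer record to describe the whole allocation.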
4099 */ 4100 static void 4101 cancel_jnewblk(jnewblk, wkhd) 4102 struct jnewblk *jnewblk; 4103 struct workhead *wkhd; 4104 { 4105 struct jsegdep *jsegdep; 4106 4107 jsegdep = jnewblk->jn_jsegdep; 4108 if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) 4109 panic("cancel_jnewblk: Invalid state"); 4110 jnewblk->jn_jsegdep = NULL; 4111 jnewblk->jn_dep = NULL; 4112 jnewblk->jn_state |= GOINGAWAY; 4113 if (jnewblk->jn_state & INPROGRESS) { 4114 jnewblk->jn_state &= ~INPROGRESS; 4115 WORKLIST_REMOVE(&jnewblk->jn_list); 4116 jwork_insert(wkhd, jsegdep); 4117 } else { 4118 free_jsegdep(jsegdep); 4119 remove_from_journal(&jnewblk->jn_list); 4120 } 4121 wake_worklist(&jnewblk->jn_list); 4122 WORKLIST_INSERT(wkhd, &jnewblk->jn_list); 4123 } 4124 4125 static void 4126 free_jblkdep(jblkdep) 4127 struct jblkdep *jblkdep; 4128 { 4129 4130 if (jblkdep->jb_list.wk_type == D_JFREEBLK) 4131 WORKITEM_FREE(jblkdep, D_JFREEBLK); 4132 else if (jblkdep->jb_list.wk_type == D_JTRUNC) 4133 WORKITEM_FREE(jblkdep, D_JTRUNC); 4134 else 4135 panic("free_jblkdep: Unexpected type %s", 4136 TYPENAME(jblkdep->jb_list.wk_type)); 4137 } 4138 4139 /* 4140 * Free a single jseg once it is no longer referenced in memory or on 4141 * disk. Reclaim journal blocks and dependencies waiting for the segment 4142 * to disappear. 4143 */ 4144 static void 4145 free_jseg(jseg, jblocks) 4146 struct jseg *jseg; 4147 struct jblocks *jblocks; 4148 { 4149 struct freework *freework; 4150 4151 /* 4152 * Free freework structures that were lingering to indicate freed 4153 * indirect blocks that forced journal write ordering on reallocate. 4154 */ 4155 while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) 4156 indirblk_remove(freework); 4157 if (jblocks->jb_oldestseg == jseg) 4158 jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); 4159 TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); 4160 jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); 4161 KASSERT(LIST_EMPTY(&jseg->js_entries), 4162 ("free_jseg: Freed jseg has valid entries.")); 4163 WORKITEM_FREE(jseg, D_JSEG); 4164 } 4165 4166 /* 4167 * Free all jsegs that meet the criteria for being reclaimed and update 4168 * oldestseg. 4169 */ 4170 static void 4171 free_jsegs(jblocks) 4172 struct jblocks *jblocks; 4173 { 4174 struct jseg *jseg; 4175 4176 /* 4177 * Free only those jsegs which have none allocated before them to 4178 * preserve the journal space ordering. 4179 */ 4180 while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { 4181 /* 4182 * Only reclaim space when nothing depends on this journal 4183 * set and another set has written that it is no longer 4184 * valid. 4185 */ 4186 if (jseg->js_refs != 0) { 4187 jblocks->jb_oldestseg = jseg; 4188 return; 4189 } 4190 if (!LIST_EMPTY(&jseg->js_indirs) && 4191 jseg->js_seq >= jblocks->jb_oldestwrseq) 4192 break; 4193 free_jseg(jseg, jblocks); 4194 } 4195 /* 4196 * If we exited the loop above we still must discover the 4197 * oldest valid segment. 4198 */ 4199 if (jseg) 4200 for (jseg = jblocks->jb_oldestseg; jseg != NULL; 4201 jseg = TAILQ_NEXT(jseg, js_next)) 4202 if (jseg->js_refs != 0) 4203 break; 4204 jblocks->jb_oldestseg = jseg; 4205 /* 4206 * The journal has no valid records but some jsegs may still be 4207 * waiting on oldestwrseq to advance. We force a small record 4208 * out to permit these lingering records to be reclaimed. 
4209 */ 4210 if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) 4211 jblocks->jb_needseg = 1; 4212 } 4213 4214 /* 4215 * Release one reference to a jseg and free it if the count reaches 0. This 4216 * should eventually reclaim journal space as well. 4217 */ 4218 static void 4219 rele_jseg(jseg) 4220 struct jseg *jseg; 4221 { 4222 4223 KASSERT(jseg->js_refs > 0, 4224 ("free_jseg: Invalid refcnt %d", jseg->js_refs)); 4225 if (--jseg->js_refs != 0) 4226 return; 4227 free_jsegs(jseg->js_jblocks); 4228 } 4229 4230 /* 4231 * Release a jsegdep and decrement the jseg count. 4232 */ 4233 static void 4234 free_jsegdep(jsegdep) 4235 struct jsegdep *jsegdep; 4236 { 4237 4238 if (jsegdep->jd_seg) 4239 rele_jseg(jsegdep->jd_seg); 4240 WORKITEM_FREE(jsegdep, D_JSEGDEP); 4241 } 4242 4243 /* 4244 * Wait for a journal item to make it to disk. Initiate journal processing 4245 * if required. 4246 */ 4247 static int 4248 jwait(wk, waitfor) 4249 struct worklist *wk; 4250 int waitfor; 4251 { 4252 4253 /* 4254 * Blocking journal waits cause slow synchronous behavior. Record 4255 * stats on the frequency of these blocking operations. 4256 */ 4257 if (waitfor == MNT_WAIT) { 4258 stat_journal_wait++; 4259 switch (wk->wk_type) { 4260 case D_JREMREF: 4261 case D_JMVREF: 4262 stat_jwait_filepage++; 4263 break; 4264 case D_JTRUNC: 4265 case D_JFREEBLK: 4266 stat_jwait_freeblks++; 4267 break; 4268 case D_JNEWBLK: 4269 stat_jwait_newblk++; 4270 break; 4271 case D_JADDREF: 4272 stat_jwait_inode++; 4273 break; 4274 default: 4275 break; 4276 } 4277 } 4278 /* 4279 * If IO has not started we process the journal. We can't mark the 4280 * worklist item as IOWAITING because we drop the lock while 4281 * processing the journal and the worklist entry may be freed after 4282 * this point. The caller may call back in and re-issue the request. 4283 */ 4284 if ((wk->wk_state & INPROGRESS) == 0) { 4285 softdep_process_journal(wk->wk_mp, wk, waitfor); 4286 if (waitfor != MNT_WAIT) 4287 return (EBUSY); 4288 return (0); 4289 } 4290 if (waitfor != MNT_WAIT) 4291 return (EBUSY); 4292 wait_worklist(wk, "jwait"); 4293 return (0); 4294 } 4295 4296 /* 4297 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as 4298 * appropriate. This is a convenience function to reduce duplicate code 4299 * for the setup and revert functions below. 4300 */ 4301 static struct inodedep * 4302 inodedep_lookup_ip(ip) 4303 struct inode *ip; 4304 { 4305 struct inodedep *inodedep; 4306 4307 KASSERT(ip->i_nlink >= ip->i_effnlink, 4308 ("inodedep_lookup_ip: bad delta")); 4309 (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 4310 DEPALLOC, &inodedep); 4311 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 4312 4313 return (inodedep); 4314 } 4315 4316 /* 4317 * Called prior to creating a new inode and linking it to a directory. The 4318 * jaddref structure must already be allocated by softdep_setup_inomapdep 4319 * and it is discovered here so we can initialize the mode and update 4320 * nlinkdelta. 
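 *
 * For illustration, the expected ordering on the create path is roughly
 * as follows (the surrounding VFS-level steps are schematic):
 *
 *	UFS_VALLOC(dvp, mode, cred, &vp);
 *		- softdep_setup_inomapdep() creates the incomplete jaddref
 *	softdep_setup_create(dp, ip);
 *		- the jaddref is found here and the nlink delta recorded
 *	softdep_setup_directory_add(...);
 *		- the directory offset becomes known and is journaled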
4321 */ 4322 void 4323 softdep_setup_create(dp, ip) 4324 struct inode *dp; 4325 struct inode *ip; 4326 { 4327 struct inodedep *inodedep; 4328 struct jaddref *jaddref; 4329 struct vnode *dvp; 4330 4331 KASSERT(ip->i_nlink == 1, 4332 ("softdep_setup_create: Invalid link count.")); 4333 dvp = ITOV(dp); 4334 ACQUIRE_LOCK(&lk); 4335 inodedep = inodedep_lookup_ip(ip); 4336 if (DOINGSUJ(dvp)) { 4337 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4338 inoreflst); 4339 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 4340 ("softdep_setup_create: No addref structure present.")); 4341 } 4342 softdep_prelink(dvp, NULL); 4343 FREE_LOCK(&lk); 4344 } 4345 4346 /* 4347 * Create a jaddref structure to track the addition of a DOTDOT link when 4348 * we are reparenting an inode as part of a rename. This jaddref will be 4349 * found by softdep_setup_directory_change. Adjusts nlinkdelta for 4350 * non-journaling softdep. 4351 */ 4352 void 4353 softdep_setup_dotdot_link(dp, ip) 4354 struct inode *dp; 4355 struct inode *ip; 4356 { 4357 struct inodedep *inodedep; 4358 struct jaddref *jaddref; 4359 struct vnode *dvp; 4360 struct vnode *vp; 4361 4362 dvp = ITOV(dp); 4363 vp = ITOV(ip); 4364 jaddref = NULL; 4365 /* 4366 * We don't set MKDIR_PARENT as this is not tied to a mkdir and 4367 * is used as a normal link would be. 4368 */ 4369 if (DOINGSUJ(dvp)) 4370 jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4371 dp->i_effnlink - 1, dp->i_mode); 4372 ACQUIRE_LOCK(&lk); 4373 inodedep = inodedep_lookup_ip(dp); 4374 if (jaddref) 4375 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4376 if_deps); 4377 softdep_prelink(dvp, ITOV(ip)); 4378 FREE_LOCK(&lk); 4379 } 4380 4381 /* 4382 * Create a jaddref structure to track a new link to an inode. The directory 4383 * offset is not known until softdep_setup_directory_add or 4384 * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling 4385 * softdep. 4386 */ 4387 void 4388 softdep_setup_link(dp, ip) 4389 struct inode *dp; 4390 struct inode *ip; 4391 { 4392 struct inodedep *inodedep; 4393 struct jaddref *jaddref; 4394 struct vnode *dvp; 4395 4396 dvp = ITOV(dp); 4397 jaddref = NULL; 4398 if (DOINGSUJ(dvp)) 4399 jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, 4400 ip->i_mode); 4401 ACQUIRE_LOCK(&lk); 4402 inodedep = inodedep_lookup_ip(ip); 4403 if (jaddref) 4404 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4405 if_deps); 4406 softdep_prelink(dvp, ITOV(ip)); 4407 FREE_LOCK(&lk); 4408 } 4409 4410 /* 4411 * Called to create the jaddref structures to track . and .. references as 4412 * well as lookup and further initialize the incomplete jaddref created 4413 * by softdep_setup_inomapdep when the inode was allocated. Adjusts 4414 * nlinkdelta for non-journaling softdep. 
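 *
 * For illustration, once this routine returns, the child's id_inoreflst
 * ends with the "." jaddref (MKDIR_BODY) immediately followed by the
 * create jaddref allocated earlier by softdep_setup_inomapdep(), and the
 * parent's list gains the ".." jaddref (MKDIR_PARENT) at its tail:
 *
 *	child:	... -> dotaddref (MKDIR_BODY) -> create jaddref
 *	parent:	... -> dotdotaddref (MKDIR_PARENT)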
4415 */ 4416 void 4417 softdep_setup_mkdir(dp, ip) 4418 struct inode *dp; 4419 struct inode *ip; 4420 { 4421 struct inodedep *inodedep; 4422 struct jaddref *dotdotaddref; 4423 struct jaddref *dotaddref; 4424 struct jaddref *jaddref; 4425 struct vnode *dvp; 4426 4427 dvp = ITOV(dp); 4428 dotaddref = dotdotaddref = NULL; 4429 if (DOINGSUJ(dvp)) { 4430 dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, 4431 ip->i_mode); 4432 dotaddref->ja_state |= MKDIR_BODY; 4433 dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, 4434 dp->i_effnlink - 1, dp->i_mode); 4435 dotdotaddref->ja_state |= MKDIR_PARENT; 4436 } 4437 ACQUIRE_LOCK(&lk); 4438 inodedep = inodedep_lookup_ip(ip); 4439 if (DOINGSUJ(dvp)) { 4440 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4441 inoreflst); 4442 KASSERT(jaddref != NULL, 4443 ("softdep_setup_mkdir: No addref structure present.")); 4444 KASSERT(jaddref->ja_parent == dp->i_number, 4445 ("softdep_setup_mkdir: bad parent %d", 4446 jaddref->ja_parent)); 4447 TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, 4448 if_deps); 4449 } 4450 inodedep = inodedep_lookup_ip(dp); 4451 if (DOINGSUJ(dvp)) 4452 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 4453 &dotdotaddref->ja_ref, if_deps); 4454 softdep_prelink(ITOV(dp), NULL); 4455 FREE_LOCK(&lk); 4456 } 4457 4458 /* 4459 * Called to track nlinkdelta of the inode and parent directories prior to 4460 * unlinking a directory. 4461 */ 4462 void 4463 softdep_setup_rmdir(dp, ip) 4464 struct inode *dp; 4465 struct inode *ip; 4466 { 4467 struct vnode *dvp; 4468 4469 dvp = ITOV(dp); 4470 ACQUIRE_LOCK(&lk); 4471 (void) inodedep_lookup_ip(ip); 4472 (void) inodedep_lookup_ip(dp); 4473 softdep_prelink(dvp, ITOV(ip)); 4474 FREE_LOCK(&lk); 4475 } 4476 4477 /* 4478 * Called to track nlinkdelta of the inode and parent directories prior to 4479 * unlink. 4480 */ 4481 void 4482 softdep_setup_unlink(dp, ip) 4483 struct inode *dp; 4484 struct inode *ip; 4485 { 4486 struct vnode *dvp; 4487 4488 dvp = ITOV(dp); 4489 ACQUIRE_LOCK(&lk); 4490 (void) inodedep_lookup_ip(ip); 4491 (void) inodedep_lookup_ip(dp); 4492 softdep_prelink(dvp, ITOV(ip)); 4493 FREE_LOCK(&lk); 4494 } 4495 4496 /* 4497 * Called to release the journal structures created by a failed non-directory 4498 * creation. Adjusts nlinkdelta for non-journaling softdep. 4499 */ 4500 void 4501 softdep_revert_create(dp, ip) 4502 struct inode *dp; 4503 struct inode *ip; 4504 { 4505 struct inodedep *inodedep; 4506 struct jaddref *jaddref; 4507 struct vnode *dvp; 4508 4509 dvp = ITOV(dp); 4510 ACQUIRE_LOCK(&lk); 4511 inodedep = inodedep_lookup_ip(ip); 4512 if (DOINGSUJ(dvp)) { 4513 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4514 inoreflst); 4515 KASSERT(jaddref->ja_parent == dp->i_number, 4516 ("softdep_revert_create: addref parent mismatch")); 4517 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4518 } 4519 FREE_LOCK(&lk); 4520 } 4521 4522 /* 4523 * Called to release the journal structures created by a failed dotdot link 4524 * creation. Adjusts nlinkdelta for non-journaling softdep. 
4525 */ 4526 void 4527 softdep_revert_dotdot_link(dp, ip) 4528 struct inode *dp; 4529 struct inode *ip; 4530 { 4531 struct inodedep *inodedep; 4532 struct jaddref *jaddref; 4533 struct vnode *dvp; 4534 4535 dvp = ITOV(dp); 4536 ACQUIRE_LOCK(&lk); 4537 inodedep = inodedep_lookup_ip(dp); 4538 if (DOINGSUJ(dvp)) { 4539 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4540 inoreflst); 4541 KASSERT(jaddref->ja_parent == ip->i_number, 4542 ("softdep_revert_dotdot_link: addref parent mismatch")); 4543 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4544 } 4545 FREE_LOCK(&lk); 4546 } 4547 4548 /* 4549 * Called to release the journal structures created by a failed link 4550 * addition. Adjusts nlinkdelta for non-journaling softdep. 4551 */ 4552 void 4553 softdep_revert_link(dp, ip) 4554 struct inode *dp; 4555 struct inode *ip; 4556 { 4557 struct inodedep *inodedep; 4558 struct jaddref *jaddref; 4559 struct vnode *dvp; 4560 4561 dvp = ITOV(dp); 4562 ACQUIRE_LOCK(&lk); 4563 inodedep = inodedep_lookup_ip(ip); 4564 if (DOINGSUJ(dvp)) { 4565 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4566 inoreflst); 4567 KASSERT(jaddref->ja_parent == dp->i_number, 4568 ("softdep_revert_link: addref parent mismatch")); 4569 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4570 } 4571 FREE_LOCK(&lk); 4572 } 4573 4574 /* 4575 * Called to release the journal structures created by a failed mkdir 4576 * attempt. Adjusts nlinkdelta for non-journaling softdep. 4577 */ 4578 void 4579 softdep_revert_mkdir(dp, ip) 4580 struct inode *dp; 4581 struct inode *ip; 4582 { 4583 struct inodedep *inodedep; 4584 struct jaddref *jaddref; 4585 struct jaddref *dotaddref; 4586 struct vnode *dvp; 4587 4588 dvp = ITOV(dp); 4589 4590 ACQUIRE_LOCK(&lk); 4591 inodedep = inodedep_lookup_ip(dp); 4592 if (DOINGSUJ(dvp)) { 4593 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4594 inoreflst); 4595 KASSERT(jaddref->ja_parent == ip->i_number, 4596 ("softdep_revert_mkdir: dotdot addref parent mismatch")); 4597 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4598 } 4599 inodedep = inodedep_lookup_ip(ip); 4600 if (DOINGSUJ(dvp)) { 4601 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 4602 inoreflst); 4603 KASSERT(jaddref->ja_parent == dp->i_number, 4604 ("softdep_revert_mkdir: addref parent mismatch")); 4605 dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 4606 inoreflst, if_deps); 4607 cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); 4608 KASSERT(dotaddref->ja_parent == ip->i_number, 4609 ("softdep_revert_mkdir: dot addref parent mismatch")); 4610 cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); 4611 } 4612 FREE_LOCK(&lk); 4613 } 4614 4615 /* 4616 * Called to correct nlinkdelta after a failed rmdir. 4617 */ 4618 void 4619 softdep_revert_rmdir(dp, ip) 4620 struct inode *dp; 4621 struct inode *ip; 4622 { 4623 4624 ACQUIRE_LOCK(&lk); 4625 (void) inodedep_lookup_ip(ip); 4626 (void) inodedep_lookup_ip(dp); 4627 FREE_LOCK(&lk); 4628 } 4629 4630 /* 4631 * Protecting the freemaps (or bitmaps). 4632 * 4633 * To eliminate the need to execute fsck before mounting a filesystem 4634 * after a power failure, one must (conservatively) guarantee that the 4635 * on-disk copy of the bitmaps never indicate that a live inode or block is 4636 * free. So, when a block or inode is allocated, the bitmap should be 4637 * updated (on disk) before any new pointers. 
When a block or inode is 4638 * freed, the bitmap should not be updated until all pointers have been 4639 * reset. The latter dependency is handled by the delayed de-allocation 4640 * approach described below for block and inode de-allocation. The former 4641 * dependency is handled by calling the following procedure when a block or 4642 * inode is allocated. When an inode is allocated an "inodedep" is created 4643 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. 4644 * Each "inodedep" is also inserted into the hash indexing structure so 4645 * that any additional link additions can be made dependent on the inode 4646 * allocation. 4647 * 4648 * The ufs filesystem maintains a number of free block counts (e.g., per 4649 * cylinder group, per cylinder and per <cylinder, rotational position> pair) 4650 * in addition to the bitmaps. These counts are used to improve efficiency 4651 * during allocation and therefore must be consistent with the bitmaps. 4652 * There is no convenient way to guarantee post-crash consistency of these 4653 * counts with simple update ordering, for two main reasons: (1) The counts 4654 * and bitmaps for a single cylinder group block are not in the same disk 4655 * sector. If a disk write is interrupted (e.g., by power failure), one may 4656 * be written and the other not. (2) Some of the counts are located in the 4657 * superblock rather than the cylinder group block. So, we focus our soft 4658 * updates implementation on protecting the bitmaps. When mounting a 4659 * filesystem, we recompute the auxiliary counts from the bitmaps. 4660 */ 4661 4662 /* 4663 * Called just after updating the cylinder group block to allocate an inode. 4664 */ 4665 void 4666 softdep_setup_inomapdep(bp, ip, newinum, mode) 4667 struct buf *bp; /* buffer for cylgroup block with inode map */ 4668 struct inode *ip; /* inode related to allocation */ 4669 ino_t newinum; /* new inode number being allocated */ 4670 int mode; 4671 { 4672 struct inodedep *inodedep; 4673 struct bmsafemap *bmsafemap; 4674 struct jaddref *jaddref; 4675 struct mount *mp; 4676 struct fs *fs; 4677 4678 mp = UFSTOVFS(ip->i_ump); 4679 fs = ip->i_ump->um_fs; 4680 jaddref = NULL; 4681 4682 /* 4683 * Allocate the journal reference add structure so that the bitmap 4684 * can be dependent on it. 4685 */ 4686 if (MOUNTEDSUJ(mp)) { 4687 jaddref = newjaddref(ip, newinum, 0, 0, mode); 4688 jaddref->ja_state |= NEWBLOCK; 4689 } 4690 4691 /* 4692 * Create a dependency for the newly allocated inode. 4693 * Panic if it already exists as something is seriously wrong. 4694 * Otherwise add it to the dependency list for the buffer holding 4695 * the cylinder group map from which it was allocated. 4696 */ 4697 ACQUIRE_LOCK(&lk); 4698 if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep))) 4699 panic("softdep_setup_inomapdep: dependency %p for new" 4700 "inode already exists", inodedep); 4701 bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum)); 4702 if (jaddref) { 4703 LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); 4704 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, 4705 if_deps); 4706 } else { 4707 inodedep->id_state |= ONDEPLIST; 4708 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); 4709 } 4710 inodedep->id_bmsafemap = bmsafemap; 4711 inodedep->id_state &= ~DEPCOMPLETE; 4712 FREE_LOCK(&lk); 4713 } 4714 4715 /* 4716 * Called just after updating the cylinder group block to 4717 * allocate block or fragment. 
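 *
 * For illustration, a block allocator is expected to call this while it
 * still holds the locked cylinder group buffer, roughly
 *
 *	... clear the bits for newblkno in cg_blksfree(cgp) ...
 *	softdep_setup_blkmapdep(bp, UFSTOVFS(ump), newblkno, frags, 0);
 *	bdwrite(bp);
 *
 * so that, when journaling, the allocation cannot appear in the on-disk
 * bitmap before its jnewblk record has been written.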
4718 */ 4719 void 4720 softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) 4721 struct buf *bp; /* buffer for cylgroup block with block map */ 4722 struct mount *mp; /* filesystem doing allocation */ 4723 ufs2_daddr_t newblkno; /* number of newly allocated block */ 4724 int frags; /* Number of fragments. */ 4725 int oldfrags; /* Previous number of fragments for extend. */ 4726 { 4727 struct newblk *newblk; 4728 struct bmsafemap *bmsafemap; 4729 struct jnewblk *jnewblk; 4730 struct fs *fs; 4731 4732 fs = VFSTOUFS(mp)->um_fs; 4733 jnewblk = NULL; 4734 /* 4735 * Create a dependency for the newly allocated block. 4736 * Add it to the dependency list for the buffer holding 4737 * the cylinder group map from which it was allocated. 4738 */ 4739 if (MOUNTEDSUJ(mp)) { 4740 jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); 4741 workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); 4742 jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); 4743 jnewblk->jn_state = ATTACHED; 4744 jnewblk->jn_blkno = newblkno; 4745 jnewblk->jn_frags = frags; 4746 jnewblk->jn_oldfrags = oldfrags; 4747 #ifdef SUJ_DEBUG 4748 { 4749 struct cg *cgp; 4750 uint8_t *blksfree; 4751 long bno; 4752 int i; 4753 4754 cgp = (struct cg *)bp->b_data; 4755 blksfree = cg_blksfree(cgp); 4756 bno = dtogd(fs, jnewblk->jn_blkno); 4757 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; 4758 i++) { 4759 if (isset(blksfree, bno + i)) 4760 panic("softdep_setup_blkmapdep: " 4761 "free fragment %d from %d-%d " 4762 "state 0x%X dep %p", i, 4763 jnewblk->jn_oldfrags, 4764 jnewblk->jn_frags, 4765 jnewblk->jn_state, 4766 jnewblk->jn_dep); 4767 } 4768 } 4769 #endif 4770 } 4771 ACQUIRE_LOCK(&lk); 4772 if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) 4773 panic("softdep_setup_blkmapdep: found block"); 4774 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, 4775 dtog(fs, newblkno)); 4776 if (jnewblk) { 4777 jnewblk->jn_dep = (struct worklist *)newblk; 4778 LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); 4779 } else { 4780 newblk->nb_state |= ONDEPLIST; 4781 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); 4782 } 4783 newblk->nb_bmsafemap = bmsafemap; 4784 newblk->nb_jnewblk = jnewblk; 4785 FREE_LOCK(&lk); 4786 } 4787 4788 #define BMSAFEMAP_HASH(fs, cg) \ 4789 (&bmsafemap_hashtbl[((((register_t)(fs)) >> 13) + (cg)) & bmsafemap_hash]) 4790 4791 static int 4792 bmsafemap_find(bmsafemaphd, mp, cg, bmsafemapp) 4793 struct bmsafemap_hashhead *bmsafemaphd; 4794 struct mount *mp; 4795 int cg; 4796 struct bmsafemap **bmsafemapp; 4797 { 4798 struct bmsafemap *bmsafemap; 4799 4800 LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) 4801 if (bmsafemap->sm_list.wk_mp == mp && bmsafemap->sm_cg == cg) 4802 break; 4803 if (bmsafemap) { 4804 *bmsafemapp = bmsafemap; 4805 return (1); 4806 } 4807 *bmsafemapp = NULL; 4808 4809 return (0); 4810 } 4811 4812 /* 4813 * Find the bmsafemap associated with a cylinder group buffer. 4814 * If none exists, create one. The buffer must be locked when 4815 * this routine is called and this routine must be called with 4816 * splbio interrupts blocked. 
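 *
 * For illustration, the two bitmap hooks above resolve their cylinder
 * group this way before linking their dependencies onto it:
 *
 *	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
 *	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, newblkno));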
4817 */ 4818 static struct bmsafemap * 4819 bmsafemap_lookup(mp, bp, cg) 4820 struct mount *mp; 4821 struct buf *bp; 4822 int cg; 4823 { 4824 struct bmsafemap_hashhead *bmsafemaphd; 4825 struct bmsafemap *bmsafemap, *collision; 4826 struct worklist *wk; 4827 struct fs *fs; 4828 4829 mtx_assert(&lk, MA_OWNED); 4830 if (bp) 4831 LIST_FOREACH(wk, &bp->b_dep, wk_list) 4832 if (wk->wk_type == D_BMSAFEMAP) 4833 return (WK_BMSAFEMAP(wk)); 4834 fs = VFSTOUFS(mp)->um_fs; 4835 bmsafemaphd = BMSAFEMAP_HASH(fs, cg); 4836 if (bmsafemap_find(bmsafemaphd, mp, cg, &bmsafemap) == 1) 4837 return (bmsafemap); 4838 FREE_LOCK(&lk); 4839 bmsafemap = malloc(sizeof(struct bmsafemap), 4840 M_BMSAFEMAP, M_SOFTDEP_FLAGS); 4841 workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); 4842 bmsafemap->sm_buf = bp; 4843 LIST_INIT(&bmsafemap->sm_inodedephd); 4844 LIST_INIT(&bmsafemap->sm_inodedepwr); 4845 LIST_INIT(&bmsafemap->sm_newblkhd); 4846 LIST_INIT(&bmsafemap->sm_newblkwr); 4847 LIST_INIT(&bmsafemap->sm_jaddrefhd); 4848 LIST_INIT(&bmsafemap->sm_jnewblkhd); 4849 LIST_INIT(&bmsafemap->sm_freehd); 4850 LIST_INIT(&bmsafemap->sm_freewr); 4851 ACQUIRE_LOCK(&lk); 4852 if (bmsafemap_find(bmsafemaphd, mp, cg, &collision) == 1) { 4853 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 4854 return (collision); 4855 } 4856 bmsafemap->sm_cg = cg; 4857 LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); 4858 LIST_INSERT_HEAD(&VFSTOUFS(mp)->softdep_dirtycg, bmsafemap, sm_next); 4859 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); 4860 return (bmsafemap); 4861 } 4862 4863 /* 4864 * Direct block allocation dependencies. 4865 * 4866 * When a new block is allocated, the corresponding disk locations must be 4867 * initialized (with zeros or new data) before the on-disk inode points to 4868 * them. Also, the freemap from which the block was allocated must be 4869 * updated (on disk) before the inode's pointer. These two dependencies are 4870 * independent of each other and are needed for all file blocks and indirect 4871 * blocks that are pointed to directly by the inode. Just before the 4872 * "in-core" version of the inode is updated with a newly allocated block 4873 * number, a procedure (below) is called to setup allocation dependency 4874 * structures. These structures are removed when the corresponding 4875 * dependencies are satisfied or when the block allocation becomes obsolete 4876 * (i.e., the file is deleted, the block is de-allocated, or the block is a 4877 * fragment that gets upgraded). All of these cases are handled in 4878 * procedures described later. 4879 * 4880 * When a file extension causes a fragment to be upgraded, either to a larger 4881 * fragment or to a full block, the on-disk location may change (if the 4882 * previous fragment could not simply be extended). In this case, the old 4883 * fragment must be de-allocated, but not until after the inode's pointer has 4884 * been updated. In most cases, this is handled by later procedures, which 4885 * will construct a "freefrag" structure to be added to the workitem queue 4886 * when the inode update is complete (or obsolete). The main exception to 4887 * this is when an allocation occurs while a pending allocation dependency 4888 * (for the same block pointer) remains. This case is handled in the main 4889 * allocation dependency setup procedure by immediately freeing the 4890 * unreferenced fragments. 
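 *
 * For illustration, a rough sketch of the write ordering the structures
 * below enforce for an ffs_balloc()-style caller:
 *
 *	bp = getblk(vp, lbn, size, 0, 0, 0);	new block contents
 *	softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno,
 *	    newsize, oldsize, bp);
 *	DIP_SET(ip, i_db[lbn], newblkno);	in-core pointer updated
 *
 * The on-disk copy of the inode is then rolled back or held until both bp
 * and the cylinder group bitmap containing newblkno have been written.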
4891 */
4892 void
4893 softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
4894 struct inode *ip; /* inode to which block is being added */
4895 ufs_lbn_t off; /* block pointer within inode */
4896 ufs2_daddr_t newblkno; /* disk block number being added */
4897 ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
4898 long newsize; /* size of new block */
4899 long oldsize; /* size of old block */
4900 struct buf *bp; /* bp for allocated block */
4901 {
4902 struct allocdirect *adp, *oldadp;
4903 struct allocdirectlst *adphead;
4904 struct freefrag *freefrag;
4905 struct inodedep *inodedep;
4906 struct pagedep *pagedep;
4907 struct jnewblk *jnewblk;
4908 struct newblk *newblk;
4909 struct mount *mp;
4910 ufs_lbn_t lbn;
4911 
4912 lbn = bp->b_lblkno;
4913 mp = UFSTOVFS(ip->i_ump);
4914 if (oldblkno && oldblkno != newblkno)
4915 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
4916 else
4917 freefrag = NULL;
4918 
4919 ACQUIRE_LOCK(&lk);
4920 if (off >= NDADDR) {
4921 if (lbn > 0)
4922 panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
4923 lbn, off);
4924 /* allocating an indirect block */
4925 if (oldblkno != 0)
4926 panic("softdep_setup_allocdirect: non-zero indir");
4927 } else {
4928 if (off != lbn)
4929 panic("softdep_setup_allocdirect: lbn %jd != off %jd",
4930 lbn, off);
4931 /*
4932 * Allocating a direct block.
4933 *
4934 * If we are allocating a directory block, then we must
4935 * allocate an associated pagedep to track additions and
4936 * deletions.
4937 */
4938 if ((ip->i_mode & IFMT) == IFDIR)
4939 pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
4940 &pagedep);
4941 }
4942 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
4943 panic("softdep_setup_allocdirect: lost block");
4944 KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
4945 ("softdep_setup_allocdirect: newblk already initialized"));
4946 /*
4947 * Convert the newblk to an allocdirect.
4948 */
4949 newblk->nb_list.wk_type = D_ALLOCDIRECT;
4950 adp = (struct allocdirect *)newblk;
4951 newblk->nb_freefrag = freefrag;
4952 adp->ad_offset = off;
4953 adp->ad_oldblkno = oldblkno;
4954 adp->ad_newsize = newsize;
4955 adp->ad_oldsize = oldsize;
4956 
4957 /*
4958 * Finish initializing the journal.
4959 */
4960 if ((jnewblk = newblk->nb_jnewblk) != NULL) {
4961 jnewblk->jn_ino = ip->i_number;
4962 jnewblk->jn_lbn = lbn;
4963 add_to_journal(&jnewblk->jn_list);
4964 }
4965 if (freefrag && freefrag->ff_jdep != NULL &&
4966 freefrag->ff_jdep->wk_type == D_JFREEFRAG)
4967 add_to_journal(freefrag->ff_jdep);
4968 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
4969 adp->ad_inodedep = inodedep;
4970 
4971 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
4972 /*
4973 * The list of allocdirects must be kept in sorted and ascending
4974 * order so that the rollback routines can quickly determine the
4975 * first uncommitted block (the size of the file stored on disk
4976 * ends at the end of the lowest committed fragment, or if there
4977 * are no fragments, at the end of the highest committed block).
4978 * Since files generally grow, the typical case is that the new
4979 * block is to be added at the end of the list. We speed this
4980 * special case by checking against the last allocdirect in the
4981 * list before laboriously traversing the list looking for the
4982 * insertion point.
4983 */ 4984 adphead = &inodedep->id_newinoupdt; 4985 oldadp = TAILQ_LAST(adphead, allocdirectlst); 4986 if (oldadp == NULL || oldadp->ad_offset <= off) { 4987 /* insert at end of list */ 4988 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 4989 if (oldadp != NULL && oldadp->ad_offset == off) 4990 allocdirect_merge(adphead, adp, oldadp); 4991 FREE_LOCK(&lk); 4992 return; 4993 } 4994 TAILQ_FOREACH(oldadp, adphead, ad_next) { 4995 if (oldadp->ad_offset >= off) 4996 break; 4997 } 4998 if (oldadp == NULL) 4999 panic("softdep_setup_allocdirect: lost entry"); 5000 /* insert in middle of list */ 5001 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5002 if (oldadp->ad_offset == off) 5003 allocdirect_merge(adphead, adp, oldadp); 5004 5005 FREE_LOCK(&lk); 5006 } 5007 5008 /* 5009 * Merge a newer and older journal record to be stored either in a 5010 * newblock or freefrag. This handles aggregating journal records for 5011 * fragment allocation into a second record as well as replacing a 5012 * journal free with an aborted journal allocation. A segment for the 5013 * oldest record will be placed on wkhd if it has been written. If not 5014 * the segment for the newer record will suffice. 5015 */ 5016 static struct worklist * 5017 jnewblk_merge(new, old, wkhd) 5018 struct worklist *new; 5019 struct worklist *old; 5020 struct workhead *wkhd; 5021 { 5022 struct jnewblk *njnewblk; 5023 struct jnewblk *jnewblk; 5024 5025 /* Handle NULLs to simplify callers. */ 5026 if (new == NULL) 5027 return (old); 5028 if (old == NULL) 5029 return (new); 5030 /* Replace a jfreefrag with a jnewblk. */ 5031 if (new->wk_type == D_JFREEFRAG) { 5032 cancel_jfreefrag(WK_JFREEFRAG(new)); 5033 return (old); 5034 } 5035 /* 5036 * Handle merging of two jnewblk records that describe 5037 * different sets of fragments in the same block. 5038 */ 5039 jnewblk = WK_JNEWBLK(old); 5040 njnewblk = WK_JNEWBLK(new); 5041 if (jnewblk->jn_blkno != njnewblk->jn_blkno) 5042 panic("jnewblk_merge: Merging disparate blocks."); 5043 /* 5044 * The record may be rolled back in the cg. 5045 */ 5046 if (jnewblk->jn_state & UNDONE) { 5047 jnewblk->jn_state &= ~UNDONE; 5048 njnewblk->jn_state |= UNDONE; 5049 njnewblk->jn_state &= ~ATTACHED; 5050 } 5051 /* 5052 * We modify the newer addref and free the older so that if neither 5053 * has been written the most up-to-date copy will be on disk. If 5054 * both have been written but rolled back we only temporarily need 5055 * one of them to fix the bits when the cg write completes. 5056 */ 5057 jnewblk->jn_state |= ATTACHED | COMPLETE; 5058 njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; 5059 cancel_jnewblk(jnewblk, wkhd); 5060 WORKLIST_REMOVE(&jnewblk->jn_list); 5061 free_jnewblk(jnewblk); 5062 return (new); 5063 } 5064 5065 /* 5066 * Replace an old allocdirect dependency with a newer one. 5067 * This routine must be called with splbio interrupts blocked. 
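 *
 * For illustration, this happens when the same logical block is
 * reallocated before the inode reaches the disk, e.g. a 2048 byte
 * fragment later extended to a full 8192 byte block:
 *
 *	softdep_setup_allocdirect(ip, 2, blk1,    0, 2048,    0, bp1);
 *	softdep_setup_allocdirect(ip, 2, blk2, blk1, 8192, 2048, bp2);
 *
 * The second call finds the first allocdirect at offset 2 and merges it
 * here, inheriting its rollback information (old block number and size).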
5068 */ 5069 static void 5070 allocdirect_merge(adphead, newadp, oldadp) 5071 struct allocdirectlst *adphead; /* head of list holding allocdirects */ 5072 struct allocdirect *newadp; /* allocdirect being added */ 5073 struct allocdirect *oldadp; /* existing allocdirect being checked */ 5074 { 5075 struct worklist *wk; 5076 struct freefrag *freefrag; 5077 5078 freefrag = NULL; 5079 mtx_assert(&lk, MA_OWNED); 5080 if (newadp->ad_oldblkno != oldadp->ad_newblkno || 5081 newadp->ad_oldsize != oldadp->ad_newsize || 5082 newadp->ad_offset >= NDADDR) 5083 panic("%s %jd != new %jd || old size %ld != new %ld", 5084 "allocdirect_merge: old blkno", 5085 (intmax_t)newadp->ad_oldblkno, 5086 (intmax_t)oldadp->ad_newblkno, 5087 newadp->ad_oldsize, oldadp->ad_newsize); 5088 newadp->ad_oldblkno = oldadp->ad_oldblkno; 5089 newadp->ad_oldsize = oldadp->ad_oldsize; 5090 /* 5091 * If the old dependency had a fragment to free or had never 5092 * previously had a block allocated, then the new dependency 5093 * can immediately post its freefrag and adopt the old freefrag. 5094 * This action is done by swapping the freefrag dependencies. 5095 * The new dependency gains the old one's freefrag, and the 5096 * old one gets the new one and then immediately puts it on 5097 * the worklist when it is freed by free_newblk. It is 5098 * not possible to do this swap when the old dependency had a 5099 * non-zero size but no previous fragment to free. This condition 5100 * arises when the new block is an extension of the old block. 5101 * Here, the first part of the fragment allocated to the new 5102 * dependency is part of the block currently claimed on disk by 5103 * the old dependency, so cannot legitimately be freed until the 5104 * conditions for the new dependency are fulfilled. 5105 */ 5106 freefrag = newadp->ad_freefrag; 5107 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { 5108 newadp->ad_freefrag = oldadp->ad_freefrag; 5109 oldadp->ad_freefrag = freefrag; 5110 } 5111 /* 5112 * If we are tracking a new directory-block allocation, 5113 * move it from the old allocdirect to the new allocdirect. 5114 */ 5115 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { 5116 WORKLIST_REMOVE(wk); 5117 if (!LIST_EMPTY(&oldadp->ad_newdirblk)) 5118 panic("allocdirect_merge: extra newdirblk"); 5119 WORKLIST_INSERT(&newadp->ad_newdirblk, wk); 5120 } 5121 TAILQ_REMOVE(adphead, oldadp, ad_next); 5122 /* 5123 * We need to move any journal dependencies over to the freefrag 5124 * that releases this block if it exists. Otherwise we are 5125 * extending an existing block and we'll wait until that is 5126 * complete to release the journal space and extend the 5127 * new journal to cover this old space as well. 
5128 */ 5129 if (freefrag == NULL) { 5130 if (oldadp->ad_newblkno != newadp->ad_newblkno) 5131 panic("allocdirect_merge: %jd != %jd", 5132 oldadp->ad_newblkno, newadp->ad_newblkno); 5133 newadp->ad_block.nb_jnewblk = (struct jnewblk *) 5134 jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, 5135 &oldadp->ad_block.nb_jnewblk->jn_list, 5136 &newadp->ad_block.nb_jwork); 5137 oldadp->ad_block.nb_jnewblk = NULL; 5138 cancel_newblk(&oldadp->ad_block, NULL, 5139 &newadp->ad_block.nb_jwork); 5140 } else { 5141 wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, 5142 &freefrag->ff_list, &freefrag->ff_jwork); 5143 freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, 5144 &freefrag->ff_jwork); 5145 } 5146 free_newblk(&oldadp->ad_block); 5147 } 5148 5149 /* 5150 * Allocate a jfreefrag structure to journal a single block free. 5151 */ 5152 static struct jfreefrag * 5153 newjfreefrag(freefrag, ip, blkno, size, lbn) 5154 struct freefrag *freefrag; 5155 struct inode *ip; 5156 ufs2_daddr_t blkno; 5157 long size; 5158 ufs_lbn_t lbn; 5159 { 5160 struct jfreefrag *jfreefrag; 5161 struct fs *fs; 5162 5163 fs = ip->i_fs; 5164 jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, 5165 M_SOFTDEP_FLAGS); 5166 workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); 5167 jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); 5168 jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; 5169 jfreefrag->fr_ino = ip->i_number; 5170 jfreefrag->fr_lbn = lbn; 5171 jfreefrag->fr_blkno = blkno; 5172 jfreefrag->fr_frags = numfrags(fs, size); 5173 jfreefrag->fr_freefrag = freefrag; 5174 5175 return (jfreefrag); 5176 } 5177 5178 /* 5179 * Allocate a new freefrag structure. 5180 */ 5181 static struct freefrag * 5182 newfreefrag(ip, blkno, size, lbn) 5183 struct inode *ip; 5184 ufs2_daddr_t blkno; 5185 long size; 5186 ufs_lbn_t lbn; 5187 { 5188 struct freefrag *freefrag; 5189 struct fs *fs; 5190 5191 fs = ip->i_fs; 5192 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) 5193 panic("newfreefrag: frag size"); 5194 freefrag = malloc(sizeof(struct freefrag), 5195 M_FREEFRAG, M_SOFTDEP_FLAGS); 5196 workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); 5197 freefrag->ff_state = ATTACHED; 5198 LIST_INIT(&freefrag->ff_jwork); 5199 freefrag->ff_inum = ip->i_number; 5200 freefrag->ff_vtype = ITOV(ip)->v_type; 5201 freefrag->ff_blkno = blkno; 5202 freefrag->ff_fragsize = size; 5203 5204 if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { 5205 freefrag->ff_jdep = (struct worklist *) 5206 newjfreefrag(freefrag, ip, blkno, size, lbn); 5207 } else { 5208 freefrag->ff_state |= DEPCOMPLETE; 5209 freefrag->ff_jdep = NULL; 5210 } 5211 5212 return (freefrag); 5213 } 5214 5215 /* 5216 * This workitem de-allocates fragments that were replaced during 5217 * file block allocation. 5218 */ 5219 static void 5220 handle_workitem_freefrag(freefrag) 5221 struct freefrag *freefrag; 5222 { 5223 struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); 5224 struct workhead wkhd; 5225 5226 /* 5227 * It would be illegal to add new completion items to the 5228 * freefrag after it was schedule to be done so it must be 5229 * safe to modify the list head here. 5230 */ 5231 LIST_INIT(&wkhd); 5232 ACQUIRE_LOCK(&lk); 5233 LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); 5234 /* 5235 * If the journal has not been written we must cancel it here. 
5236 */ 5237 if (freefrag->ff_jdep) { 5238 if (freefrag->ff_jdep->wk_type != D_JNEWBLK) 5239 panic("handle_workitem_freefrag: Unexpected type %d\n", 5240 freefrag->ff_jdep->wk_type); 5241 cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); 5242 } 5243 FREE_LOCK(&lk); 5244 ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, 5245 freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); 5246 ACQUIRE_LOCK(&lk); 5247 WORKITEM_FREE(freefrag, D_FREEFRAG); 5248 FREE_LOCK(&lk); 5249 } 5250 5251 /* 5252 * Set up a dependency structure for an external attributes data block. 5253 * This routine follows much of the structure of softdep_setup_allocdirect. 5254 * See the description of softdep_setup_allocdirect above for details. 5255 */ 5256 void 5257 softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) 5258 struct inode *ip; 5259 ufs_lbn_t off; 5260 ufs2_daddr_t newblkno; 5261 ufs2_daddr_t oldblkno; 5262 long newsize; 5263 long oldsize; 5264 struct buf *bp; 5265 { 5266 struct allocdirect *adp, *oldadp; 5267 struct allocdirectlst *adphead; 5268 struct freefrag *freefrag; 5269 struct inodedep *inodedep; 5270 struct jnewblk *jnewblk; 5271 struct newblk *newblk; 5272 struct mount *mp; 5273 ufs_lbn_t lbn; 5274 5275 if (off >= NXADDR) 5276 panic("softdep_setup_allocext: lbn %lld > NXADDR", 5277 (long long)off); 5278 5279 lbn = bp->b_lblkno; 5280 mp = UFSTOVFS(ip->i_ump); 5281 if (oldblkno && oldblkno != newblkno) 5282 freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); 5283 else 5284 freefrag = NULL; 5285 5286 ACQUIRE_LOCK(&lk); 5287 if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) 5288 panic("softdep_setup_allocext: lost block"); 5289 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5290 ("softdep_setup_allocext: newblk already initialized")); 5291 /* 5292 * Convert the newblk to an allocdirect. 5293 */ 5294 newblk->nb_list.wk_type = D_ALLOCDIRECT; 5295 adp = (struct allocdirect *)newblk; 5296 newblk->nb_freefrag = freefrag; 5297 adp->ad_offset = off; 5298 adp->ad_oldblkno = oldblkno; 5299 adp->ad_newsize = newsize; 5300 adp->ad_oldsize = oldsize; 5301 adp->ad_state |= EXTDATA; 5302 5303 /* 5304 * Finish initializing the journal. 5305 */ 5306 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5307 jnewblk->jn_ino = ip->i_number; 5308 jnewblk->jn_lbn = lbn; 5309 add_to_journal(&jnewblk->jn_list); 5310 } 5311 if (freefrag && freefrag->ff_jdep != NULL && 5312 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5313 add_to_journal(freefrag->ff_jdep); 5314 inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); 5315 adp->ad_inodedep = inodedep; 5316 5317 WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); 5318 /* 5319 * The list of allocdirects must be kept in sorted and ascending 5320 * order so that the rollback routines can quickly determine the 5321 * first uncommitted block (the size of the file stored on disk 5322 * ends at the end of the lowest committed fragment, or if there 5323 * are no fragments, at the end of the highest committed block). 5324 * Since files generally grow, the typical case is that the new 5325 * block is to be added at the end of the list. We speed this 5326 * special case by checking against the last allocdirect in the 5327 * list before laboriously traversing the list looking for the 5328 * insertion point. 
5329 */ 5330 adphead = &inodedep->id_newextupdt; 5331 oldadp = TAILQ_LAST(adphead, allocdirectlst); 5332 if (oldadp == NULL || oldadp->ad_offset <= off) { 5333 /* insert at end of list */ 5334 TAILQ_INSERT_TAIL(adphead, adp, ad_next); 5335 if (oldadp != NULL && oldadp->ad_offset == off) 5336 allocdirect_merge(adphead, adp, oldadp); 5337 FREE_LOCK(&lk); 5338 return; 5339 } 5340 TAILQ_FOREACH(oldadp, adphead, ad_next) { 5341 if (oldadp->ad_offset >= off) 5342 break; 5343 } 5344 if (oldadp == NULL) 5345 panic("softdep_setup_allocext: lost entry"); 5346 /* insert in middle of list */ 5347 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); 5348 if (oldadp->ad_offset == off) 5349 allocdirect_merge(adphead, adp, oldadp); 5350 FREE_LOCK(&lk); 5351 } 5352 5353 /* 5354 * Indirect block allocation dependencies. 5355 * 5356 * The same dependencies that exist for a direct block also exist when 5357 * a new block is allocated and pointed to by an entry in a block of 5358 * indirect pointers. The undo/redo states described above are also 5359 * used here. Because an indirect block contains many pointers that 5360 * may have dependencies, a second copy of the entire in-memory indirect 5361 * block is kept. The buffer cache copy is always completely up-to-date. 5362 * The second copy, which is used only as a source for disk writes, 5363 * contains only the safe pointers (i.e., those that have no remaining 5364 * update dependencies). The second copy is freed when all pointers 5365 * are safe. The cache is not allowed to replace indirect blocks with 5366 * pending update dependencies. If a buffer containing an indirect 5367 * block with dependencies is written, these routines will mark it 5368 * dirty again. It can only be successfully written once all the 5369 * dependencies are removed. The ffs_fsync routine in conjunction with 5370 * softdep_sync_metadata work together to get all the dependencies 5371 * removed so that a file can be successfully written to disk. Three 5372 * procedures are used when setting up indirect block pointer 5373 * dependencies. The division is necessary because of the organization 5374 * of the "balloc" routine and because of the distinction between file 5375 * pages and file metadata blocks. 5376 */ 5377 5378 /* 5379 * Allocate a new allocindir structure. 
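 *
 * For illustration, installing a pointer in an indirect block follows the
 * same shape as the direct case; the page routine below reduces to roughly
 *
 *	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
 *	... setup_allocindir_phase2() then links aip onto the indirdep for
 *	    the indirect buffer, whose saved copy (ir_savebp) is the one
 *	    written to disk carrying only the already-safe pointers ...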
5380 */ 5381 static struct allocindir * 5382 newallocindir(ip, ptrno, newblkno, oldblkno, lbn) 5383 struct inode *ip; /* inode for file being extended */ 5384 int ptrno; /* offset of pointer in indirect block */ 5385 ufs2_daddr_t newblkno; /* disk block number being added */ 5386 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5387 ufs_lbn_t lbn; 5388 { 5389 struct newblk *newblk; 5390 struct allocindir *aip; 5391 struct freefrag *freefrag; 5392 struct jnewblk *jnewblk; 5393 5394 if (oldblkno) 5395 freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); 5396 else 5397 freefrag = NULL; 5398 ACQUIRE_LOCK(&lk); 5399 if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) 5400 panic("new_allocindir: lost block"); 5401 KASSERT(newblk->nb_list.wk_type == D_NEWBLK, 5402 ("newallocindir: newblk already initialized")); 5403 newblk->nb_list.wk_type = D_ALLOCINDIR; 5404 newblk->nb_freefrag = freefrag; 5405 aip = (struct allocindir *)newblk; 5406 aip->ai_offset = ptrno; 5407 aip->ai_oldblkno = oldblkno; 5408 aip->ai_lbn = lbn; 5409 if ((jnewblk = newblk->nb_jnewblk) != NULL) { 5410 jnewblk->jn_ino = ip->i_number; 5411 jnewblk->jn_lbn = lbn; 5412 add_to_journal(&jnewblk->jn_list); 5413 } 5414 if (freefrag && freefrag->ff_jdep != NULL && 5415 freefrag->ff_jdep->wk_type == D_JFREEFRAG) 5416 add_to_journal(freefrag->ff_jdep); 5417 return (aip); 5418 } 5419 5420 /* 5421 * Called just before setting an indirect block pointer 5422 * to a newly allocated file page. 5423 */ 5424 void 5425 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) 5426 struct inode *ip; /* inode for file being extended */ 5427 ufs_lbn_t lbn; /* allocated block number within file */ 5428 struct buf *bp; /* buffer with indirect blk referencing page */ 5429 int ptrno; /* offset of pointer in indirect block */ 5430 ufs2_daddr_t newblkno; /* disk block number being added */ 5431 ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ 5432 struct buf *nbp; /* buffer holding allocated page */ 5433 { 5434 struct inodedep *inodedep; 5435 struct freefrag *freefrag; 5436 struct allocindir *aip; 5437 struct pagedep *pagedep; 5438 struct mount *mp; 5439 5440 if (lbn != nbp->b_lblkno) 5441 panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", 5442 lbn, bp->b_lblkno); 5443 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); 5444 mp = UFSTOVFS(ip->i_ump); 5445 aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); 5446 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 5447 /* 5448 * If we are allocating a directory page, then we must 5449 * allocate an associated pagedep to track additions and 5450 * deletions. 5451 */ 5452 if ((ip->i_mode & IFMT) == IFDIR) 5453 pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); 5454 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5455 freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); 5456 FREE_LOCK(&lk); 5457 if (freefrag) 5458 handle_workitem_freefrag(freefrag); 5459 } 5460 5461 /* 5462 * Called just before setting an indirect block pointer to a 5463 * newly allocated indirect block. 
5464 */ 5465 void 5466 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) 5467 struct buf *nbp; /* newly allocated indirect block */ 5468 struct inode *ip; /* inode for file being extended */ 5469 struct buf *bp; /* indirect block referencing allocated block */ 5470 int ptrno; /* offset of pointer in indirect block */ 5471 ufs2_daddr_t newblkno; /* disk block number being added */ 5472 { 5473 struct inodedep *inodedep; 5474 struct allocindir *aip; 5475 ufs_lbn_t lbn; 5476 5477 lbn = nbp->b_lblkno; 5478 ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); 5479 aip = newallocindir(ip, ptrno, newblkno, 0, lbn); 5480 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 5481 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); 5482 if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) 5483 panic("softdep_setup_allocindir_meta: Block already existed"); 5484 FREE_LOCK(&lk); 5485 } 5486 5487 static void 5488 indirdep_complete(indirdep) 5489 struct indirdep *indirdep; 5490 { 5491 struct allocindir *aip; 5492 5493 LIST_REMOVE(indirdep, ir_next); 5494 indirdep->ir_state |= DEPCOMPLETE; 5495 5496 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { 5497 LIST_REMOVE(aip, ai_next); 5498 free_newblk(&aip->ai_block); 5499 } 5500 /* 5501 * If this indirdep is not attached to a buf it was simply waiting 5502 * on completion to clear completehd. free_indirdep() asserts 5503 * that nothing is dangling. 5504 */ 5505 if ((indirdep->ir_state & ONWORKLIST) == 0) 5506 free_indirdep(indirdep); 5507 } 5508 5509 static struct indirdep * 5510 indirdep_lookup(mp, ip, bp) 5511 struct mount *mp; 5512 struct inode *ip; 5513 struct buf *bp; 5514 { 5515 struct indirdep *indirdep, *newindirdep; 5516 struct newblk *newblk; 5517 struct worklist *wk; 5518 struct fs *fs; 5519 ufs2_daddr_t blkno; 5520 5521 mtx_assert(&lk, MA_OWNED); 5522 indirdep = NULL; 5523 newindirdep = NULL; 5524 fs = ip->i_fs; 5525 for (;;) { 5526 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 5527 if (wk->wk_type != D_INDIRDEP) 5528 continue; 5529 indirdep = WK_INDIRDEP(wk); 5530 break; 5531 } 5532 /* Found on the buffer worklist, no new structure to free. */ 5533 if (indirdep != NULL && newindirdep == NULL) 5534 return (indirdep); 5535 if (indirdep != NULL && newindirdep != NULL) 5536 panic("indirdep_lookup: simultaneous create"); 5537 /* None found on the buffer and a new structure is ready. */ 5538 if (indirdep == NULL && newindirdep != NULL) 5539 break; 5540 /* None found and no new structure available. 
*/ 5541 FREE_LOCK(&lk); 5542 newindirdep = malloc(sizeof(struct indirdep), 5543 M_INDIRDEP, M_SOFTDEP_FLAGS); 5544 workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); 5545 newindirdep->ir_state = ATTACHED; 5546 if (ip->i_ump->um_fstype == UFS1) 5547 newindirdep->ir_state |= UFS1FMT; 5548 TAILQ_INIT(&newindirdep->ir_trunc); 5549 newindirdep->ir_saveddata = NULL; 5550 LIST_INIT(&newindirdep->ir_deplisthd); 5551 LIST_INIT(&newindirdep->ir_donehd); 5552 LIST_INIT(&newindirdep->ir_writehd); 5553 LIST_INIT(&newindirdep->ir_completehd); 5554 if (bp->b_blkno == bp->b_lblkno) { 5555 ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, 5556 NULL, NULL); 5557 bp->b_blkno = blkno; 5558 } 5559 newindirdep->ir_freeblks = NULL; 5560 newindirdep->ir_savebp = 5561 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); 5562 newindirdep->ir_bp = bp; 5563 BUF_KERNPROC(newindirdep->ir_savebp); 5564 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); 5565 ACQUIRE_LOCK(&lk); 5566 } 5567 indirdep = newindirdep; 5568 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); 5569 /* 5570 * If the block is not yet allocated we don't set DEPCOMPLETE so 5571 * that we don't free dependencies until the pointers are valid. 5572 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather 5573 * than using the hash. 5574 */ 5575 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) 5576 LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); 5577 else 5578 indirdep->ir_state |= DEPCOMPLETE; 5579 return (indirdep); 5580 } 5581 5582 /* 5583 * Called to finish the allocation of the "aip" allocated 5584 * by one of the two routines above. 5585 */ 5586 static struct freefrag * 5587 setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) 5588 struct buf *bp; /* in-memory copy of the indirect block */ 5589 struct inode *ip; /* inode for file being extended */ 5590 struct inodedep *inodedep; /* Inodedep for ip */ 5591 struct allocindir *aip; /* allocindir allocated by the above routines */ 5592 ufs_lbn_t lbn; /* Logical block number for this block. */ 5593 { 5594 struct fs *fs; 5595 struct indirdep *indirdep; 5596 struct allocindir *oldaip; 5597 struct freefrag *freefrag; 5598 struct mount *mp; 5599 5600 mtx_assert(&lk, MA_OWNED); 5601 mp = UFSTOVFS(ip->i_ump); 5602 fs = ip->i_fs; 5603 if (bp->b_lblkno >= 0) 5604 panic("setup_allocindir_phase2: not indir blk"); 5605 KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), 5606 ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); 5607 indirdep = indirdep_lookup(mp, ip, bp); 5608 KASSERT(indirdep->ir_savebp != NULL, 5609 ("setup_allocindir_phase2 NULL ir_savebp")); 5610 aip->ai_indirdep = indirdep; 5611 /* 5612 * Check for an unwritten dependency for this indirect offset. If 5613 * there is, merge the old dependency into the new one. This happens 5614 * as a result of reallocblk only. 5615 */ 5616 freefrag = NULL; 5617 if (aip->ai_oldblkno != 0) { 5618 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { 5619 if (oldaip->ai_offset == aip->ai_offset) { 5620 freefrag = allocindir_merge(aip, oldaip); 5621 goto done; 5622 } 5623 } 5624 LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { 5625 if (oldaip->ai_offset == aip->ai_offset) { 5626 freefrag = allocindir_merge(aip, oldaip); 5627 goto done; 5628 } 5629 } 5630 } 5631 done: 5632 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); 5633 return (freefrag); 5634 } 5635 5636 /* 5637 * Merge two allocindirs which refer to the same block. 
Move newblock 5638 * dependencies and setup the freefrags appropriately. 5639 */ 5640 static struct freefrag * 5641 allocindir_merge(aip, oldaip) 5642 struct allocindir *aip; 5643 struct allocindir *oldaip; 5644 { 5645 struct freefrag *freefrag; 5646 struct worklist *wk; 5647 5648 if (oldaip->ai_newblkno != aip->ai_oldblkno) 5649 panic("allocindir_merge: blkno"); 5650 aip->ai_oldblkno = oldaip->ai_oldblkno; 5651 freefrag = aip->ai_freefrag; 5652 aip->ai_freefrag = oldaip->ai_freefrag; 5653 oldaip->ai_freefrag = NULL; 5654 KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); 5655 /* 5656 * If we are tracking a new directory-block allocation, 5657 * move it from the old allocindir to the new allocindir. 5658 */ 5659 if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { 5660 WORKLIST_REMOVE(wk); 5661 if (!LIST_EMPTY(&oldaip->ai_newdirblk)) 5662 panic("allocindir_merge: extra newdirblk"); 5663 WORKLIST_INSERT(&aip->ai_newdirblk, wk); 5664 } 5665 /* 5666 * We can skip journaling for this freefrag and just complete 5667 * any pending journal work for the allocindir that is being 5668 * removed after the freefrag completes. 5669 */ 5670 if (freefrag->ff_jdep) 5671 cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); 5672 LIST_REMOVE(oldaip, ai_next); 5673 freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, 5674 &freefrag->ff_list, &freefrag->ff_jwork); 5675 free_newblk(&oldaip->ai_block); 5676 5677 return (freefrag); 5678 } 5679 5680 static inline void 5681 setup_freedirect(freeblks, ip, i, needj) 5682 struct freeblks *freeblks; 5683 struct inode *ip; 5684 int i; 5685 int needj; 5686 { 5687 ufs2_daddr_t blkno; 5688 int frags; 5689 5690 blkno = DIP(ip, i_db[i]); 5691 if (blkno == 0) 5692 return; 5693 DIP_SET(ip, i_db[i], 0); 5694 frags = sblksize(ip->i_fs, ip->i_size, i); 5695 frags = numfrags(ip->i_fs, frags); 5696 newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); 5697 } 5698 5699 static inline void 5700 setup_freeext(freeblks, ip, i, needj) 5701 struct freeblks *freeblks; 5702 struct inode *ip; 5703 int i; 5704 int needj; 5705 { 5706 ufs2_daddr_t blkno; 5707 int frags; 5708 5709 blkno = ip->i_din2->di_extb[i]; 5710 if (blkno == 0) 5711 return; 5712 ip->i_din2->di_extb[i] = 0; 5713 frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); 5714 frags = numfrags(ip->i_fs, frags); 5715 newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); 5716 } 5717 5718 static inline void 5719 setup_freeindir(freeblks, ip, i, lbn, needj) 5720 struct freeblks *freeblks; 5721 struct inode *ip; 5722 int i; 5723 ufs_lbn_t lbn; 5724 int needj; 5725 { 5726 ufs2_daddr_t blkno; 5727 5728 blkno = DIP(ip, i_ib[i]); 5729 if (blkno == 0) 5730 return; 5731 DIP_SET(ip, i_ib[i], 0); 5732 newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 5733 0, needj); 5734 } 5735 5736 static inline struct freeblks * 5737 newfreeblks(mp, ip) 5738 struct mount *mp; 5739 struct inode *ip; 5740 { 5741 struct freeblks *freeblks; 5742 5743 freeblks = malloc(sizeof(struct freeblks), 5744 M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); 5745 workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); 5746 LIST_INIT(&freeblks->fb_jblkdephd); 5747 LIST_INIT(&freeblks->fb_jwork); 5748 freeblks->fb_ref = 0; 5749 freeblks->fb_cgwait = 0; 5750 freeblks->fb_state = ATTACHED; 5751 freeblks->fb_uid = ip->i_uid; 5752 freeblks->fb_inum = ip->i_number; 5753 freeblks->fb_vtype = ITOV(ip)->v_type; 5754 freeblks->fb_modrev = DIP(ip, i_modrev); 5755 freeblks->fb_devvp = ip->i_devvp; 5756 
freeblks->fb_chkcnt = 0;
5757 freeblks->fb_len = 0;
5758
5759 return (freeblks);
5760 }
5761
5762 static void
5763 trunc_indirdep(indirdep, freeblks, bp, off)
5764 struct indirdep *indirdep;
5765 struct freeblks *freeblks;
5766 struct buf *bp;
5767 int off;
5768 {
5769 struct allocindir *aip, *aipn;
5770
5771 /*
5772 * The first set of allocindirs won't be in savedbp.
5773 */
5774 LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
5775 if (aip->ai_offset > off)
5776 cancel_allocindir(aip, bp, freeblks, 1);
5777 LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
5778 if (aip->ai_offset > off)
5779 cancel_allocindir(aip, bp, freeblks, 1);
5780 /*
5781 * These will exist in savedbp.
5782 */
5783 LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
5784 if (aip->ai_offset > off)
5785 cancel_allocindir(aip, NULL, freeblks, 0);
5786 LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
5787 if (aip->ai_offset > off)
5788 cancel_allocindir(aip, NULL, freeblks, 0);
5789 }
5790
5791 /*
5792 * Follow the chain of indirects down to lastlbn creating a freework
5793 * structure for each. This will be used to start indir_trunc() at
5794 * the right offset and create the journal records for the partial
5795 * truncation. A second step will handle the truncated dependencies.
5796 */
5797 static int
5798 setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
5799 struct freeblks *freeblks;
5800 struct inode *ip;
5801 ufs_lbn_t lbn;
5802 ufs_lbn_t lastlbn;
5803 ufs2_daddr_t blkno;
5804 {
5805 struct indirdep *indirdep;
5806 struct indirdep *indirn;
5807 struct freework *freework;
5808 struct newblk *newblk;
5809 struct mount *mp;
5810 struct buf *bp;
5811 uint8_t *start;
5812 uint8_t *end;
5813 ufs_lbn_t lbnadd;
5814 int level;
5815 int error;
5816 int off;
5817
5818
5819 freework = NULL;
5820 if (blkno == 0)
5821 return (0);
5822 mp = freeblks->fb_list.wk_mp;
5823 bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
5824 if ((bp->b_flags & B_CACHE) == 0) {
5825 bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
5826 bp->b_iocmd = BIO_READ;
5827 bp->b_flags &= ~B_INVAL;
5828 bp->b_ioflags &= ~BIO_ERROR;
5829 vfs_busy_pages(bp, 0);
5830 bp->b_iooffset = dbtob(bp->b_blkno);
5831 bstrategy(bp);
5832 curthread->td_ru.ru_inblock++;
5833 error = bufwait(bp);
5834 if (error) {
5835 brelse(bp);
5836 return (error);
5837 }
5838 }
5839 level = lbn_level(lbn);
5840 lbnadd = lbn_offset(ip->i_fs, level);
5841 /*
5842 * Compute the offset of the last block we want to keep. Store
5843 * in the freework the first block we want to completely free.
5844 */
5845 off = (lastlbn - -(lbn + level)) / lbnadd;
5846 if (off + 1 == NINDIR(ip->i_fs))
5847 goto nowork;
5848 freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
5849 0);
5850 /*
5851 * Link the freework into the indirdep. This will prevent any new
5852 * allocations from proceeding until we are finished with the
5853 * truncate and the block is written.
5854 */
5855 ACQUIRE_LOCK(&lk);
5856 indirdep = indirdep_lookup(mp, ip, bp);
5857 if (indirdep->ir_freeblks)
5858 panic("setup_trunc_indir: indirdep already truncated.");
5859 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
5860 freework->fw_indir = indirdep;
5861 /*
5862 * Cancel any allocindirs that will not make it to disk.
5863 * We have to do this for all copies of the indirdep that
5864 * live on this newblk.
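 * When DEPCOMPLETE is clear the indirdep is still linked to its newblk,
 * so every copy found on the newblk's nb_indirdeps list is truncated
 * below; otherwise only this indirdep needs to be handled.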
5865 */ 5866 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 5867 newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); 5868 LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) 5869 trunc_indirdep(indirn, freeblks, bp, off); 5870 } else 5871 trunc_indirdep(indirdep, freeblks, bp, off); 5872 FREE_LOCK(&lk); 5873 /* 5874 * Creation is protected by the buf lock. The saveddata is only 5875 * needed if a full truncation follows a partial truncation but it 5876 * is difficult to allocate in that case so we fetch it anyway. 5877 */ 5878 if (indirdep->ir_saveddata == NULL) 5879 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 5880 M_SOFTDEP_FLAGS); 5881 nowork: 5882 /* Fetch the blkno of the child and the zero start offset. */ 5883 if (ip->i_ump->um_fstype == UFS1) { 5884 blkno = ((ufs1_daddr_t *)bp->b_data)[off]; 5885 start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; 5886 } else { 5887 blkno = ((ufs2_daddr_t *)bp->b_data)[off]; 5888 start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; 5889 } 5890 if (freework) { 5891 /* Zero the truncated pointers. */ 5892 end = bp->b_data + bp->b_bcount; 5893 bzero(start, end - start); 5894 bdwrite(bp); 5895 } else 5896 bqrelse(bp); 5897 if (level == 0) 5898 return (0); 5899 lbn++; /* adjust level */ 5900 lbn -= (off * lbnadd); 5901 return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); 5902 } 5903 5904 /* 5905 * Complete the partial truncation of an indirect block setup by 5906 * setup_trunc_indir(). This zeros the truncated pointers in the saved 5907 * copy and writes them to disk before the freeblks is allowed to complete. 5908 */ 5909 static void 5910 complete_trunc_indir(freework) 5911 struct freework *freework; 5912 { 5913 struct freework *fwn; 5914 struct indirdep *indirdep; 5915 struct buf *bp; 5916 uintptr_t start; 5917 int count; 5918 5919 indirdep = freework->fw_indir; 5920 for (;;) { 5921 bp = indirdep->ir_bp; 5922 /* See if the block was discarded. */ 5923 if (bp == NULL) 5924 break; 5925 /* Inline part of getdirtybuf(). We dont want bremfree. */ 5926 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) 5927 break; 5928 if (BUF_LOCK(bp, 5929 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, &lk) == 0) 5930 BUF_UNLOCK(bp); 5931 ACQUIRE_LOCK(&lk); 5932 } 5933 mtx_assert(&lk, MA_OWNED); 5934 freework->fw_state |= DEPCOMPLETE; 5935 TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); 5936 /* 5937 * Zero the pointers in the saved copy. 5938 */ 5939 if (indirdep->ir_state & UFS1FMT) 5940 start = sizeof(ufs1_daddr_t); 5941 else 5942 start = sizeof(ufs2_daddr_t); 5943 start *= freework->fw_start; 5944 count = indirdep->ir_savebp->b_bcount - start; 5945 start += (uintptr_t)indirdep->ir_savebp->b_data; 5946 bzero((char *)start, count); 5947 /* 5948 * We need to start the next truncation in the list if it has not 5949 * been started yet. 5950 */ 5951 fwn = TAILQ_FIRST(&indirdep->ir_trunc); 5952 if (fwn != NULL) { 5953 if (fwn->fw_freeblks == indirdep->ir_freeblks) 5954 TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); 5955 if ((fwn->fw_state & ONWORKLIST) == 0) 5956 freework_enqueue(fwn); 5957 } 5958 /* 5959 * If bp is NULL the block was fully truncated, restore 5960 * the saved block list otherwise free it if it is no 5961 * longer needed. 
5962 */ 5963 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 5964 if (bp == NULL) 5965 bcopy(indirdep->ir_saveddata, 5966 indirdep->ir_savebp->b_data, 5967 indirdep->ir_savebp->b_bcount); 5968 free(indirdep->ir_saveddata, M_INDIRDEP); 5969 indirdep->ir_saveddata = NULL; 5970 } 5971 /* 5972 * When bp is NULL there is a full truncation pending. We 5973 * must wait for this full truncation to be journaled before 5974 * we can release this freework because the disk pointers will 5975 * never be written as zero. 5976 */ 5977 if (bp == NULL) { 5978 if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) 5979 handle_written_freework(freework); 5980 else 5981 WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, 5982 &freework->fw_list); 5983 } else { 5984 /* Complete when the real copy is written. */ 5985 WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); 5986 BUF_UNLOCK(bp); 5987 } 5988 } 5989 5990 /* 5991 * Calculate the number of blocks we are going to release where datablocks 5992 * is the current total and length is the new file size. 5993 */ 5994 ufs2_daddr_t 5995 blkcount(fs, datablocks, length) 5996 struct fs *fs; 5997 ufs2_daddr_t datablocks; 5998 off_t length; 5999 { 6000 off_t totblks, numblks; 6001 6002 totblks = 0; 6003 numblks = howmany(length, fs->fs_bsize); 6004 if (numblks <= NDADDR) { 6005 totblks = howmany(length, fs->fs_fsize); 6006 goto out; 6007 } 6008 totblks = blkstofrags(fs, numblks); 6009 numblks -= NDADDR; 6010 /* 6011 * Count all single, then double, then triple indirects required. 6012 * Subtracting one indirects worth of blocks for each pass 6013 * acknowledges one of each pointed to by the inode. 6014 */ 6015 for (;;) { 6016 totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); 6017 numblks -= NINDIR(fs); 6018 if (numblks <= 0) 6019 break; 6020 numblks = howmany(numblks, NINDIR(fs)); 6021 } 6022 out: 6023 totblks = fsbtodb(fs, totblks); 6024 /* 6025 * Handle sparse files. We can't reclaim more blocks than the inode 6026 * references. We will correct it later in handle_complete_freeblks() 6027 * when we know the real count. 6028 */ 6029 if (totblks > datablocks) 6030 return (0); 6031 return (datablocks - totblks); 6032 } 6033 6034 /* 6035 * Handle freeblocks for journaled softupdate filesystems. 6036 * 6037 * Contrary to normal softupdates, we must preserve the block pointers in 6038 * indirects until their subordinates are free. This is to avoid journaling 6039 * every block that is freed which may consume more space than the journal 6040 * itself. The recovery program will see the free block journals at the 6041 * base of the truncated area and traverse them to reclaim space. The 6042 * pointers in the inode may be cleared immediately after the journal 6043 * records are written because each direct and indirect pointer in the 6044 * inode is recorded in a journal. This permits full truncation to proceed 6045 * asynchronously. The write order is journal -> inode -> cgs -> indirects. 6046 * 6047 * The algorithm is as follows: 6048 * 1) Traverse the in-memory state and create journal entries to release 6049 * the relevant blocks and full indirect trees. 6050 * 2) Traverse the indirect block chain adding partial truncation freework 6051 * records to indirects in the path to lastlbn. The freework will 6052 * prevent new allocation dependencies from being satisfied in this 6053 * indirect until the truncation completes. 6054 * 3) Read and lock the inode block, performing an update with the new size 6055 * and pointers. 
This prevents truncated data from becoming valid on 6056 * disk through step 4. 6057 * 4) Reap unsatisfied dependencies that are beyond the truncated area, 6058 * eliminate journal work for those records that do not require it. 6059 * 5) Schedule the journal records to be written followed by the inode block. 6060 * 6) Allocate any necessary frags for the end of file. 6061 * 7) Zero any partially truncated blocks. 6062 * 6063 * From this truncation proceeds asynchronously using the freework and 6064 * indir_trunc machinery. The file will not be extended again into a 6065 * partially truncated indirect block until all work is completed but 6066 * the normal dependency mechanism ensures that it is rolled back/forward 6067 * as appropriate. Further truncation may occur without delay and is 6068 * serialized in indir_trunc(). 6069 */ 6070 void 6071 softdep_journal_freeblocks(ip, cred, length, flags) 6072 struct inode *ip; /* The inode whose length is to be reduced */ 6073 struct ucred *cred; 6074 off_t length; /* The new length for the file */ 6075 int flags; /* IO_EXT and/or IO_NORMAL */ 6076 { 6077 struct freeblks *freeblks, *fbn; 6078 struct inodedep *inodedep; 6079 struct jblkdep *jblkdep; 6080 struct allocdirect *adp, *adpn; 6081 struct fs *fs; 6082 struct buf *bp; 6083 struct vnode *vp; 6084 struct mount *mp; 6085 ufs2_daddr_t extblocks, datablocks; 6086 ufs_lbn_t tmpval, lbn, lastlbn; 6087 int frags; 6088 int lastoff, iboff; 6089 int allocblock; 6090 int error, i; 6091 int needj; 6092 6093 fs = ip->i_fs; 6094 mp = UFSTOVFS(ip->i_ump); 6095 vp = ITOV(ip); 6096 needj = 1; 6097 iboff = -1; 6098 allocblock = 0; 6099 extblocks = 0; 6100 datablocks = 0; 6101 frags = 0; 6102 freeblks = newfreeblks(mp, ip); 6103 ACQUIRE_LOCK(&lk); 6104 /* 6105 * If we're truncating a removed file that will never be written 6106 * we don't need to journal the block frees. The canceled journals 6107 * for the allocations will suffice. 6108 */ 6109 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6110 if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && 6111 length == 0) 6112 needj = 0; 6113 FREE_LOCK(&lk); 6114 /* 6115 * Calculate the lbn that we are truncating to. This results in -1 6116 * if we're truncating the 0 bytes. So it is the last lbn we want 6117 * to keep, not the first lbn we want to truncate. 6118 */ 6119 lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; 6120 lastoff = blkoff(fs, length); 6121 /* 6122 * Compute frags we are keeping in lastlbn. 0 means all. 6123 */ 6124 if (lastlbn >= 0 && lastlbn < NDADDR) { 6125 frags = fragroundup(fs, lastoff); 6126 /* adp offset of last valid allocdirect. */ 6127 iboff = lastlbn; 6128 } else if (lastlbn > 0) 6129 iboff = NDADDR; 6130 if (fs->fs_magic == FS_UFS2_MAGIC) 6131 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); 6132 /* 6133 * Handle normal data blocks and indirects. This section saves 6134 * values used after the inode update to complete frag and indirect 6135 * truncation. 6136 */ 6137 if ((flags & IO_NORMAL) != 0) { 6138 /* 6139 * Handle truncation of whole direct and indirect blocks. 6140 */ 6141 for (i = iboff + 1; i < NDADDR; i++) 6142 setup_freedirect(freeblks, ip, i, needj); 6143 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; 6144 i++, lbn += tmpval, tmpval *= NINDIR(fs)) { 6145 /* Release a whole indirect tree. 
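Indirect trees are keyed by negative lbns (-lbn - i below); for example, assuming NDADDR == 12 and NINDIR(fs) == 2048 (a UFS2 filesystem with 16K blocks), the single indirect is keyed -12 and the double indirect -(12 + 2048) - 1 == -2061.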
*/ 6146 if (lbn > lastlbn) { 6147 setup_freeindir(freeblks, ip, i, -lbn -i, 6148 needj); 6149 continue; 6150 } 6151 iboff = i + NDADDR; 6152 /* 6153 * Traverse partially truncated indirect tree. 6154 */ 6155 if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) 6156 setup_trunc_indir(freeblks, ip, -lbn - i, 6157 lastlbn, DIP(ip, i_ib[i])); 6158 } 6159 /* 6160 * Handle partial truncation to a frag boundary. 6161 */ 6162 if (frags) { 6163 ufs2_daddr_t blkno; 6164 long oldfrags; 6165 6166 oldfrags = blksize(fs, ip, lastlbn); 6167 blkno = DIP(ip, i_db[lastlbn]); 6168 if (blkno && oldfrags != frags) { 6169 oldfrags -= frags; 6170 oldfrags = numfrags(ip->i_fs, oldfrags); 6171 blkno += numfrags(ip->i_fs, frags); 6172 newfreework(ip->i_ump, freeblks, NULL, lastlbn, 6173 blkno, oldfrags, 0, needj); 6174 } else if (blkno == 0) 6175 allocblock = 1; 6176 } 6177 /* 6178 * Add a journal record for partial truncate if we are 6179 * handling indirect blocks. Non-indirects need no extra 6180 * journaling. 6181 */ 6182 if (length != 0 && lastlbn >= NDADDR) { 6183 ip->i_flag |= IN_TRUNCATED; 6184 newjtrunc(freeblks, length, 0); 6185 } 6186 ip->i_size = length; 6187 DIP_SET(ip, i_size, ip->i_size); 6188 datablocks = DIP(ip, i_blocks) - extblocks; 6189 if (length != 0) 6190 datablocks = blkcount(ip->i_fs, datablocks, length); 6191 freeblks->fb_len = length; 6192 } 6193 if ((flags & IO_EXT) != 0) { 6194 for (i = 0; i < NXADDR; i++) 6195 setup_freeext(freeblks, ip, i, needj); 6196 ip->i_din2->di_extsize = 0; 6197 datablocks += extblocks; 6198 } 6199 #ifdef QUOTA 6200 /* Reference the quotas in case the block count is wrong in the end. */ 6201 quotaref(vp, freeblks->fb_quota); 6202 (void) chkdq(ip, -datablocks, NOCRED, 0); 6203 #endif 6204 freeblks->fb_chkcnt = -datablocks; 6205 UFS_LOCK(ip->i_ump); 6206 fs->fs_pendingblocks += datablocks; 6207 UFS_UNLOCK(ip->i_ump); 6208 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); 6209 /* 6210 * Handle truncation of incomplete alloc direct dependencies. We 6211 * hold the inode block locked to prevent incomplete dependencies 6212 * from reaching the disk while we are eliminating those that 6213 * have been truncated. This is a partially inlined ffs_update(). 6214 */ 6215 ufs_itimes(vp); 6216 ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); 6217 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6218 (int)fs->fs_bsize, cred, &bp); 6219 if (error) { 6220 brelse(bp); 6221 softdep_error("softdep_journal_freeblocks", error); 6222 return; 6223 } 6224 if (bp->b_bufsize == fs->fs_bsize) 6225 bp->b_flags |= B_CLUSTEROK; 6226 softdep_update_inodeblock(ip, bp, 0); 6227 if (ip->i_ump->um_fstype == UFS1) 6228 *((struct ufs1_dinode *)bp->b_data + 6229 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; 6230 else 6231 *((struct ufs2_dinode *)bp->b_data + 6232 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; 6233 ACQUIRE_LOCK(&lk); 6234 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6235 if ((inodedep->id_state & IOSTARTED) != 0) 6236 panic("softdep_setup_freeblocks: inode busy"); 6237 /* 6238 * Add the freeblks structure to the list of operations that 6239 * must await the zero'ed inode being written to disk. If we 6240 * still have a bitmap dependency (needj), then the inode 6241 * has never been written to disk, so we can process the 6242 * freeblks below once we have deleted the dependencies. 
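 * (When needj is zero the canceled journal entries left from the original
 * allocations already cover these frees, so the freeblks need not wait on
 * this buffer and is marked COMPLETE immediately.)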
6243 */ 6244 if (needj) 6245 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6246 else 6247 freeblks->fb_state |= COMPLETE; 6248 if ((flags & IO_NORMAL) != 0) { 6249 TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { 6250 if (adp->ad_offset > iboff) 6251 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6252 freeblks); 6253 /* 6254 * Truncate the allocdirect. We could eliminate 6255 * or modify journal records as well. 6256 */ 6257 else if (adp->ad_offset == iboff && frags) 6258 adp->ad_newsize = frags; 6259 } 6260 } 6261 if ((flags & IO_EXT) != 0) 6262 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6263 cancel_allocdirect(&inodedep->id_extupdt, adp, 6264 freeblks); 6265 /* 6266 * Add journal work. 6267 */ 6268 LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) 6269 add_to_journal(&jblkdep->jb_list); 6270 FREE_LOCK(&lk); 6271 bdwrite(bp); 6272 /* 6273 * Truncate dependency structures beyond length. 6274 */ 6275 trunc_dependencies(ip, freeblks, lastlbn, frags, flags); 6276 /* 6277 * This is only set when we need to allocate a fragment because 6278 * none existed at the end of a frag-sized file. It handles only 6279 * allocating a new, zero filled block. 6280 */ 6281 if (allocblock) { 6282 ip->i_size = length - lastoff; 6283 DIP_SET(ip, i_size, ip->i_size); 6284 error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); 6285 if (error != 0) { 6286 softdep_error("softdep_journal_freeblks", error); 6287 return; 6288 } 6289 ip->i_size = length; 6290 DIP_SET(ip, i_size, length); 6291 ip->i_flag |= IN_CHANGE | IN_UPDATE; 6292 allocbuf(bp, frags); 6293 ffs_update(vp, MNT_NOWAIT); 6294 bawrite(bp); 6295 } else if (lastoff != 0 && vp->v_type != VDIR) { 6296 int size; 6297 6298 /* 6299 * Zero the end of a truncated frag or block. 6300 */ 6301 size = sblksize(fs, length, lastlbn); 6302 error = bread(vp, lastlbn, size, cred, &bp); 6303 if (error) { 6304 softdep_error("softdep_journal_freeblks", error); 6305 return; 6306 } 6307 bzero((char *)bp->b_data + lastoff, size - lastoff); 6308 bawrite(bp); 6309 6310 } 6311 ACQUIRE_LOCK(&lk); 6312 inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6313 TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); 6314 freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; 6315 /* 6316 * We zero earlier truncations so they don't erroneously 6317 * update i_blocks. 6318 */ 6319 if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) 6320 TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) 6321 fbn->fb_len = 0; 6322 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && 6323 LIST_EMPTY(&freeblks->fb_jblkdephd)) 6324 freeblks->fb_state |= INPROGRESS; 6325 else 6326 freeblks = NULL; 6327 FREE_LOCK(&lk); 6328 if (freeblks) 6329 handle_workitem_freeblocks(freeblks, 0); 6330 trunc_pages(ip, length, extblocks, flags); 6331 6332 } 6333 6334 /* 6335 * Flush a JOP_SYNC to the journal. 6336 */ 6337 void 6338 softdep_journal_fsync(ip) 6339 struct inode *ip; 6340 { 6341 struct jfsync *jfsync; 6342 6343 if ((ip->i_flag & IN_TRUNCATED) == 0) 6344 return; 6345 ip->i_flag &= ~IN_TRUNCATED; 6346 jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); 6347 workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); 6348 jfsync->jfs_size = ip->i_size; 6349 jfsync->jfs_ino = ip->i_number; 6350 ACQUIRE_LOCK(&lk); 6351 add_to_journal(&jfsync->jfs_list); 6352 jwait(&jfsync->jfs_list, MNT_WAIT); 6353 FREE_LOCK(&lk); 6354 } 6355 6356 /* 6357 * Block de-allocation dependencies. 
6358 *
6359 * When blocks are de-allocated, the on-disk pointers must be nullified before
6360 * the blocks are made available for use by other files. (The true
6361 * requirement is that old pointers must be nullified before new on-disk
6362 * pointers are set. We chose this slightly more stringent requirement to
6363 * reduce complexity.) Our implementation handles this dependency by updating
6364 * the inode (or indirect block) appropriately but delaying the actual block
6365 * de-allocation (i.e., freemap and free space count manipulation) until
6366 * after the updated versions reach stable storage. After the disk is
6367 * updated, the blocks can be safely de-allocated whenever it is convenient.
6368 * This implementation handles only the common case of reducing a file's
6369 * length to zero. Other cases are handled by the conventional synchronous
6370 * write approach.
6371 *
6372 * The ffs implementation with which we worked double-checks
6373 * the state of the block pointers and file size as it reduces
6374 * a file's length. Some of this code is replicated here in our
6375 * soft updates implementation. The freeblks->fb_chkcnt field is
6376 * used to transfer a part of this information to the procedure
6377 * that eventually de-allocates the blocks.
6378 *
6379 * This routine should be called from the routine that shortens
6380 * a file's length, before the inode's size or block pointers
6381 * are modified. It will save the block pointer information for
6382 * later release and zero the inode so that the calling routine
6383 * can release it.
6384 */
6385 void
6386 softdep_setup_freeblocks(ip, length, flags)
6387 struct inode *ip; /* The inode whose length is to be reduced */
6388 off_t length; /* The new length for the file */
6389 int flags; /* IO_EXT and/or IO_NORMAL */
6390 {
6391 struct ufs1_dinode *dp1;
6392 struct ufs2_dinode *dp2;
6393 struct freeblks *freeblks;
6394 struct inodedep *inodedep;
6395 struct allocdirect *adp;
6396 struct buf *bp;
6397 struct fs *fs;
6398 ufs2_daddr_t extblocks, datablocks;
6399 struct mount *mp;
6400 int i, delay, error;
6401 ufs_lbn_t tmpval;
6402 ufs_lbn_t lbn;
6403
6404 fs = ip->i_fs;
6405 mp = UFSTOVFS(ip->i_ump);
6406 if (length != 0)
6407 panic("softdep_setup_freeblocks: non-zero length");
6408 freeblks = newfreeblks(mp, ip);
6409 extblocks = 0;
6410 datablocks = 0;
6411 if (fs->fs_magic == FS_UFS2_MAGIC)
6412 extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6413 if ((flags & IO_NORMAL) != 0) {
6414 for (i = 0; i < NDADDR; i++)
6415 setup_freedirect(freeblks, ip, i, 0);
6416 for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6417 i++, lbn += tmpval, tmpval *= NINDIR(fs))
6418 setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6419 ip->i_size = 0;
6420 DIP_SET(ip, i_size, 0);
6421 datablocks = DIP(ip, i_blocks) - extblocks;
6422 }
6423 if ((flags & IO_EXT) != 0) {
6424 for (i = 0; i < NXADDR; i++)
6425 setup_freeext(freeblks, ip, i, 0);
6426 ip->i_din2->di_extsize = 0;
6427 datablocks += extblocks;
6428 }
6429 #ifdef QUOTA
6430 /* Reference the quotas in case the block count is wrong in the end. */
6431 quotaref(ITOV(ip), freeblks->fb_quota);
6432 (void) chkdq(ip, -datablocks, NOCRED, 0);
6433 #endif
6434 freeblks->fb_chkcnt = -datablocks;
6435 UFS_LOCK(ip->i_ump);
6436 fs->fs_pendingblocks += datablocks;
6437 UFS_UNLOCK(ip->i_ump);
6438 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6439 /*
6440 * Push the zero'ed inode to its disk buffer so that we are free
6441 * to delete its dependencies below.
Once the dependencies are gone 6442 * the buffer can be safely released. 6443 */ 6444 if ((error = bread(ip->i_devvp, 6445 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 6446 (int)fs->fs_bsize, NOCRED, &bp)) != 0) { 6447 brelse(bp); 6448 softdep_error("softdep_setup_freeblocks", error); 6449 } 6450 if (ip->i_ump->um_fstype == UFS1) { 6451 dp1 = ((struct ufs1_dinode *)bp->b_data + 6452 ino_to_fsbo(fs, ip->i_number)); 6453 ip->i_din1->di_freelink = dp1->di_freelink; 6454 *dp1 = *ip->i_din1; 6455 } else { 6456 dp2 = ((struct ufs2_dinode *)bp->b_data + 6457 ino_to_fsbo(fs, ip->i_number)); 6458 ip->i_din2->di_freelink = dp2->di_freelink; 6459 *dp2 = *ip->i_din2; 6460 } 6461 /* 6462 * Find and eliminate any inode dependencies. 6463 */ 6464 ACQUIRE_LOCK(&lk); 6465 (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); 6466 if ((inodedep->id_state & IOSTARTED) != 0) 6467 panic("softdep_setup_freeblocks: inode busy"); 6468 /* 6469 * Add the freeblks structure to the list of operations that 6470 * must await the zero'ed inode being written to disk. If we 6471 * still have a bitmap dependency (delay == 0), then the inode 6472 * has never been written to disk, so we can process the 6473 * freeblks below once we have deleted the dependencies. 6474 */ 6475 delay = (inodedep->id_state & DEPCOMPLETE); 6476 if (delay) 6477 WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); 6478 else 6479 freeblks->fb_state |= COMPLETE; 6480 /* 6481 * Because the file length has been truncated to zero, any 6482 * pending block allocation dependency structures associated 6483 * with this inode are obsolete and can simply be de-allocated. 6484 * We must first merge the two dependency lists to get rid of 6485 * any duplicate freefrag structures, then purge the merged list. 6486 * If we still have a bitmap dependency, then the inode has never 6487 * been written to disk, so we can free any fragments without delay. 6488 */ 6489 if (flags & IO_NORMAL) { 6490 merge_inode_lists(&inodedep->id_newinoupdt, 6491 &inodedep->id_inoupdt); 6492 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) 6493 cancel_allocdirect(&inodedep->id_inoupdt, adp, 6494 freeblks); 6495 } 6496 if (flags & IO_EXT) { 6497 merge_inode_lists(&inodedep->id_newextupdt, 6498 &inodedep->id_extupdt); 6499 while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) 6500 cancel_allocdirect(&inodedep->id_extupdt, adp, 6501 freeblks); 6502 } 6503 FREE_LOCK(&lk); 6504 bdwrite(bp); 6505 trunc_dependencies(ip, freeblks, -1, 0, flags); 6506 ACQUIRE_LOCK(&lk); 6507 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) 6508 (void) free_inodedep(inodedep); 6509 freeblks->fb_state |= DEPCOMPLETE; 6510 /* 6511 * If the inode with zeroed block pointers is now on disk 6512 * we can start freeing blocks. 6513 */ 6514 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) 6515 freeblks->fb_state |= INPROGRESS; 6516 else 6517 freeblks = NULL; 6518 FREE_LOCK(&lk); 6519 if (freeblks) 6520 handle_workitem_freeblocks(freeblks, 0); 6521 trunc_pages(ip, length, extblocks, flags); 6522 } 6523 6524 /* 6525 * Eliminate pages from the page cache that back parts of this inode and 6526 * adjust the vnode pager's idea of our size. This prevents stale data 6527 * from hanging around in the page cache. 
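 * Note that the extended attribute area is addressed with negative block
 * offsets, which is why the IO_EXT case below removes pages starting at
 * lblktosize(fs, -extblocks).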
6528 */ 6529 static void 6530 trunc_pages(ip, length, extblocks, flags) 6531 struct inode *ip; 6532 off_t length; 6533 ufs2_daddr_t extblocks; 6534 int flags; 6535 { 6536 struct vnode *vp; 6537 struct fs *fs; 6538 ufs_lbn_t lbn; 6539 off_t end, extend; 6540 6541 vp = ITOV(ip); 6542 fs = ip->i_fs; 6543 extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); 6544 if ((flags & IO_EXT) != 0) 6545 vn_pages_remove(vp, extend, 0); 6546 if ((flags & IO_NORMAL) == 0) 6547 return; 6548 BO_LOCK(&vp->v_bufobj); 6549 drain_output(vp); 6550 BO_UNLOCK(&vp->v_bufobj); 6551 /* 6552 * The vnode pager eliminates file pages we eliminate indirects 6553 * below. 6554 */ 6555 vnode_pager_setsize(vp, length); 6556 /* 6557 * Calculate the end based on the last indirect we want to keep. If 6558 * the block extends into indirects we can just use the negative of 6559 * its lbn. Doubles and triples exist at lower numbers so we must 6560 * be careful not to remove those, if they exist. double and triple 6561 * indirect lbns do not overlap with others so it is not important 6562 * to verify how many levels are required. 6563 */ 6564 lbn = lblkno(fs, length); 6565 if (lbn >= NDADDR) { 6566 /* Calculate the virtual lbn of the triple indirect. */ 6567 lbn = -lbn - (NIADDR - 1); 6568 end = OFF_TO_IDX(lblktosize(fs, lbn)); 6569 } else 6570 end = extend; 6571 vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); 6572 } 6573 6574 /* 6575 * See if the buf bp is in the range eliminated by truncation. 6576 */ 6577 static int 6578 trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) 6579 struct buf *bp; 6580 int *blkoffp; 6581 ufs_lbn_t lastlbn; 6582 int lastoff; 6583 int flags; 6584 { 6585 ufs_lbn_t lbn; 6586 6587 *blkoffp = 0; 6588 /* Only match ext/normal blocks as appropriate. */ 6589 if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || 6590 ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) 6591 return (0); 6592 /* ALTDATA is always a full truncation. */ 6593 if ((bp->b_xflags & BX_ALTDATA) != 0) 6594 return (1); 6595 /* -1 is full truncation. */ 6596 if (lastlbn == -1) 6597 return (1); 6598 /* 6599 * If this is a partial truncate we only want those 6600 * blocks and indirect blocks that cover the range 6601 * we're after. 6602 */ 6603 lbn = bp->b_lblkno; 6604 if (lbn < 0) 6605 lbn = -(lbn + lbn_level(lbn)); 6606 if (lbn < lastlbn) 6607 return (0); 6608 /* Here we only truncate lblkno if it's partial. */ 6609 if (lbn == lastlbn) { 6610 if (lastoff == 0) 6611 return (0); 6612 *blkoffp = lastoff; 6613 } 6614 return (1); 6615 } 6616 6617 /* 6618 * Eliminate any dependencies that exist in memory beyond lblkno:off 6619 */ 6620 static void 6621 trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) 6622 struct inode *ip; 6623 struct freeblks *freeblks; 6624 ufs_lbn_t lastlbn; 6625 int lastoff; 6626 int flags; 6627 { 6628 struct bufobj *bo; 6629 struct vnode *vp; 6630 struct buf *bp; 6631 struct fs *fs; 6632 int blkoff; 6633 6634 /* 6635 * We must wait for any I/O in progress to finish so that 6636 * all potential buffers on the dirty list will be visible. 6637 * Once they are all there, walk the list and get rid of 6638 * any dependencies. 
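 * Each buffer is flagged BV_SCANNED as it is visited so that the scan can
 * safely restart from the head of the dirty list whenever the bufobj lock
 * has to be dropped.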
6639 */ 6640 fs = ip->i_fs; 6641 vp = ITOV(ip); 6642 bo = &vp->v_bufobj; 6643 BO_LOCK(bo); 6644 drain_output(vp); 6645 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) 6646 bp->b_vflags &= ~BV_SCANNED; 6647 restart: 6648 TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { 6649 if (bp->b_vflags & BV_SCANNED) 6650 continue; 6651 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6652 bp->b_vflags |= BV_SCANNED; 6653 continue; 6654 } 6655 if ((bp = getdirtybuf(bp, BO_MTX(bo), MNT_WAIT)) == NULL) 6656 goto restart; 6657 BO_UNLOCK(bo); 6658 if (deallocate_dependencies(bp, freeblks, blkoff)) 6659 bqrelse(bp); 6660 else 6661 brelse(bp); 6662 BO_LOCK(bo); 6663 goto restart; 6664 } 6665 /* 6666 * Now do the work of vtruncbuf while also matching indirect blocks. 6667 */ 6668 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) 6669 bp->b_vflags &= ~BV_SCANNED; 6670 cleanrestart: 6671 TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { 6672 if (bp->b_vflags & BV_SCANNED) 6673 continue; 6674 if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { 6675 bp->b_vflags |= BV_SCANNED; 6676 continue; 6677 } 6678 if (BUF_LOCK(bp, 6679 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 6680 BO_MTX(bo)) == ENOLCK) { 6681 BO_LOCK(bo); 6682 goto cleanrestart; 6683 } 6684 bp->b_vflags |= BV_SCANNED; 6685 BO_LOCK(bo); 6686 bremfree(bp); 6687 BO_UNLOCK(bo); 6688 if (blkoff != 0) { 6689 allocbuf(bp, blkoff); 6690 bqrelse(bp); 6691 } else { 6692 bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; 6693 brelse(bp); 6694 } 6695 BO_LOCK(bo); 6696 goto cleanrestart; 6697 } 6698 drain_output(vp); 6699 BO_UNLOCK(bo); 6700 } 6701 6702 static int 6703 cancel_pagedep(pagedep, freeblks, blkoff) 6704 struct pagedep *pagedep; 6705 struct freeblks *freeblks; 6706 int blkoff; 6707 { 6708 struct jremref *jremref; 6709 struct jmvref *jmvref; 6710 struct dirrem *dirrem, *tmp; 6711 int i; 6712 6713 /* 6714 * Copy any directory remove dependencies to the list 6715 * to be processed after the freeblks proceeds. If 6716 * directory entry never made it to disk they 6717 * can be dumped directly onto the work list. 6718 */ 6719 LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { 6720 /* Skip this directory removal if it is intended to remain. */ 6721 if (dirrem->dm_offset < blkoff) 6722 continue; 6723 /* 6724 * If there are any dirrems we wait for the journal write 6725 * to complete and then restart the buf scan as the lock 6726 * has been dropped. 6727 */ 6728 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { 6729 jwait(&jremref->jr_list, MNT_WAIT); 6730 return (ERESTART); 6731 } 6732 LIST_REMOVE(dirrem, dm_next); 6733 dirrem->dm_dirinum = pagedep->pd_ino; 6734 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); 6735 } 6736 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { 6737 jwait(&jmvref->jm_list, MNT_WAIT); 6738 return (ERESTART); 6739 } 6740 /* 6741 * When we're partially truncating a pagedep we just want to flush 6742 * journal entries and return. There can not be any adds in the 6743 * truncated portion of the directory and newblk must remain if 6744 * part of the block remains. 
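 * The loops below merely assert that no diradd has been recorded beyond
 * blkoff before returning.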
6745 */ 6746 if (blkoff != 0) { 6747 struct diradd *dap; 6748 6749 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 6750 if (dap->da_offset > blkoff) 6751 panic("cancel_pagedep: diradd %p off %d > %d", 6752 dap, dap->da_offset, blkoff); 6753 for (i = 0; i < DAHASHSZ; i++) 6754 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) 6755 if (dap->da_offset > blkoff) 6756 panic("cancel_pagedep: diradd %p off %d > %d", 6757 dap, dap->da_offset, blkoff); 6758 return (0); 6759 } 6760 /* 6761 * There should be no directory add dependencies present 6762 * as the directory could not be truncated until all 6763 * children were removed. 6764 */ 6765 KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, 6766 ("deallocate_dependencies: pendinghd != NULL")); 6767 for (i = 0; i < DAHASHSZ; i++) 6768 KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, 6769 ("deallocate_dependencies: diraddhd != NULL")); 6770 if ((pagedep->pd_state & NEWBLOCK) != 0) 6771 free_newdirblk(pagedep->pd_newdirblk); 6772 if (free_pagedep(pagedep) == 0) 6773 panic("Failed to free pagedep %p", pagedep); 6774 return (0); 6775 } 6776 6777 /* 6778 * Reclaim any dependency structures from a buffer that is about to 6779 * be reallocated to a new vnode. The buffer must be locked, thus, 6780 * no I/O completion operations can occur while we are manipulating 6781 * its associated dependencies. The mutex is held so that other I/O's 6782 * associated with related dependencies do not occur. 6783 */ 6784 static int 6785 deallocate_dependencies(bp, freeblks, off) 6786 struct buf *bp; 6787 struct freeblks *freeblks; 6788 int off; 6789 { 6790 struct indirdep *indirdep; 6791 struct pagedep *pagedep; 6792 struct allocdirect *adp; 6793 struct worklist *wk, *wkn; 6794 6795 ACQUIRE_LOCK(&lk); 6796 LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { 6797 switch (wk->wk_type) { 6798 case D_INDIRDEP: 6799 indirdep = WK_INDIRDEP(wk); 6800 if (bp->b_lblkno >= 0 || 6801 bp->b_blkno != indirdep->ir_savebp->b_lblkno) 6802 panic("deallocate_dependencies: not indir"); 6803 cancel_indirdep(indirdep, bp, freeblks); 6804 continue; 6805 6806 case D_PAGEDEP: 6807 pagedep = WK_PAGEDEP(wk); 6808 if (cancel_pagedep(pagedep, freeblks, off)) { 6809 FREE_LOCK(&lk); 6810 return (ERESTART); 6811 } 6812 continue; 6813 6814 case D_ALLOCINDIR: 6815 /* 6816 * Simply remove the allocindir, we'll find it via 6817 * the indirdep where we can clear pointers if 6818 * needed. 6819 */ 6820 WORKLIST_REMOVE(wk); 6821 continue; 6822 6823 case D_FREEWORK: 6824 /* 6825 * A truncation is waiting for the zero'd pointers 6826 * to be written. It can be freed when the freeblks 6827 * is journaled. 6828 */ 6829 WORKLIST_REMOVE(wk); 6830 wk->wk_state |= ONDEPLIST; 6831 WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); 6832 break; 6833 6834 case D_ALLOCDIRECT: 6835 adp = WK_ALLOCDIRECT(wk); 6836 if (off != 0) 6837 continue; 6838 /* FALLTHROUGH */ 6839 default: 6840 panic("deallocate_dependencies: Unexpected type %s", 6841 TYPENAME(wk->wk_type)); 6842 /* NOTREACHED */ 6843 } 6844 } 6845 FREE_LOCK(&lk); 6846 /* 6847 * Don't throw away this buf, we were partially truncating and 6848 * some deps may always remain. 6849 */ 6850 if (off) { 6851 allocbuf(bp, off); 6852 bp->b_vflags |= BV_SCANNED; 6853 return (EBUSY); 6854 } 6855 bp->b_flags |= B_INVAL | B_NOCACHE; 6856 6857 return (0); 6858 } 6859 6860 /* 6861 * An allocdirect is being canceled due to a truncate. We must make sure 6862 * the journal entry is released in concert with the blkfree that releases 6863 * the storage. 
Completed journal entries must not be released until the 6864 * space is no longer pointed to by the inode or in the bitmap. 6865 */ 6866 static void 6867 cancel_allocdirect(adphead, adp, freeblks) 6868 struct allocdirectlst *adphead; 6869 struct allocdirect *adp; 6870 struct freeblks *freeblks; 6871 { 6872 struct freework *freework; 6873 struct newblk *newblk; 6874 struct worklist *wk; 6875 6876 TAILQ_REMOVE(adphead, adp, ad_next); 6877 newblk = (struct newblk *)adp; 6878 freework = NULL; 6879 /* 6880 * Find the correct freework structure. 6881 */ 6882 LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { 6883 if (wk->wk_type != D_FREEWORK) 6884 continue; 6885 freework = WK_FREEWORK(wk); 6886 if (freework->fw_blkno == newblk->nb_newblkno) 6887 break; 6888 } 6889 if (freework == NULL) 6890 panic("cancel_allocdirect: Freework not found"); 6891 /* 6892 * If a newblk exists at all we still have the journal entry that 6893 * initiated the allocation so we do not need to journal the free. 6894 */ 6895 cancel_jfreeblk(freeblks, freework->fw_blkno); 6896 /* 6897 * If the journal hasn't been written the jnewblk must be passed 6898 * to the call to ffs_blkfree that reclaims the space. We accomplish 6899 * this by linking the journal dependency into the freework to be 6900 * freed when freework_freeblock() is called. If the journal has 6901 * been written we can simply reclaim the journal space when the 6902 * freeblks work is complete. 6903 */ 6904 freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, 6905 &freeblks->fb_jwork); 6906 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 6907 } 6908 6909 6910 /* 6911 * Cancel a new block allocation. May be an indirect or direct block. We 6912 * remove it from various lists and return any journal record that needs to 6913 * be resolved by the caller. 6914 * 6915 * A special consideration is made for indirects which were never pointed 6916 * at on disk and will never be found once this block is released. 6917 */ 6918 static struct jnewblk * 6919 cancel_newblk(newblk, wk, wkhd) 6920 struct newblk *newblk; 6921 struct worklist *wk; 6922 struct workhead *wkhd; 6923 { 6924 struct jnewblk *jnewblk; 6925 6926 newblk->nb_state |= GOINGAWAY; 6927 /* 6928 * Previously we traversed the completedhd on each indirdep 6929 * attached to this newblk to cancel them and gather journal 6930 * work. Since we need only the oldest journal segment and 6931 * the lowest point on the tree will always have the oldest 6932 * journal segment we are free to release the segments 6933 * of any subordinates and may leave the indirdep list to 6934 * indirdep_complete() when this newblk is freed. 6935 */ 6936 if (newblk->nb_state & ONDEPLIST) { 6937 newblk->nb_state &= ~ONDEPLIST; 6938 LIST_REMOVE(newblk, nb_deps); 6939 } 6940 if (newblk->nb_state & ONWORKLIST) 6941 WORKLIST_REMOVE(&newblk->nb_list); 6942 /* 6943 * If the journal entry hasn't been written we save a pointer to 6944 * the dependency that frees it until it is written or the 6945 * superseding operation completes. 6946 */ 6947 jnewblk = newblk->nb_jnewblk; 6948 if (jnewblk != NULL && wk != NULL) { 6949 newblk->nb_jnewblk = NULL; 6950 jnewblk->jn_dep = wk; 6951 } 6952 if (!LIST_EMPTY(&newblk->nb_jwork)) 6953 jwork_move(wkhd, &newblk->nb_jwork); 6954 /* 6955 * When truncating we must free the newdirblk early to remove 6956 * the pagedep from the hash before returning. 
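 * At most one newdirblk is expected on nb_newdirblk; anything more is
 * treated as a corrupted dependency list.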
6957 */ 6958 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 6959 free_newdirblk(WK_NEWDIRBLK(wk)); 6960 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 6961 panic("cancel_newblk: extra newdirblk"); 6962 6963 return (jnewblk); 6964 } 6965 6966 /* 6967 * Schedule the freefrag associated with a newblk to be released once 6968 * the pointers are written and the previous block is no longer needed. 6969 */ 6970 static void 6971 newblk_freefrag(newblk) 6972 struct newblk *newblk; 6973 { 6974 struct freefrag *freefrag; 6975 6976 if (newblk->nb_freefrag == NULL) 6977 return; 6978 freefrag = newblk->nb_freefrag; 6979 newblk->nb_freefrag = NULL; 6980 freefrag->ff_state |= COMPLETE; 6981 if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) 6982 add_to_worklist(&freefrag->ff_list, 0); 6983 } 6984 6985 /* 6986 * Free a newblk. Generate a new freefrag work request if appropriate. 6987 * This must be called after the inode pointer and any direct block pointers 6988 * are valid or fully removed via truncate or frag extension. 6989 */ 6990 static void 6991 free_newblk(newblk) 6992 struct newblk *newblk; 6993 { 6994 struct indirdep *indirdep; 6995 struct worklist *wk; 6996 6997 KASSERT(newblk->nb_jnewblk == NULL, 6998 ("free_newblk; jnewblk %p still attached", newblk->nb_jnewblk)); 6999 mtx_assert(&lk, MA_OWNED); 7000 newblk_freefrag(newblk); 7001 if (newblk->nb_state & ONDEPLIST) 7002 LIST_REMOVE(newblk, nb_deps); 7003 if (newblk->nb_state & ONWORKLIST) 7004 WORKLIST_REMOVE(&newblk->nb_list); 7005 LIST_REMOVE(newblk, nb_hash); 7006 if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) 7007 free_newdirblk(WK_NEWDIRBLK(wk)); 7008 if (!LIST_EMPTY(&newblk->nb_newdirblk)) 7009 panic("free_newblk: extra newdirblk"); 7010 while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) 7011 indirdep_complete(indirdep); 7012 handle_jwork(&newblk->nb_jwork); 7013 newblk->nb_list.wk_type = D_NEWBLK; 7014 WORKITEM_FREE(newblk, D_NEWBLK); 7015 } 7016 7017 /* 7018 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. 7019 * This routine must be called with splbio interrupts blocked. 7020 */ 7021 static void 7022 free_newdirblk(newdirblk) 7023 struct newdirblk *newdirblk; 7024 { 7025 struct pagedep *pagedep; 7026 struct diradd *dap; 7027 struct worklist *wk; 7028 7029 mtx_assert(&lk, MA_OWNED); 7030 WORKLIST_REMOVE(&newdirblk->db_list); 7031 /* 7032 * If the pagedep is still linked onto the directory buffer 7033 * dependency chain, then some of the entries on the 7034 * pd_pendinghd list may not be committed to disk yet. In 7035 * this case, we will simply clear the NEWBLOCK flag and 7036 * let the pd_pendinghd list be processed when the pagedep 7037 * is next written. If the pagedep is no longer on the buffer 7038 * dependency chain, then all the entries on the pd_pending 7039 * list are committed to disk and we can free them here. 7040 */ 7041 pagedep = newdirblk->db_pagedep; 7042 pagedep->pd_state &= ~NEWBLOCK; 7043 if ((pagedep->pd_state & ONWORKLIST) == 0) { 7044 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 7045 free_diradd(dap, NULL); 7046 /* 7047 * If no dependencies remain, the pagedep will be freed. 7048 */ 7049 free_pagedep(pagedep); 7050 } 7051 /* Should only ever be one item in the list. */ 7052 while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { 7053 WORKLIST_REMOVE(wk); 7054 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 7055 } 7056 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 7057 } 7058 7059 /* 7060 * Prepare an inode to be freed. 
The actual free operation is not 7061 * done until the zero'ed inode has been written to disk. 7062 */ 7063 void 7064 softdep_freefile(pvp, ino, mode) 7065 struct vnode *pvp; 7066 ino_t ino; 7067 int mode; 7068 { 7069 struct inode *ip = VTOI(pvp); 7070 struct inodedep *inodedep; 7071 struct freefile *freefile; 7072 struct freeblks *freeblks; 7073 7074 /* 7075 * This sets up the inode de-allocation dependency. 7076 */ 7077 freefile = malloc(sizeof(struct freefile), 7078 M_FREEFILE, M_SOFTDEP_FLAGS); 7079 workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); 7080 freefile->fx_mode = mode; 7081 freefile->fx_oldinum = ino; 7082 freefile->fx_devvp = ip->i_devvp; 7083 LIST_INIT(&freefile->fx_jwork); 7084 UFS_LOCK(ip->i_ump); 7085 ip->i_fs->fs_pendinginodes += 1; 7086 UFS_UNLOCK(ip->i_ump); 7087 7088 /* 7089 * If the inodedep does not exist, then the zero'ed inode has 7090 * been written to disk. If the allocated inode has never been 7091 * written to disk, then the on-disk inode is zero'ed. In either 7092 * case we can free the file immediately. If the journal was 7093 * canceled before being written the inode will never make it to 7094 * disk and we must send the canceled journal entrys to 7095 * ffs_freefile() to be cleared in conjunction with the bitmap. 7096 * Any blocks waiting on the inode to write can be safely freed 7097 * here as it will never been written. 7098 */ 7099 ACQUIRE_LOCK(&lk); 7100 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7101 if (inodedep) { 7102 /* 7103 * Clear out freeblks that no longer need to reference 7104 * this inode. 7105 */ 7106 while ((freeblks = 7107 TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { 7108 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, 7109 fb_next); 7110 freeblks->fb_state &= ~ONDEPLIST; 7111 } 7112 /* 7113 * Remove this inode from the unlinked list. 7114 */ 7115 if (inodedep->id_state & UNLINKED) { 7116 /* 7117 * Save the journal work to be freed with the bitmap 7118 * before we clear UNLINKED. Otherwise it can be lost 7119 * if the inode block is written. 7120 */ 7121 handle_bufwait(inodedep, &freefile->fx_jwork); 7122 clear_unlinked_inodedep(inodedep); 7123 /* Re-acquire inodedep as we've dropped lk. */ 7124 inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); 7125 } 7126 } 7127 if (inodedep == NULL || check_inode_unwritten(inodedep)) { 7128 FREE_LOCK(&lk); 7129 handle_workitem_freefile(freefile); 7130 return; 7131 } 7132 if ((inodedep->id_state & DEPCOMPLETE) == 0) 7133 inodedep->id_state |= GOINGAWAY; 7134 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); 7135 FREE_LOCK(&lk); 7136 if (ip->i_number == ino) 7137 ip->i_flag |= IN_MODIFIED; 7138 } 7139 7140 /* 7141 * Check to see if an inode has never been written to disk. If 7142 * so free the inodedep and return success, otherwise return failure. 7143 * This routine must be called with splbio interrupts blocked. 7144 * 7145 * If we still have a bitmap dependency, then the inode has never 7146 * been written to disk. Drop the dependency as it is no longer 7147 * necessary since the inode is being deallocated. We set the 7148 * ALLCOMPLETE flags since the bitmap now properly shows that the 7149 * inode is not allocated. Even if the inode is actively being 7150 * written, it has been rolled back to its zero'ed state, so we 7151 * are ensured that a zero inode is what is on the disk. For short 7152 * lived files, this change will usually result in removing all the 7153 * dependencies from the inode so that it can be freed immediately. 
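 * A typical example is a file that is created and removed before its
 * inode block is ever written: the bitmap dependency is still pending,
 * so the inodedep can be torn down here and softdep_freefile() can
 * release the inode at once.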
7154 */ 7155 static int 7156 check_inode_unwritten(inodedep) 7157 struct inodedep *inodedep; 7158 { 7159 7160 mtx_assert(&lk, MA_OWNED); 7161 7162 if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || 7163 !LIST_EMPTY(&inodedep->id_pendinghd) || 7164 !LIST_EMPTY(&inodedep->id_bufwait) || 7165 !LIST_EMPTY(&inodedep->id_inowait) || 7166 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7167 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7168 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7169 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7170 inodedep->id_mkdiradd != NULL || 7171 inodedep->id_nlinkdelta != 0) 7172 return (0); 7173 /* 7174 * Another process might be in initiate_write_inodeblock_ufs[12] 7175 * trying to allocate memory without holding "Softdep Lock". 7176 */ 7177 if ((inodedep->id_state & IOSTARTED) != 0 && 7178 inodedep->id_savedino1 == NULL) 7179 return (0); 7180 7181 if (inodedep->id_state & ONDEPLIST) 7182 LIST_REMOVE(inodedep, id_deps); 7183 inodedep->id_state &= ~ONDEPLIST; 7184 inodedep->id_state |= ALLCOMPLETE; 7185 inodedep->id_bmsafemap = NULL; 7186 if (inodedep->id_state & ONWORKLIST) 7187 WORKLIST_REMOVE(&inodedep->id_list); 7188 if (inodedep->id_savedino1 != NULL) { 7189 free(inodedep->id_savedino1, M_SAVEDINO); 7190 inodedep->id_savedino1 = NULL; 7191 } 7192 if (free_inodedep(inodedep) == 0) 7193 panic("check_inode_unwritten: busy inode"); 7194 return (1); 7195 } 7196 7197 /* 7198 * Try to free an inodedep structure. Return 1 if it could be freed. 7199 */ 7200 static int 7201 free_inodedep(inodedep) 7202 struct inodedep *inodedep; 7203 { 7204 7205 mtx_assert(&lk, MA_OWNED); 7206 if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || 7207 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || 7208 !LIST_EMPTY(&inodedep->id_dirremhd) || 7209 !LIST_EMPTY(&inodedep->id_pendinghd) || 7210 !LIST_EMPTY(&inodedep->id_bufwait) || 7211 !LIST_EMPTY(&inodedep->id_inowait) || 7212 !TAILQ_EMPTY(&inodedep->id_inoreflst) || 7213 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 7214 !TAILQ_EMPTY(&inodedep->id_newinoupdt) || 7215 !TAILQ_EMPTY(&inodedep->id_extupdt) || 7216 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 7217 !TAILQ_EMPTY(&inodedep->id_freeblklst) || 7218 inodedep->id_mkdiradd != NULL || 7219 inodedep->id_nlinkdelta != 0 || 7220 inodedep->id_savedino1 != NULL) 7221 return (0); 7222 if (inodedep->id_state & ONDEPLIST) 7223 LIST_REMOVE(inodedep, id_deps); 7224 LIST_REMOVE(inodedep, id_hash); 7225 WORKITEM_FREE(inodedep, D_INODEDEP); 7226 return (1); 7227 } 7228 7229 /* 7230 * Free the block referenced by a freework structure. The parent freeblks 7231 * structure is released and completed when the final cg bitmap reaches 7232 * the disk. This routine may be freeing a jnewblk which never made it to 7233 * disk in which case we do not have to wait as the operation is undone 7234 * in memory immediately. 7235 */ 7236 static void 7237 freework_freeblock(freework) 7238 struct freework *freework; 7239 { 7240 struct freeblks *freeblks; 7241 struct jnewblk *jnewblk; 7242 struct ufsmount *ump; 7243 struct workhead wkhd; 7244 struct fs *fs; 7245 int bsize; 7246 int needj; 7247 7248 mtx_assert(&lk, MA_OWNED); 7249 /* 7250 * Handle partial truncate separately. 
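 * A freework with fw_indir set was queued by setup_trunc_indir() and is
 * finished by complete_trunc_indir() rather than by freeing a block here.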
7251 */ 7252 if (freework->fw_indir) { 7253 complete_trunc_indir(freework); 7254 return; 7255 } 7256 freeblks = freework->fw_freeblks; 7257 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7258 fs = ump->um_fs; 7259 needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; 7260 bsize = lfragtosize(fs, freework->fw_frags); 7261 LIST_INIT(&wkhd); 7262 /* 7263 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives 7264 * on the indirblk hashtable and prevents premature freeing. 7265 */ 7266 freework->fw_state |= DEPCOMPLETE; 7267 /* 7268 * SUJ needs to wait for the segment referencing freed indirect 7269 * blocks to expire so that we know the checker will not confuse 7270 * a re-allocated indirect block with its old contents. 7271 */ 7272 if (needj && freework->fw_lbn <= -NDADDR) 7273 indirblk_insert(freework); 7274 /* 7275 * If we are canceling an existing jnewblk pass it to the free 7276 * routine, otherwise pass the freeblk which will ultimately 7277 * release the freeblks. If we're not journaling, we can just 7278 * free the freeblks immediately. 7279 */ 7280 jnewblk = freework->fw_jnewblk; 7281 if (jnewblk != NULL) { 7282 cancel_jnewblk(jnewblk, &wkhd); 7283 needj = 0; 7284 } else if (needj) { 7285 freework->fw_state |= DELAYEDFREE; 7286 freeblks->fb_cgwait++; 7287 WORKLIST_INSERT(&wkhd, &freework->fw_list); 7288 } 7289 FREE_LOCK(&lk); 7290 freeblks_free(ump, freeblks, btodb(bsize)); 7291 ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, 7292 freeblks->fb_inum, freeblks->fb_vtype, &wkhd); 7293 ACQUIRE_LOCK(&lk); 7294 /* 7295 * The jnewblk will be discarded and the bits in the map never 7296 * made it to disk. We can immediately free the freeblk. 7297 */ 7298 if (needj == 0) 7299 handle_written_freework(freework); 7300 } 7301 7302 /* 7303 * We enqueue freework items that need processing back on the freeblks and 7304 * add the freeblks to the worklist. This makes it easier to find all work 7305 * required to flush a truncation in process_truncates(). 7306 */ 7307 static void 7308 freework_enqueue(freework) 7309 struct freework *freework; 7310 { 7311 struct freeblks *freeblks; 7312 7313 freeblks = freework->fw_freeblks; 7314 if ((freework->fw_state & INPROGRESS) == 0) 7315 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); 7316 if ((freeblks->fb_state & 7317 (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && 7318 LIST_EMPTY(&freeblks->fb_jblkdephd)) 7319 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7320 } 7321 7322 /* 7323 * Start, continue, or finish the process of freeing an indirect block tree. 7324 * The free operation may be paused at any point with fw_off containing the 7325 * offset to restart from. This enables us to implement some flow control 7326 * for large truncates which may fan out and generate a huge number of 7327 * dependencies. 
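 * fw_off records how many pointers in the indirect block have already been
 * processed; once it reaches NINDIR(fs) the indirect block itself is
 * released via freework_freeblock().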
7328 */ 7329 static void 7330 handle_workitem_indirblk(freework) 7331 struct freework *freework; 7332 { 7333 struct freeblks *freeblks; 7334 struct ufsmount *ump; 7335 struct fs *fs; 7336 7337 freeblks = freework->fw_freeblks; 7338 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7339 fs = ump->um_fs; 7340 if (freework->fw_state & DEPCOMPLETE) { 7341 handle_written_freework(freework); 7342 return; 7343 } 7344 if (freework->fw_off == NINDIR(fs)) { 7345 freework_freeblock(freework); 7346 return; 7347 } 7348 freework->fw_state |= INPROGRESS; 7349 FREE_LOCK(&lk); 7350 indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), 7351 freework->fw_lbn); 7352 ACQUIRE_LOCK(&lk); 7353 } 7354 7355 /* 7356 * Called when a freework structure attached to a cg buf is written. The 7357 * ref on either the parent or the freeblks structure is released and 7358 * the freeblks is added back to the worklist if there is more work to do. 7359 */ 7360 static void 7361 handle_written_freework(freework) 7362 struct freework *freework; 7363 { 7364 struct freeblks *freeblks; 7365 struct freework *parent; 7366 7367 freeblks = freework->fw_freeblks; 7368 parent = freework->fw_parent; 7369 if (freework->fw_state & DELAYEDFREE) 7370 freeblks->fb_cgwait--; 7371 freework->fw_state |= COMPLETE; 7372 if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) 7373 WORKITEM_FREE(freework, D_FREEWORK); 7374 if (parent) { 7375 if (--parent->fw_ref == 0) 7376 freework_enqueue(parent); 7377 return; 7378 } 7379 if (--freeblks->fb_ref != 0) 7380 return; 7381 if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == 7382 ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) 7383 add_to_worklist(&freeblks->fb_list, WK_NODELAY); 7384 } 7385 7386 /* 7387 * This workitem routine performs the block de-allocation. 7388 * The workitem is added to the pending list after the updated 7389 * inode block has been written to disk. As mentioned above, 7390 * checks regarding the number of blocks de-allocated (compared 7391 * to the number of blocks allocated for the file) are also 7392 * performed in this function. 
7393 */ 7394 static int 7395 handle_workitem_freeblocks(freeblks, flags) 7396 struct freeblks *freeblks; 7397 int flags; 7398 { 7399 struct freework *freework; 7400 struct newblk *newblk; 7401 struct allocindir *aip; 7402 struct ufsmount *ump; 7403 struct worklist *wk; 7404 7405 KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), 7406 ("handle_workitem_freeblocks: Journal entries not written.")); 7407 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7408 ACQUIRE_LOCK(&lk); 7409 while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { 7410 WORKLIST_REMOVE(wk); 7411 switch (wk->wk_type) { 7412 case D_DIRREM: 7413 wk->wk_state |= COMPLETE; 7414 add_to_worklist(wk, 0); 7415 continue; 7416 7417 case D_ALLOCDIRECT: 7418 free_newblk(WK_NEWBLK(wk)); 7419 continue; 7420 7421 case D_ALLOCINDIR: 7422 aip = WK_ALLOCINDIR(wk); 7423 freework = NULL; 7424 if (aip->ai_state & DELAYEDFREE) { 7425 FREE_LOCK(&lk); 7426 freework = newfreework(ump, freeblks, NULL, 7427 aip->ai_lbn, aip->ai_newblkno, 7428 ump->um_fs->fs_frag, 0, 0); 7429 ACQUIRE_LOCK(&lk); 7430 } 7431 newblk = WK_NEWBLK(wk); 7432 if (newblk->nb_jnewblk) { 7433 freework->fw_jnewblk = newblk->nb_jnewblk; 7434 newblk->nb_jnewblk->jn_dep = &freework->fw_list; 7435 newblk->nb_jnewblk = NULL; 7436 } 7437 free_newblk(newblk); 7438 continue; 7439 7440 case D_FREEWORK: 7441 freework = WK_FREEWORK(wk); 7442 if (freework->fw_lbn <= -NDADDR) 7443 handle_workitem_indirblk(freework); 7444 else 7445 freework_freeblock(freework); 7446 continue; 7447 default: 7448 panic("handle_workitem_freeblocks: Unknown type %s", 7449 TYPENAME(wk->wk_type)); 7450 } 7451 } 7452 if (freeblks->fb_ref != 0) { 7453 freeblks->fb_state &= ~INPROGRESS; 7454 wake_worklist(&freeblks->fb_list); 7455 freeblks = NULL; 7456 } 7457 FREE_LOCK(&lk); 7458 if (freeblks) 7459 return handle_complete_freeblocks(freeblks, flags); 7460 return (0); 7461 } 7462 7463 /* 7464 * Handle completion of block free via truncate. This allows fs_pending 7465 * to track the actual free block count more closely than if we only updated 7466 * it at the end. We must be careful to handle cases where the block count 7467 * on free was incorrect. 7468 */ 7469 static void 7470 freeblks_free(ump, freeblks, blocks) 7471 struct ufsmount *ump; 7472 struct freeblks *freeblks; 7473 int blocks; 7474 { 7475 struct fs *fs; 7476 ufs2_daddr_t remain; 7477 7478 UFS_LOCK(ump); 7479 remain = -freeblks->fb_chkcnt; 7480 freeblks->fb_chkcnt += blocks; 7481 if (remain > 0) { 7482 if (remain < blocks) 7483 blocks = remain; 7484 fs = ump->um_fs; 7485 fs->fs_pendingblocks -= blocks; 7486 } 7487 UFS_UNLOCK(ump); 7488 } 7489 7490 /* 7491 * Once all of the freework workitems are complete we can retire the 7492 * freeblocks dependency and any journal work awaiting completion. This 7493 * can not be called until all other dependencies are stable on disk. 7494 */ 7495 static int 7496 handle_complete_freeblocks(freeblks, flags) 7497 struct freeblks *freeblks; 7498 int flags; 7499 { 7500 struct inodedep *inodedep; 7501 struct inode *ip; 7502 struct vnode *vp; 7503 struct fs *fs; 7504 struct ufsmount *ump; 7505 ufs2_daddr_t spare; 7506 7507 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7508 fs = ump->um_fs; 7509 flags = LK_EXCLUSIVE | flags; 7510 spare = freeblks->fb_chkcnt; 7511 7512 /* 7513 * If we did not release the expected number of blocks we may have 7514 * to adjust the inode block count here. Only do so if it wasn't 7515 * a truncation to zero and the modrev still matches. 
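 * A negative spare is folded back into fs_pendingblocks below, and any
 * non-zero spare is also reflected in the quota adjustment.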
7516 */ 7517 if (spare && freeblks->fb_len != 0) { 7518 if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7519 flags, &vp, FFSV_FORCEINSMQ) != 0) 7520 return (EBUSY); 7521 ip = VTOI(vp); 7522 if (DIP(ip, i_modrev) == freeblks->fb_modrev) { 7523 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); 7524 ip->i_flag |= IN_CHANGE; 7525 /* 7526 * We must wait so this happens before the 7527 * journal is reclaimed. 7528 */ 7529 ffs_update(vp, 1); 7530 } 7531 vput(vp); 7532 } 7533 if (spare < 0) { 7534 UFS_LOCK(ump); 7535 fs->fs_pendingblocks += spare; 7536 UFS_UNLOCK(ump); 7537 } 7538 #ifdef QUOTA 7539 /* Handle spare. */ 7540 if (spare) 7541 quotaadj(freeblks->fb_quota, ump, -spare); 7542 quotarele(freeblks->fb_quota); 7543 #endif 7544 ACQUIRE_LOCK(&lk); 7545 if (freeblks->fb_state & ONDEPLIST) { 7546 inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 7547 0, &inodedep); 7548 TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); 7549 freeblks->fb_state &= ~ONDEPLIST; 7550 if (TAILQ_EMPTY(&inodedep->id_freeblklst)) 7551 free_inodedep(inodedep); 7552 } 7553 /* 7554 * All of the freeblock deps must be complete prior to this call 7555 * so it's now safe to complete earlier outstanding journal entries. 7556 */ 7557 handle_jwork(&freeblks->fb_jwork); 7558 WORKITEM_FREE(freeblks, D_FREEBLKS); 7559 FREE_LOCK(&lk); 7560 return (0); 7561 } 7562 7563 /* 7564 * Release blocks associated with the freeblks and stored in the indirect 7565 * block dbn. If level is greater than SINGLE, the block is an indirect block 7566 * and recursive calls to indirtrunc must be used to cleanse other indirect 7567 * blocks. 7568 * 7569 * This handles partial and complete truncation of blocks. Partial is noted 7570 * with goingaway == 0. In this case the freework is completed after the 7571 * zero'd indirects are written to disk. For full truncation the freework 7572 * is completed after the block is freed. 7573 */ 7574 static void 7575 indir_trunc(freework, dbn, lbn) 7576 struct freework *freework; 7577 ufs2_daddr_t dbn; 7578 ufs_lbn_t lbn; 7579 { 7580 struct freework *nfreework; 7581 struct workhead wkhd; 7582 struct freeblks *freeblks; 7583 struct buf *bp; 7584 struct fs *fs; 7585 struct indirdep *indirdep; 7586 struct ufsmount *ump; 7587 ufs1_daddr_t *bap1 = 0; 7588 ufs2_daddr_t nb, nnb, *bap2 = 0; 7589 ufs_lbn_t lbnadd, nlbn; 7590 int i, nblocks, ufs1fmt; 7591 int freedblocks; 7592 int goingaway; 7593 int freedeps; 7594 int needj; 7595 int level; 7596 int cnt; 7597 7598 freeblks = freework->fw_freeblks; 7599 ump = VFSTOUFS(freeblks->fb_list.wk_mp); 7600 fs = ump->um_fs; 7601 /* 7602 * Get buffer of block pointers to be freed. There are three cases: 7603 * 7604 * 1) Partial truncate caches the indirdep pointer in the freework 7605 * which provides us a back copy to the save bp which holds the 7606 * pointers we want to clear. When this completes the zero 7607 * pointers are written to the real copy. 7608 * 2) The indirect is being completely truncated, cancel_indirdep() 7609 * eliminated the real copy and placed the indirdep on the saved 7610 * copy. The indirdep and buf are discarded when this completes. 7611 * 3) The indirect was not in memory, we read a copy off of the disk 7612 * using the devvp and drop and invalidate the buffer when we're 7613 * done. 
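 * Only case (1) leaves goingaway clear; in that case the saved buf and
 * indirdep persist until the zero'ed pointers are written, while in the
 * other two cases the buffer is invalidated and released here.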
7614 */ 7615 goingaway = 1; 7616 indirdep = NULL; 7617 if (freework->fw_indir != NULL) { 7618 goingaway = 0; 7619 indirdep = freework->fw_indir; 7620 bp = indirdep->ir_savebp; 7621 if (bp == NULL || bp->b_blkno != dbn) 7622 panic("indir_trunc: Bad saved buf %p blkno %jd", 7623 bp, (intmax_t)dbn); 7624 } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { 7625 /* 7626 * The lock prevents the buf dep list from changing and 7627 * indirects on devvp should only ever have one dependency. 7628 */ 7629 indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); 7630 if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) 7631 panic("indir_trunc: Bad indirdep %p from buf %p", 7632 indirdep, bp); 7633 } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 7634 NOCRED, &bp) != 0) { 7635 brelse(bp); 7636 return; 7637 } 7638 ACQUIRE_LOCK(&lk); 7639 /* Protects against a race with complete_trunc_indir(). */ 7640 freework->fw_state &= ~INPROGRESS; 7641 /* 7642 * If we have an indirdep we need to enforce the truncation order 7643 * and discard it when it is complete. 7644 */ 7645 if (indirdep) { 7646 if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && 7647 !TAILQ_EMPTY(&indirdep->ir_trunc)) { 7648 /* 7649 * Add the complete truncate to the list on the 7650 * indirdep to enforce in-order processing. 7651 */ 7652 if (freework->fw_indir == NULL) 7653 TAILQ_INSERT_TAIL(&indirdep->ir_trunc, 7654 freework, fw_next); 7655 FREE_LOCK(&lk); 7656 return; 7657 } 7658 /* 7659 * If we're goingaway, free the indirdep. Otherwise it will 7660 * linger until the write completes. 7661 */ 7662 if (goingaway) { 7663 free_indirdep(indirdep); 7664 ump->um_numindirdeps -= 1; 7665 } 7666 } 7667 FREE_LOCK(&lk); 7668 /* Initialize pointers depending on block size. */ 7669 if (ump->um_fstype == UFS1) { 7670 bap1 = (ufs1_daddr_t *)bp->b_data; 7671 nb = bap1[freework->fw_off]; 7672 ufs1fmt = 1; 7673 } else { 7674 bap2 = (ufs2_daddr_t *)bp->b_data; 7675 nb = bap2[freework->fw_off]; 7676 ufs1fmt = 0; 7677 } 7678 level = lbn_level(lbn); 7679 needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; 7680 lbnadd = lbn_offset(fs, level); 7681 nblocks = btodb(fs->fs_bsize); 7682 nfreework = freework; 7683 freedeps = 0; 7684 cnt = 0; 7685 /* 7686 * Reclaim blocks. Traverses into nested indirect levels and 7687 * arranges for the current level to be freed when subordinates 7688 * are free when journaling. 7689 */ 7690 for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { 7691 if (i != NINDIR(fs) - 1) { 7692 if (ufs1fmt) 7693 nnb = bap1[i+1]; 7694 else 7695 nnb = bap2[i+1]; 7696 } else 7697 nnb = 0; 7698 if (nb == 0) 7699 continue; 7700 cnt++; 7701 if (level != 0) { 7702 nlbn = (lbn + 1) - (i * lbnadd); 7703 if (needj != 0) { 7704 nfreework = newfreework(ump, freeblks, freework, 7705 nlbn, nb, fs->fs_frag, 0, 0); 7706 freedeps++; 7707 } 7708 indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); 7709 } else { 7710 struct freedep *freedep; 7711 7712 /* 7713 * Attempt to aggregate freedep dependencies for 7714 * all blocks being released to the same CG. 
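 * A freedep is attached only for the last block of a run within one
 * cylinder group (the next pointer is zero or lands in a different cg),
 * so a whole run of blocks in one cg is covered by a single freedep.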
7715 */ 7716 LIST_INIT(&wkhd); 7717 if (needj != 0 && 7718 (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { 7719 freedep = newfreedep(freework); 7720 WORKLIST_INSERT_UNLOCKED(&wkhd, 7721 &freedep->fd_list); 7722 freedeps++; 7723 } 7724 ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, 7725 fs->fs_bsize, freeblks->fb_inum, 7726 freeblks->fb_vtype, &wkhd); 7727 } 7728 } 7729 if (goingaway) { 7730 bp->b_flags |= B_INVAL | B_NOCACHE; 7731 brelse(bp); 7732 } 7733 freedblocks = 0; 7734 if (level == 0) 7735 freedblocks = (nblocks * cnt); 7736 if (needj == 0) 7737 freedblocks += nblocks; 7738 freeblks_free(ump, freeblks, freedblocks); 7739 /* 7740 * If we are journaling set up the ref counts and offset so this 7741 * indirect can be completed when its children are free. 7742 */ 7743 if (needj) { 7744 ACQUIRE_LOCK(&lk); 7745 freework->fw_off = i; 7746 freework->fw_ref += freedeps; 7747 freework->fw_ref -= NINDIR(fs) + 1; 7748 if (level == 0) 7749 freeblks->fb_cgwait += freedeps; 7750 if (freework->fw_ref == 0) 7751 freework_freeblock(freework); 7752 FREE_LOCK(&lk); 7753 return; 7754 } 7755 /* 7756 * If we're not journaling we can free the indirect now. 7757 */ 7758 dbn = dbtofsb(fs, dbn); 7759 ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, 7760 freeblks->fb_inum, freeblks->fb_vtype, NULL); 7761 /* Non SUJ softdep does single-threaded truncations. */ 7762 if (freework->fw_blkno == dbn) { 7763 freework->fw_state |= ALLCOMPLETE; 7764 ACQUIRE_LOCK(&lk); 7765 handle_written_freework(freework); 7766 FREE_LOCK(&lk); 7767 } 7768 return; 7769 } 7770 7771 /* 7772 * Cancel an allocindir when it is removed via truncation. When bp is not 7773 * NULL the indirect never appeared on disk and is scheduled to be freed 7774 * independently of the indir so we can more easily track journal work. 7775 */ 7776 static void 7777 cancel_allocindir(aip, bp, freeblks, trunc) 7778 struct allocindir *aip; 7779 struct buf *bp; 7780 struct freeblks *freeblks; 7781 int trunc; 7782 { 7783 struct indirdep *indirdep; 7784 struct freefrag *freefrag; 7785 struct newblk *newblk; 7786 7787 newblk = (struct newblk *)aip; 7788 LIST_REMOVE(aip, ai_next); 7789 /* 7790 * We must eliminate the pointer in bp if it must be freed on its 7791 * own due to partial truncate or pending journal work. 7792 */ 7793 if (bp && (trunc || newblk->nb_jnewblk)) { 7794 /* 7795 * Clear the pointer and mark the aip to be freed 7796 * directly if it never existed on disk. 7797 */ 7798 aip->ai_state |= DELAYEDFREE; 7799 indirdep = aip->ai_indirdep; 7800 if (indirdep->ir_state & UFS1FMT) 7801 ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7802 else 7803 ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; 7804 } 7805 /* 7806 * When truncating the previous pointer will be freed via 7807 * savedbp. Eliminate the freefrag which would dup free. 7808 */ 7809 if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { 7810 newblk->nb_freefrag = NULL; 7811 if (freefrag->ff_jdep) 7812 cancel_jfreefrag( 7813 WK_JFREEFRAG(freefrag->ff_jdep)); 7814 jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); 7815 WORKITEM_FREE(freefrag, D_FREEFRAG); 7816 } 7817 /* 7818 * If the journal hasn't been written the jnewblk must be passed 7819 * to the call to ffs_blkfree that reclaims the space. We accomplish 7820 * this by leaving the journal dependency on the newblk to be freed 7821 * when a freework is created in handle_workitem_freeblocks(). 
7822 */ 7823 cancel_newblk(newblk, NULL, &freeblks->fb_jwork); 7824 WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); 7825 } 7826 7827 /* 7828 * Create the mkdir dependencies for . and .. in a new directory. Link them 7829 * in to a newdirblk so any subsequent additions are tracked properly. The 7830 * caller is responsible for adding the mkdir1 dependency to the journal 7831 * and updating id_mkdiradd. This function returns with lk held. 7832 */ 7833 static struct mkdir * 7834 setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) 7835 struct diradd *dap; 7836 ino_t newinum; 7837 ino_t dinum; 7838 struct buf *newdirbp; 7839 struct mkdir **mkdirp; 7840 { 7841 struct newblk *newblk; 7842 struct pagedep *pagedep; 7843 struct inodedep *inodedep; 7844 struct newdirblk *newdirblk = 0; 7845 struct mkdir *mkdir1, *mkdir2; 7846 struct worklist *wk; 7847 struct jaddref *jaddref; 7848 struct mount *mp; 7849 7850 mp = dap->da_list.wk_mp; 7851 newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, 7852 M_SOFTDEP_FLAGS); 7853 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 7854 LIST_INIT(&newdirblk->db_mkdir); 7855 mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7856 workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); 7857 mkdir1->md_state = ATTACHED | MKDIR_BODY; 7858 mkdir1->md_diradd = dap; 7859 mkdir1->md_jaddref = NULL; 7860 mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); 7861 workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); 7862 mkdir2->md_state = ATTACHED | MKDIR_PARENT; 7863 mkdir2->md_diradd = dap; 7864 mkdir2->md_jaddref = NULL; 7865 if (MOUNTEDSUJ(mp) == 0) { 7866 mkdir1->md_state |= DEPCOMPLETE; 7867 mkdir2->md_state |= DEPCOMPLETE; 7868 } 7869 /* 7870 * Dependency on "." and ".." being written to disk. 7871 */ 7872 mkdir1->md_buf = newdirbp; 7873 ACQUIRE_LOCK(&lk); 7874 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); 7875 /* 7876 * We must link the pagedep, allocdirect, and newdirblk for 7877 * the initial file page so the pointer to the new directory 7878 * is not written until the directory contents are live and 7879 * any subsequent additions are not marked live until the 7880 * block is reachable via the inode. 7881 */ 7882 if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) 7883 panic("setup_newdir: lost pagedep"); 7884 LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) 7885 if (wk->wk_type == D_ALLOCDIRECT) 7886 break; 7887 if (wk == NULL) 7888 panic("setup_newdir: lost allocdirect"); 7889 if (pagedep->pd_state & NEWBLOCK) 7890 panic("setup_newdir: NEWBLOCK already set"); 7891 newblk = WK_NEWBLK(wk); 7892 pagedep->pd_state |= NEWBLOCK; 7893 pagedep->pd_newdirblk = newdirblk; 7894 newdirblk->db_pagedep = pagedep; 7895 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 7896 WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); 7897 /* 7898 * Look up the inodedep for the parent directory so that we 7899 * can link mkdir2 into the pending dotdot jaddref or 7900 * the inode write if there is none. If the inode is 7901 * ALLCOMPLETE and no jaddref is present all dependencies have 7902 * been satisfied and mkdir2 can be freed. 
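 * Note that mkdir1 (MKDIR_BODY) always remains linked on mkdirlisthd
 * above; only mkdir2 (MKDIR_PARENT) is ever freed here.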
7903 */ 7904 inodedep_lookup(mp, dinum, 0, &inodedep); 7905 if (MOUNTEDSUJ(mp)) { 7906 if (inodedep == NULL) 7907 panic("setup_newdir: Lost parent."); 7908 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 7909 inoreflst); 7910 KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && 7911 (jaddref->ja_state & MKDIR_PARENT), 7912 ("setup_newdir: bad dotdot jaddref %p", jaddref)); 7913 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7914 mkdir2->md_jaddref = jaddref; 7915 jaddref->ja_mkdir = mkdir2; 7916 } else if (inodedep == NULL || 7917 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 7918 dap->da_state &= ~MKDIR_PARENT; 7919 WORKITEM_FREE(mkdir2, D_MKDIR); 7920 } else { 7921 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); 7922 WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); 7923 } 7924 *mkdirp = mkdir2; 7925 7926 return (mkdir1); 7927 } 7928 7929 /* 7930 * Directory entry addition dependencies. 7931 * 7932 * When adding a new directory entry, the inode (with its incremented link 7933 * count) must be written to disk before the directory entry's pointer to it. 7934 * Also, if the inode is newly allocated, the corresponding freemap must be 7935 * updated (on disk) before the directory entry's pointer. These requirements 7936 * are met via undo/redo on the directory entry's pointer, which consists 7937 * simply of the inode number. 7938 * 7939 * As directory entries are added and deleted, the free space within a 7940 * directory block can become fragmented. The ufs filesystem will compact 7941 * a fragmented directory block to make space for a new entry. When this 7942 * occurs, the offsets of previously added entries change. Any "diradd" 7943 * dependency structures corresponding to these entries must be updated with 7944 * the new offsets. 7945 */ 7946 7947 /* 7948 * This routine is called after the in-memory inode's link 7949 * count has been incremented, but before the directory entry's 7950 * pointer to the inode has been set. 7951 */ 7952 int 7953 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) 7954 struct buf *bp; /* buffer containing directory block */ 7955 struct inode *dp; /* inode for directory */ 7956 off_t diroffset; /* offset of new entry in directory */ 7957 ino_t newinum; /* inode referenced by new directory entry */ 7958 struct buf *newdirbp; /* non-NULL => contents of new mkdir */ 7959 int isnewblk; /* entry is in a newly allocated block */ 7960 { 7961 int offset; /* offset of new entry within directory block */ 7962 ufs_lbn_t lbn; /* block in directory containing new entry */ 7963 struct fs *fs; 7964 struct diradd *dap; 7965 struct newblk *newblk; 7966 struct pagedep *pagedep; 7967 struct inodedep *inodedep; 7968 struct newdirblk *newdirblk = 0; 7969 struct mkdir *mkdir1, *mkdir2; 7970 struct jaddref *jaddref; 7971 struct mount *mp; 7972 int isindir; 7973 7974 /* 7975 * Whiteouts have no dependencies. 7976 */ 7977 if (newinum == WINO) { 7978 if (newdirbp != NULL) 7979 bdwrite(newdirbp); 7980 return (0); 7981 } 7982 jaddref = NULL; 7983 mkdir1 = mkdir2 = NULL; 7984 mp = UFSTOVFS(dp->i_ump); 7985 fs = dp->i_fs; 7986 lbn = lblkno(fs, diroffset); 7987 offset = blkoff(fs, diroffset); 7988 dap = malloc(sizeof(struct diradd), M_DIRADD, 7989 M_SOFTDEP_FLAGS|M_ZERO); 7990 workitem_alloc(&dap->da_list, D_DIRADD, mp); 7991 dap->da_offset = offset; 7992 dap->da_newinum = newinum; 7993 dap->da_state = ATTACHED; 7994 LIST_INIT(&dap->da_jwork); 7995 isindir = bp->b_lblkno >= NDADDR; 7996 if (isnewblk && 7997 (isindir ? 
blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { 7998 newdirblk = malloc(sizeof(struct newdirblk), 7999 M_NEWDIRBLK, M_SOFTDEP_FLAGS); 8000 workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); 8001 LIST_INIT(&newdirblk->db_mkdir); 8002 } 8003 /* 8004 * If we're creating a new directory setup the dependencies and set 8005 * the dap state to wait for them. Otherwise it's COMPLETE and 8006 * we can move on. 8007 */ 8008 if (newdirbp == NULL) { 8009 dap->da_state |= DEPCOMPLETE; 8010 ACQUIRE_LOCK(&lk); 8011 } else { 8012 dap->da_state |= MKDIR_BODY | MKDIR_PARENT; 8013 mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, 8014 &mkdir2); 8015 } 8016 /* 8017 * Link into parent directory pagedep to await its being written. 8018 */ 8019 pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); 8020 #ifdef DEBUG 8021 if (diradd_lookup(pagedep, offset) != NULL) 8022 panic("softdep_setup_directory_add: %p already at off %d\n", 8023 diradd_lookup(pagedep, offset), offset); 8024 #endif 8025 dap->da_pagedep = pagedep; 8026 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, 8027 da_pdlist); 8028 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 8029 /* 8030 * If we're journaling, link the diradd into the jaddref so it 8031 * may be completed after the journal entry is written. Otherwise, 8032 * link the diradd into its inodedep. If the inode is not yet 8033 * written place it on the bufwait list, otherwise do the post-inode 8034 * write processing to put it on the id_pendinghd list. 8035 */ 8036 if (MOUNTEDSUJ(mp)) { 8037 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8038 inoreflst); 8039 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8040 ("softdep_setup_directory_add: bad jaddref %p", jaddref)); 8041 jaddref->ja_diroff = diroffset; 8042 jaddref->ja_diradd = dap; 8043 add_to_journal(&jaddref->ja_list); 8044 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) 8045 diradd_inode_written(dap, inodedep); 8046 else 8047 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8048 /* 8049 * Add the journal entries for . and .. links now that the primary 8050 * link is written. 8051 */ 8052 if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { 8053 jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, 8054 inoreflst, if_deps); 8055 KASSERT(jaddref != NULL && 8056 jaddref->ja_ino == jaddref->ja_parent && 8057 (jaddref->ja_state & MKDIR_BODY), 8058 ("softdep_setup_directory_add: bad dot jaddref %p", 8059 jaddref)); 8060 mkdir1->md_jaddref = jaddref; 8061 jaddref->ja_mkdir = mkdir1; 8062 /* 8063 * It is important that the dotdot journal entry 8064 * is added prior to the dot entry since dot writes 8065 * both the dot and dotdot links. These both must 8066 * be added after the primary link for the journal 8067 * to remain consistent. 8068 */ 8069 add_to_journal(&mkdir2->md_jaddref->ja_list); 8070 add_to_journal(&jaddref->ja_list); 8071 } 8072 /* 8073 * If we are adding a new directory remember this diradd so that if 8074 * we rename it we can keep the dot and dotdot dependencies. If 8075 * we are adding a new name for an inode that has a mkdiradd we 8076 * must be in rename and we have to move the dot and dotdot 8077 * dependencies to this new name. The old name is being orphaned 8078 * soon. 
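 * merge_diradd() below performs that move of the dot and dotdot
 * dependencies onto the new name.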
8079 */ 8080 if (mkdir1 != NULL) { 8081 if (inodedep->id_mkdiradd != NULL) 8082 panic("softdep_setup_directory_add: Existing mkdir"); 8083 inodedep->id_mkdiradd = dap; 8084 } else if (inodedep->id_mkdiradd) 8085 merge_diradd(inodedep, dap); 8086 if (newdirblk) { 8087 /* 8088 * There is nothing to do if we are already tracking 8089 * this block. 8090 */ 8091 if ((pagedep->pd_state & NEWBLOCK) != 0) { 8092 WORKITEM_FREE(newdirblk, D_NEWDIRBLK); 8093 FREE_LOCK(&lk); 8094 return (0); 8095 } 8096 if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) 8097 == 0) 8098 panic("softdep_setup_directory_add: lost entry"); 8099 WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); 8100 pagedep->pd_state |= NEWBLOCK; 8101 pagedep->pd_newdirblk = newdirblk; 8102 newdirblk->db_pagedep = pagedep; 8103 FREE_LOCK(&lk); 8104 /* 8105 * If we extended into an indirect signal direnter to sync. 8106 */ 8107 if (isindir) 8108 return (1); 8109 return (0); 8110 } 8111 FREE_LOCK(&lk); 8112 return (0); 8113 } 8114 8115 /* 8116 * This procedure is called to change the offset of a directory 8117 * entry when compacting a directory block which must be owned 8118 * exclusively by the caller. Note that the actual entry movement 8119 * must be done in this procedure to ensure that no I/O completions 8120 * occur while the move is in progress. 8121 */ 8122 void 8123 softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) 8124 struct buf *bp; /* Buffer holding directory block. */ 8125 struct inode *dp; /* inode for directory */ 8126 caddr_t base; /* address of dp->i_offset */ 8127 caddr_t oldloc; /* address of old directory location */ 8128 caddr_t newloc; /* address of new directory location */ 8129 int entrysize; /* size of directory entry */ 8130 { 8131 int offset, oldoffset, newoffset; 8132 struct pagedep *pagedep; 8133 struct jmvref *jmvref; 8134 struct diradd *dap; 8135 struct direct *de; 8136 struct mount *mp; 8137 ufs_lbn_t lbn; 8138 int flags; 8139 8140 mp = UFSTOVFS(dp->i_ump); 8141 de = (struct direct *)oldloc; 8142 jmvref = NULL; 8143 flags = 0; 8144 /* 8145 * Moves are always journaled as it would be too complex to 8146 * determine if any affected adds or removes are present in the 8147 * journal. 8148 */ 8149 if (MOUNTEDSUJ(mp)) { 8150 flags = DEPALLOC; 8151 jmvref = newjmvref(dp, de->d_ino, 8152 dp->i_offset + (oldloc - base), 8153 dp->i_offset + (newloc - base)); 8154 } 8155 lbn = lblkno(dp->i_fs, dp->i_offset); 8156 offset = blkoff(dp->i_fs, dp->i_offset); 8157 oldoffset = offset + (oldloc - base); 8158 newoffset = offset + (newloc - base); 8159 ACQUIRE_LOCK(&lk); 8160 if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) 8161 goto done; 8162 dap = diradd_lookup(pagedep, oldoffset); 8163 if (dap) { 8164 dap->da_offset = newoffset; 8165 newoffset = DIRADDHASH(newoffset); 8166 oldoffset = DIRADDHASH(oldoffset); 8167 if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && 8168 newoffset != oldoffset) { 8169 LIST_REMOVE(dap, da_pdlist); 8170 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], 8171 dap, da_pdlist); 8172 } 8173 } 8174 done: 8175 if (jmvref) { 8176 jmvref->jm_pagedep = pagedep; 8177 LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); 8178 add_to_journal(&jmvref->jm_list); 8179 } 8180 bcopy(oldloc, newloc, entrysize); 8181 FREE_LOCK(&lk); 8182 } 8183 8184 /* 8185 * Move the mkdir dependencies and journal work from one diradd to another 8186 * when renaming a directory. The new name must depend on the mkdir deps 8187 * completing as the old name did. 
Directories can only have one valid link 8188 * at a time so one must be canonical. 8189 */ 8190 static void 8191 merge_diradd(inodedep, newdap) 8192 struct inodedep *inodedep; 8193 struct diradd *newdap; 8194 { 8195 struct diradd *olddap; 8196 struct mkdir *mkdir, *nextmd; 8197 short state; 8198 8199 olddap = inodedep->id_mkdiradd; 8200 inodedep->id_mkdiradd = newdap; 8201 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8202 newdap->da_state &= ~DEPCOMPLETE; 8203 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8204 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8205 if (mkdir->md_diradd != olddap) 8206 continue; 8207 mkdir->md_diradd = newdap; 8208 state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); 8209 newdap->da_state |= state; 8210 olddap->da_state &= ~state; 8211 if ((olddap->da_state & 8212 (MKDIR_PARENT | MKDIR_BODY)) == 0) 8213 break; 8214 } 8215 if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8216 panic("merge_diradd: unfound ref"); 8217 } 8218 /* 8219 * Any mkdir related journal items are not safe to be freed until 8220 * the new name is stable. 8221 */ 8222 jwork_move(&newdap->da_jwork, &olddap->da_jwork); 8223 olddap->da_state |= DEPCOMPLETE; 8224 complete_diradd(olddap); 8225 } 8226 8227 /* 8228 * Move the diradd to the pending list when all diradd dependencies are 8229 * complete. 8230 */ 8231 static void 8232 complete_diradd(dap) 8233 struct diradd *dap; 8234 { 8235 struct pagedep *pagedep; 8236 8237 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 8238 if (dap->da_state & DIRCHG) 8239 pagedep = dap->da_previous->dm_pagedep; 8240 else 8241 pagedep = dap->da_pagedep; 8242 LIST_REMOVE(dap, da_pdlist); 8243 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 8244 } 8245 } 8246 8247 /* 8248 * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal 8249 * add entries and conditionally journal the remove. 8250 */ 8251 static void 8252 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) 8253 struct diradd *dap; 8254 struct dirrem *dirrem; 8255 struct jremref *jremref; 8256 struct jremref *dotremref; 8257 struct jremref *dotdotremref; 8258 { 8259 struct inodedep *inodedep; 8260 struct jaddref *jaddref; 8261 struct inoref *inoref; 8262 struct mkdir *mkdir; 8263 8264 /* 8265 * If no remove references were allocated we're on a non-journaled 8266 * filesystem and can skip the cancel step. 8267 */ 8268 if (jremref == NULL) { 8269 free_diradd(dap, NULL); 8270 return; 8271 } 8272 /* 8273 * Cancel the primary name and free it if it does not require 8274 * journaling. 8275 */ 8276 if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 8277 0, &inodedep) != 0) { 8278 /* Abort the addref that references this diradd. */ 8279 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 8280 if (inoref->if_list.wk_type != D_JADDREF) 8281 continue; 8282 jaddref = (struct jaddref *)inoref; 8283 if (jaddref->ja_diradd != dap) 8284 continue; 8285 if (cancel_jaddref(jaddref, inodedep, 8286 &dirrem->dm_jwork) == 0) { 8287 free_jremref(jremref); 8288 jremref = NULL; 8289 } 8290 break; 8291 } 8292 } 8293 /* 8294 * Cancel subordinate names and free them if they do not require 8295 * journaling.
8296 */ 8297 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8298 LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) { 8299 if (mkdir->md_diradd != dap) 8300 continue; 8301 if ((jaddref = mkdir->md_jaddref) == NULL) 8302 continue; 8303 mkdir->md_jaddref = NULL; 8304 if (mkdir->md_state & MKDIR_PARENT) { 8305 if (cancel_jaddref(jaddref, NULL, 8306 &dirrem->dm_jwork) == 0) { 8307 free_jremref(dotdotremref); 8308 dotdotremref = NULL; 8309 } 8310 } else { 8311 if (cancel_jaddref(jaddref, inodedep, 8312 &dirrem->dm_jwork) == 0) { 8313 free_jremref(dotremref); 8314 dotremref = NULL; 8315 } 8316 } 8317 } 8318 } 8319 8320 if (jremref) 8321 journal_jremref(dirrem, jremref, inodedep); 8322 if (dotremref) 8323 journal_jremref(dirrem, dotremref, inodedep); 8324 if (dotdotremref) 8325 journal_jremref(dirrem, dotdotremref, NULL); 8326 jwork_move(&dirrem->dm_jwork, &dap->da_jwork); 8327 free_diradd(dap, &dirrem->dm_jwork); 8328 } 8329 8330 /* 8331 * Free a diradd dependency structure. This routine must be called 8332 * with splbio interrupts blocked. 8333 */ 8334 static void 8335 free_diradd(dap, wkhd) 8336 struct diradd *dap; 8337 struct workhead *wkhd; 8338 { 8339 struct dirrem *dirrem; 8340 struct pagedep *pagedep; 8341 struct inodedep *inodedep; 8342 struct mkdir *mkdir, *nextmd; 8343 8344 mtx_assert(&lk, MA_OWNED); 8345 LIST_REMOVE(dap, da_pdlist); 8346 if (dap->da_state & ONWORKLIST) 8347 WORKLIST_REMOVE(&dap->da_list); 8348 if ((dap->da_state & DIRCHG) == 0) { 8349 pagedep = dap->da_pagedep; 8350 } else { 8351 dirrem = dap->da_previous; 8352 pagedep = dirrem->dm_pagedep; 8353 dirrem->dm_dirinum = pagedep->pd_ino; 8354 dirrem->dm_state |= COMPLETE; 8355 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8356 add_to_worklist(&dirrem->dm_list, 0); 8357 } 8358 if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 8359 0, &inodedep) != 0) 8360 if (inodedep->id_mkdiradd == dap) 8361 inodedep->id_mkdiradd = NULL; 8362 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { 8363 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { 8364 nextmd = LIST_NEXT(mkdir, md_mkdirs); 8365 if (mkdir->md_diradd != dap) 8366 continue; 8367 dap->da_state &= 8368 ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 8369 LIST_REMOVE(mkdir, md_mkdirs); 8370 if (mkdir->md_state & ONWORKLIST) 8371 WORKLIST_REMOVE(&mkdir->md_list); 8372 if (mkdir->md_jaddref != NULL) 8373 panic("free_diradd: Unexpected jaddref"); 8374 WORKITEM_FREE(mkdir, D_MKDIR); 8375 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) 8376 break; 8377 } 8378 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) 8379 panic("free_diradd: unfound ref"); 8380 } 8381 if (inodedep) 8382 free_inodedep(inodedep); 8383 /* 8384 * Free any journal segments waiting for the directory write. 8385 */ 8386 handle_jwork(&dap->da_jwork); 8387 WORKITEM_FREE(dap, D_DIRADD); 8388 } 8389 8390 /* 8391 * Directory entry removal dependencies. 8392 * 8393 * When removing a directory entry, the entry's inode pointer must be 8394 * zero'ed on disk before the corresponding inode's link count is decremented 8395 * (possibly freeing the inode for re-use). This dependency is handled by 8396 * updating the directory entry but delaying the inode count reduction until 8397 * after the directory block has been written to disk. After this point, the 8398 * inode count can be decremented whenever it is convenient. 8399 */ 8400 8401 /* 8402 * This routine should be called immediately after removing 8403 * a directory entry. 
The inode's link count should not be 8404 * decremented by the calling procedure -- the soft updates 8405 * code will do this task when it is safe. 8406 */ 8407 void 8408 softdep_setup_remove(bp, dp, ip, isrmdir) 8409 struct buf *bp; /* buffer containing directory block */ 8410 struct inode *dp; /* inode for the directory being modified */ 8411 struct inode *ip; /* inode for directory entry being removed */ 8412 int isrmdir; /* indicates if doing RMDIR */ 8413 { 8414 struct dirrem *dirrem, *prevdirrem; 8415 struct inodedep *inodedep; 8416 int direct; 8417 8418 /* 8419 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want 8420 * newdirrem() to set up the full directory remove which requires 8421 * isrmdir > 1. 8422 */ 8423 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8424 /* 8425 * Add the dirrem to the inodedep's pending remove list for quick 8426 * discovery later. 8427 */ 8428 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8429 &inodedep) == 0) 8430 panic("softdep_setup_remove: Lost inodedep."); 8431 dirrem->dm_state |= ONDEPLIST; 8432 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8433 8434 /* 8435 * If the COMPLETE flag is clear, then there were no active 8436 * entries and we want to roll back to a zeroed entry until 8437 * the new inode is committed to disk. If the COMPLETE flag is 8438 * set then we have deleted an entry that never made it to 8439 * disk. If the entry we deleted resulted from a name change, 8440 * then the old name still resides on disk. We cannot delete 8441 * its inode (returned to us in prevdirrem) until the zeroed 8442 * directory entry gets to disk. The new inode has never been 8443 * referenced on the disk, so can be deleted immediately. 8444 */ 8445 if ((dirrem->dm_state & COMPLETE) == 0) { 8446 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, 8447 dm_next); 8448 FREE_LOCK(&lk); 8449 } else { 8450 if (prevdirrem != NULL) 8451 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, 8452 prevdirrem, dm_next); 8453 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; 8454 direct = LIST_EMPTY(&dirrem->dm_jremrefhd); 8455 FREE_LOCK(&lk); 8456 if (direct) 8457 handle_workitem_remove(dirrem, 0); 8458 } 8459 } 8460 8461 /* 8462 * Check for an entry matching 'offset' on both the pd_diraddhd list and the 8463 * pd_pendinghd list of a pagedep. 8464 */ 8465 static struct diradd * 8466 diradd_lookup(pagedep, offset) 8467 struct pagedep *pagedep; 8468 int offset; 8469 { 8470 struct diradd *dap; 8471 8472 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) 8473 if (dap->da_offset == offset) 8474 return (dap); 8475 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) 8476 if (dap->da_offset == offset) 8477 return (dap); 8478 return (NULL); 8479 } 8480 8481 /* 8482 * Search for a .. diradd dependency in a directory that is being removed. 8483 * If the directory was renamed to a new parent we have a diradd rather 8484 * than a mkdir for the .. entry. We need to cancel it now before 8485 * it is found in truncate().
8486 */ 8487 static struct jremref * 8488 cancel_diradd_dotdot(ip, dirrem, jremref) 8489 struct inode *ip; 8490 struct dirrem *dirrem; 8491 struct jremref *jremref; 8492 { 8493 struct pagedep *pagedep; 8494 struct diradd *dap; 8495 struct worklist *wk; 8496 8497 if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, 8498 &pagedep) == 0) 8499 return (jremref); 8500 dap = diradd_lookup(pagedep, DOTDOT_OFFSET); 8501 if (dap == NULL) 8502 return (jremref); 8503 cancel_diradd(dap, dirrem, jremref, NULL, NULL); 8504 /* 8505 * Mark any journal work as belonging to the parent so it is freed 8506 * with the .. reference. 8507 */ 8508 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8509 wk->wk_state |= MKDIR_PARENT; 8510 return (NULL); 8511 } 8512 8513 /* 8514 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to 8515 * replace it with a dirrem/diradd pair as a result of re-parenting a 8516 * directory. This ensures that we don't simultaneously have a mkdir and 8517 * a diradd for the same .. entry. 8518 */ 8519 static struct jremref * 8520 cancel_mkdir_dotdot(ip, dirrem, jremref) 8521 struct inode *ip; 8522 struct dirrem *dirrem; 8523 struct jremref *jremref; 8524 { 8525 struct inodedep *inodedep; 8526 struct jaddref *jaddref; 8527 struct mkdir *mkdir; 8528 struct diradd *dap; 8529 8530 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 8531 &inodedep) == 0) 8532 panic("cancel_mkdir_dotdot: Lost inodedep"); 8533 dap = inodedep->id_mkdiradd; 8534 if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) 8535 return (jremref); 8536 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; 8537 mkdir = LIST_NEXT(mkdir, md_mkdirs)) 8538 if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) 8539 break; 8540 if (mkdir == NULL) 8541 panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); 8542 if ((jaddref = mkdir->md_jaddref) != NULL) { 8543 mkdir->md_jaddref = NULL; 8544 jaddref->ja_state &= ~MKDIR_PARENT; 8545 if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, 8546 &inodedep) == 0) 8547 panic("cancel_mkdir_dotdot: Lost parent inodedep"); 8548 if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { 8549 journal_jremref(dirrem, jremref, inodedep); 8550 jremref = NULL; 8551 } 8552 } 8553 if (mkdir->md_state & ONWORKLIST) 8554 WORKLIST_REMOVE(&mkdir->md_list); 8555 mkdir->md_state |= ALLCOMPLETE; 8556 complete_mkdir(mkdir); 8557 return (jremref); 8558 } 8559 8560 static void 8561 journal_jremref(dirrem, jremref, inodedep) 8562 struct dirrem *dirrem; 8563 struct jremref *jremref; 8564 struct inodedep *inodedep; 8565 { 8566 8567 if (inodedep == NULL) 8568 if (inodedep_lookup(jremref->jr_list.wk_mp, 8569 jremref->jr_ref.if_ino, 0, &inodedep) == 0) 8570 panic("journal_jremref: Lost inodedep"); 8571 LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); 8572 TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); 8573 add_to_journal(&jremref->jr_list); 8574 } 8575 8576 static void 8577 dirrem_journal(dirrem, jremref, dotremref, dotdotremref) 8578 struct dirrem *dirrem; 8579 struct jremref *jremref; 8580 struct jremref *dotremref; 8581 struct jremref *dotdotremref; 8582 { 8583 struct inodedep *inodedep; 8584 8585 8586 if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, 8587 &inodedep) == 0) 8588 panic("dirrem_journal: Lost inodedep"); 8589 journal_jremref(dirrem, jremref, inodedep); 8590 if (dotremref) 8591 journal_jremref(dirrem, dotremref, inodedep); 8592 if (dotdotremref) 8593 journal_jremref(dirrem, dotdotremref, NULL); 8594 } 
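/*
 * dirrem_journal() above records each remove reference on the dirrem and
 * queues it for the journal; the dotdot reference is passed with a NULL
 * inodedep so that journal_jremref() looks the inodedep up from the
 * reference's own inode number.
 */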
8595 8596 /* 8597 * Allocate a new dirrem if appropriate and return it along with 8598 * its associated pagedep. Called without a lock, returns with lock. 8599 */ 8600 static struct dirrem * 8601 newdirrem(bp, dp, ip, isrmdir, prevdirremp) 8602 struct buf *bp; /* buffer containing directory block */ 8603 struct inode *dp; /* inode for the directory being modified */ 8604 struct inode *ip; /* inode for directory entry being removed */ 8605 int isrmdir; /* indicates if doing RMDIR */ 8606 struct dirrem **prevdirremp; /* previously referenced inode, if any */ 8607 { 8608 int offset; 8609 ufs_lbn_t lbn; 8610 struct diradd *dap; 8611 struct dirrem *dirrem; 8612 struct pagedep *pagedep; 8613 struct jremref *jremref; 8614 struct jremref *dotremref; 8615 struct jremref *dotdotremref; 8616 struct vnode *dvp; 8617 8618 /* 8619 * Whiteouts have no deletion dependencies. 8620 */ 8621 if (ip == NULL) 8622 panic("newdirrem: whiteout"); 8623 dvp = ITOV(dp); 8624 /* 8625 * If we are over our limit, try to improve the situation. 8626 * Limiting the number of dirrem structures will also limit 8627 * the number of freefile and freeblks structures. 8628 */ 8629 ACQUIRE_LOCK(&lk); 8630 if (!(ip->i_flags & SF_SNAPSHOT) && 8631 dep_current[D_DIRREM] > max_softdeps / 2) 8632 (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); 8633 FREE_LOCK(&lk); 8634 dirrem = malloc(sizeof(struct dirrem), 8635 M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); 8636 workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); 8637 LIST_INIT(&dirrem->dm_jremrefhd); 8638 LIST_INIT(&dirrem->dm_jwork); 8639 dirrem->dm_state = isrmdir ? RMDIR : 0; 8640 dirrem->dm_oldinum = ip->i_number; 8641 *prevdirremp = NULL; 8642 /* 8643 * Allocate remove reference structures to track journal write 8644 * dependencies. We will always have one for the link and 8645 * when doing directories we will always have one more for dot. 8646 * When renaming a directory we skip the dotdot link change so 8647 * this is not needed. 8648 */ 8649 jremref = dotremref = dotdotremref = NULL; 8650 if (DOINGSUJ(dvp)) { 8651 if (isrmdir) { 8652 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8653 ip->i_effnlink + 2); 8654 dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 8655 ip->i_effnlink + 1); 8656 dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, 8657 dp->i_effnlink + 1); 8658 dotdotremref->jr_state |= MKDIR_PARENT; 8659 } else 8660 jremref = newjremref(dirrem, dp, ip, dp->i_offset, 8661 ip->i_effnlink + 1); 8662 } 8663 ACQUIRE_LOCK(&lk); 8664 lbn = lblkno(dp->i_fs, dp->i_offset); 8665 offset = blkoff(dp->i_fs, dp->i_offset); 8666 pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, 8667 &pagedep); 8668 dirrem->dm_pagedep = pagedep; 8669 dirrem->dm_offset = offset; 8670 /* 8671 * If we're renaming a .. link to a new directory, cancel any 8672 * existing MKDIR_PARENT mkdir. If it has already been canceled 8673 * the jremref is preserved for any potential diradd in this 8674 * location. This can not coincide with a rmdir. 8675 */ 8676 if (dp->i_offset == DOTDOT_OFFSET) { 8677 if (isrmdir) 8678 panic("newdirrem: .. directory change during remove?"); 8679 jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); 8680 } 8681 /* 8682 * If we're removing a directory search for the .. dependency now and 8683 * cancel it. Any pending journal work will be added to the dirrem 8684 * to be completed when the workitem remove completes. 
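 * cancel_diradd_dotdot() returns NULL when it finds and cancels such a
 * diradd, consuming the dotdot remove reference; otherwise the reference
 * is returned unchanged for the normal journaling below.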
8685 */ 8686 if (isrmdir) 8687 dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); 8688 /* 8689 * Check for a diradd dependency for the same directory entry. 8690 * If present, then both dependencies become obsolete and can 8691 * be de-allocated. 8692 */ 8693 dap = diradd_lookup(pagedep, offset); 8694 if (dap == NULL) { 8695 /* 8696 * Link the jremref structures into the dirrem so they are 8697 * written prior to the pagedep. 8698 */ 8699 if (jremref) 8700 dirrem_journal(dirrem, jremref, dotremref, 8701 dotdotremref); 8702 return (dirrem); 8703 } 8704 /* 8705 * Must be ATTACHED at this point. 8706 */ 8707 if ((dap->da_state & ATTACHED) == 0) 8708 panic("newdirrem: not ATTACHED"); 8709 if (dap->da_newinum != ip->i_number) 8710 panic("newdirrem: inum %d should be %d", 8711 ip->i_number, dap->da_newinum); 8712 /* 8713 * If we are deleting a changed name that never made it to disk, 8714 * then return the dirrem describing the previous inode (which 8715 * represents the inode currently referenced from this entry on disk). 8716 */ 8717 if ((dap->da_state & DIRCHG) != 0) { 8718 *prevdirremp = dap->da_previous; 8719 dap->da_state &= ~DIRCHG; 8720 dap->da_pagedep = pagedep; 8721 } 8722 /* 8723 * We are deleting an entry that never made it to disk. 8724 * Mark it COMPLETE so we can delete its inode immediately. 8725 */ 8726 dirrem->dm_state |= COMPLETE; 8727 cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); 8728 #ifdef SUJ_DEBUG 8729 if (isrmdir == 0) { 8730 struct worklist *wk; 8731 8732 LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) 8733 if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) 8734 panic("bad wk %p (0x%X)\n", wk, wk->wk_state); 8735 } 8736 #endif 8737 8738 return (dirrem); 8739 } 8740 8741 /* 8742 * Directory entry change dependencies. 8743 * 8744 * Changing an existing directory entry requires that an add operation 8745 * be completed first followed by a deletion. The semantics for the addition 8746 * are identical to the description of adding a new entry above except 8747 * that the rollback is to the old inode number rather than zero. Once 8748 * the addition dependency is completed, the removal is done as described 8749 * in the removal routine above. 8750 */ 8751 8752 /* 8753 * This routine should be called immediately after changing 8754 * a directory entry. The inode's link count should not be 8755 * decremented by the calling procedure -- the soft updates 8756 * code will perform this task when it is safe. 8757 */ 8758 void 8759 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) 8760 struct buf *bp; /* buffer containing directory block */ 8761 struct inode *dp; /* inode for the directory being modified */ 8762 struct inode *ip; /* inode for directory entry being removed */ 8763 ino_t newinum; /* new inode number for changed entry */ 8764 int isrmdir; /* indicates if doing RMDIR */ 8765 { 8766 int offset; 8767 struct diradd *dap = NULL; 8768 struct dirrem *dirrem, *prevdirrem; 8769 struct pagedep *pagedep; 8770 struct inodedep *inodedep; 8771 struct jaddref *jaddref; 8772 struct mount *mp; 8773 8774 offset = blkoff(dp->i_fs, dp->i_offset); 8775 mp = UFSTOVFS(dp->i_ump); 8776 8777 /* 8778 * Whiteouts do not need diradd dependencies. 
8779 */ 8780 if (newinum != WINO) { 8781 dap = malloc(sizeof(struct diradd), 8782 M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); 8783 workitem_alloc(&dap->da_list, D_DIRADD, mp); 8784 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; 8785 dap->da_offset = offset; 8786 dap->da_newinum = newinum; 8787 LIST_INIT(&dap->da_jwork); 8788 } 8789 8790 /* 8791 * Allocate a new dirrem and ACQUIRE_LOCK. 8792 */ 8793 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); 8794 pagedep = dirrem->dm_pagedep; 8795 /* 8796 * The possible values for isrmdir: 8797 * 0 - non-directory file rename 8798 * 1 - directory rename within same directory 8799 * inum - directory rename to new directory of given inode number 8800 * When renaming to a new directory, we are both deleting and 8801 * creating a new directory entry, so the link count on the new 8802 * directory should not change. Thus we do not need the followup 8803 * dirrem which is usually done in handle_workitem_remove. We set 8804 * the DIRCHG flag to tell handle_workitem_remove to skip the 8805 * followup dirrem. 8806 */ 8807 if (isrmdir > 1) 8808 dirrem->dm_state |= DIRCHG; 8809 8810 /* 8811 * Whiteouts have no additional dependencies, 8812 * so just put the dirrem on the correct list. 8813 */ 8814 if (newinum == WINO) { 8815 if ((dirrem->dm_state & COMPLETE) == 0) { 8816 LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, 8817 dm_next); 8818 } else { 8819 dirrem->dm_dirinum = pagedep->pd_ino; 8820 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8821 add_to_worklist(&dirrem->dm_list, 0); 8822 } 8823 FREE_LOCK(&lk); 8824 return; 8825 } 8826 /* 8827 * Add the dirrem to the inodedep's pending remove list for quick 8828 * discovery later. A valid nlinkdelta ensures that this lookup 8829 * will not fail. 8830 */ 8831 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 8832 panic("softdep_setup_directory_change: Lost inodedep."); 8833 dirrem->dm_state |= ONDEPLIST; 8834 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 8835 8836 /* 8837 * If the COMPLETE flag is clear, then there were no active 8838 * entries and we want to roll back to the previous inode until 8839 * the new inode is committed to disk. If the COMPLETE flag is 8840 * set, then we have deleted an entry that never made it to disk. 8841 * If the entry we deleted resulted from a name change, then the old 8842 * inode reference still resides on disk. Any rollback that we do 8843 * needs to be to that old inode (returned to us in prevdirrem). If 8844 * the entry we deleted resulted from a create, then there is 8845 * no entry on the disk, so we want to roll back to zero rather 8846 * than the uncommitted inode. In either of the COMPLETE cases we 8847 * want to immediately free the unwritten and unreferenced inode. 8848 */ 8849 if ((dirrem->dm_state & COMPLETE) == 0) { 8850 dap->da_previous = dirrem; 8851 } else { 8852 if (prevdirrem != NULL) { 8853 dap->da_previous = prevdirrem; 8854 } else { 8855 dap->da_state &= ~DIRCHG; 8856 dap->da_pagedep = pagedep; 8857 } 8858 dirrem->dm_dirinum = pagedep->pd_ino; 8859 if (LIST_EMPTY(&dirrem->dm_jremrefhd)) 8860 add_to_worklist(&dirrem->dm_list, 0); 8861 } 8862 /* 8863 * Look up the jaddref for this journal entry. We must finish 8864 * initializing it and make the diradd write dependent on it. 8865 * If we're not journaling, put it on the id_bufwait list if the inode 8866 * is not yet written. If it is written, do the post-inode write 8867 * processing to put it on the id_pendinghd list.
8868 */ 8869 inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); 8870 if (MOUNTEDSUJ(mp)) { 8871 jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, 8872 inoreflst); 8873 KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, 8874 ("softdep_setup_directory_change: bad jaddref %p", 8875 jaddref)); 8876 jaddref->ja_diroff = dp->i_offset; 8877 jaddref->ja_diradd = dap; 8878 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8879 dap, da_pdlist); 8880 add_to_journal(&jaddref->ja_list); 8881 } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { 8882 dap->da_state |= COMPLETE; 8883 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); 8884 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 8885 } else { 8886 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], 8887 dap, da_pdlist); 8888 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); 8889 } 8890 /* 8891 * If we're making a new name for a directory that has not been 8892 * committed we need to move the dot and dotdot references to 8893 * this new name. 8894 */ 8895 if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) 8896 merge_diradd(inodedep, dap); 8897 FREE_LOCK(&lk); 8898 } 8899 8900 /* 8901 * Called whenever the link count on an inode is changed. 8902 * It creates an inode dependency so that the new reference(s) 8903 * to the inode cannot be committed to disk until the updated 8904 * inode has been written. 8905 */ 8906 void 8907 softdep_change_linkcnt(ip) 8908 struct inode *ip; /* the inode with the increased link count */ 8909 { 8910 struct inodedep *inodedep; 8911 8912 ACQUIRE_LOCK(&lk); 8913 inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); 8914 if (ip->i_nlink < ip->i_effnlink) 8915 panic("softdep_change_linkcnt: bad delta"); 8916 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 8917 FREE_LOCK(&lk); 8918 } 8919 8920 /* 8921 * Attach a sbdep dependency to the superblock buf so that we can keep 8922 * track of the head of the linked list of referenced but unlinked inodes. 8923 */ 8924 void 8925 softdep_setup_sbupdate(ump, fs, bp) 8926 struct ufsmount *ump; 8927 struct fs *fs; 8928 struct buf *bp; 8929 { 8930 struct sbdep *sbdep; 8931 struct worklist *wk; 8932 8933 if (MOUNTEDSUJ(UFSTOVFS(ump)) == 0) 8934 return; 8935 LIST_FOREACH(wk, &bp->b_dep, wk_list) 8936 if (wk->wk_type == D_SBDEP) 8937 break; 8938 if (wk != NULL) 8939 return; 8940 sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); 8941 workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); 8942 sbdep->sb_fs = fs; 8943 sbdep->sb_ump = ump; 8944 ACQUIRE_LOCK(&lk); 8945 WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); 8946 FREE_LOCK(&lk); 8947 } 8948 8949 /* 8950 * Return the first unlinked inodedep which is ready to be the head of the 8951 * list. The inodedep and all those after it must have valid next pointers.
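 * The scan below walks backward from the tail of softdep_unlinked and
 * stops at the first inodedep whose predecessor does not have UNLINKNEXT
 * set; NULL is returned when even the tail has not been linked yet.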
8952 */ 8953 static struct inodedep * 8954 first_unlinked_inodedep(ump) 8955 struct ufsmount *ump; 8956 { 8957 struct inodedep *inodedep; 8958 struct inodedep *idp; 8959 8960 for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); 8961 inodedep; inodedep = idp) { 8962 if ((inodedep->id_state & UNLINKNEXT) == 0) 8963 return (NULL); 8964 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 8965 if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) 8966 break; 8967 if ((inodedep->id_state & UNLINKPREV) == 0) 8968 panic("first_unlinked_inodedep: prev != next"); 8969 } 8970 if (inodedep == NULL) 8971 return (NULL); 8972 8973 return (inodedep); 8974 } 8975 8976 /* 8977 * Set the sujfree unlinked head pointer prior to writing a superblock. 8978 */ 8979 static void 8980 initiate_write_sbdep(sbdep) 8981 struct sbdep *sbdep; 8982 { 8983 struct inodedep *inodedep; 8984 struct fs *bpfs; 8985 struct fs *fs; 8986 8987 bpfs = sbdep->sb_fs; 8988 fs = sbdep->sb_ump->um_fs; 8989 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 8990 if (inodedep) { 8991 fs->fs_sujfree = inodedep->id_ino; 8992 inodedep->id_state |= UNLINKPREV; 8993 } else 8994 fs->fs_sujfree = 0; 8995 bpfs->fs_sujfree = fs->fs_sujfree; 8996 } 8997 8998 /* 8999 * After a superblock is written determine whether it must be written again 9000 * due to a changing unlinked list head. 9001 */ 9002 static int 9003 handle_written_sbdep(sbdep, bp) 9004 struct sbdep *sbdep; 9005 struct buf *bp; 9006 { 9007 struct inodedep *inodedep; 9008 struct mount *mp; 9009 struct fs *fs; 9010 9011 fs = sbdep->sb_fs; 9012 mp = UFSTOVFS(sbdep->sb_ump); 9013 inodedep = first_unlinked_inodedep(sbdep->sb_ump); 9014 if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || 9015 (inodedep == NULL && fs->fs_sujfree != 0)) { 9016 bdirty(bp); 9017 return (1); 9018 } 9019 WORKITEM_FREE(sbdep, D_SBDEP); 9020 if (fs->fs_sujfree == 0) 9021 return (0); 9022 if (inodedep_lookup(mp, fs->fs_sujfree, 0, &inodedep) == 0) 9023 panic("handle_written_sbdep: lost inodedep"); 9024 /* 9025 * Now that we have a record of this inode in stable store allow it 9026 * to be written to free up pending work. Inodes may see a lot of 9027 * write activity after they are unlinked which we must not hold up. 9028 */ 9029 for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { 9030 if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) 9031 panic("handle_written_sbdep: Bad inodedep %p (0x%X)", 9032 inodedep, inodedep->id_state); 9033 if (inodedep->id_state & UNLINKONLIST) 9034 break; 9035 inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; 9036 } 9037 9038 return (0); 9039 } 9040 9041 /* 9042 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. 9043 */ 9044 static void 9045 unlinked_inodedep(mp, inodedep) 9046 struct mount *mp; 9047 struct inodedep *inodedep; 9048 { 9049 struct ufsmount *ump; 9050 9051 if (MOUNTEDSUJ(mp) == 0) 9052 return; 9053 ump = VFSTOUFS(mp); 9054 ump->um_fs->fs_fmod = 1; 9055 inodedep->id_state |= UNLINKED; 9056 TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); 9057 } 9058 9059 /* 9060 * Remove an inodedep from the unlinked inodedep list. This may require 9061 * disk writes if the inode has made it that far. 
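 *
 * Roughly (an illustrative summary, not normative), the on-disk pointer
 * that must be rewritten before the entry can be dropped is:
 *
 *	predecessor is the superblock:	fs_sujfree          -> next ino or 0
 *	predecessor is another inode:	pred's di_freelink  -> next ino or 0
 *
 * The buffer holding that pointer is locked first; if the list changed
 * while we slept for the buffer the loop below simply retries.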
9062 */ 9063 static void 9064 clear_unlinked_inodedep(inodedep) 9065 struct inodedep *inodedep; 9066 { 9067 struct ufsmount *ump; 9068 struct inodedep *idp; 9069 struct inodedep *idn; 9070 struct fs *fs; 9071 struct buf *bp; 9072 ino_t ino; 9073 ino_t nino; 9074 ino_t pino; 9075 int error; 9076 9077 ump = VFSTOUFS(inodedep->id_list.wk_mp); 9078 fs = ump->um_fs; 9079 ino = inodedep->id_ino; 9080 error = 0; 9081 for (;;) { 9082 /* 9083 * If nothing has yet been written simply remove us from 9084 * the in memory list and return. This is the most common 9085 * case where handle_workitem_remove() loses the final 9086 * reference. 9087 */ 9088 if ((inodedep->id_state & UNLINKLINKS) == 0) 9089 break; 9090 /* 9091 * If we have a NEXT pointer and no PREV pointer we can simply 9092 * clear NEXT's PREV and remove ourselves from the list. Be 9093 * careful not to clear PREV if the superblock points at 9094 * next as well. 9095 */ 9096 idn = TAILQ_NEXT(inodedep, id_unlinked); 9097 if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { 9098 if (idn && fs->fs_sujfree != idn->id_ino) 9099 idn->id_state &= ~UNLINKPREV; 9100 break; 9101 } 9102 /* 9103 * Here we have an inodedep which is actually linked into 9104 * the list. We must remove it by forcing a write to the 9105 * link before us, whether it be the superblock or an inode. 9106 * Unfortunately the list may change while we're waiting 9107 * on the buf lock for either resource so we must loop until 9108 * we lock the right one. If both the superblock and an 9109 * inode point to this inode we must clear the inode first 9110 * followed by the superblock. 9111 */ 9112 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9113 pino = 0; 9114 if (idp && (idp->id_state & UNLINKNEXT)) 9115 pino = idp->id_ino; 9116 FREE_LOCK(&lk); 9117 if (pino == 0) 9118 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9119 (int)fs->fs_sbsize, 0, 0, 0); 9120 else 9121 error = bread(ump->um_devvp, 9122 fsbtodb(fs, ino_to_fsba(fs, pino)), 9123 (int)fs->fs_bsize, NOCRED, &bp); 9124 ACQUIRE_LOCK(&lk); 9125 if (error) 9126 break; 9127 /* If the list has changed restart the loop. */ 9128 idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); 9129 nino = 0; 9130 if (idp && (idp->id_state & UNLINKNEXT)) 9131 nino = idp->id_ino; 9132 if (nino != pino || 9133 (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { 9134 FREE_LOCK(&lk); 9135 brelse(bp); 9136 ACQUIRE_LOCK(&lk); 9137 continue; 9138 } 9139 /* 9140 * Remove us from the in memory list. After this we cannot 9141 * access the inodedep. 9142 */ 9143 idn = TAILQ_NEXT(inodedep, id_unlinked); 9144 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9145 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9146 /* 9147 * Determine the next inode number. 9148 */ 9149 nino = 0; 9150 if (idn) { 9151 /* 9152 * If next isn't on the list we can just clear prev's 9153 * state and schedule it to be fixed later. No need 9154 * to synchronously write if we're not in the real 9155 * list. 9156 */ 9157 if ((idn->id_state & UNLINKPREV) == 0 && pino != 0) { 9158 idp->id_state &= ~UNLINKNEXT; 9159 if ((idp->id_state & ONWORKLIST) == 0) 9160 WORKLIST_INSERT(&bp->b_dep, 9161 &idp->id_list); 9162 FREE_LOCK(&lk); 9163 bawrite(bp); 9164 ACQUIRE_LOCK(&lk); 9165 return; 9166 } 9167 nino = idn->id_ino; 9168 } 9169 FREE_LOCK(&lk); 9170 /* 9171 * The predecessor's next pointer is manually updated here 9172 * so that the NEXT flag is never cleared for an element 9173 * that is in the list. 
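 *
 * Spelled out (illustration only), the splice performed just below is:
 *
 *	pino == 0:  copy *fs into the superblock buffer and attach an
 *	            sbdep so the new unlinked-list head is published in
 *	            fs_sujfree;
 *	pino != 0:  dinode(pino)->di_freelink = nino;
 *
 * followed by a synchronous bwrite() of whichever buffer holds the
 * pointer and, if necessary, one more superblock write so that
 * fs_sujfree no longer names the inode being removed.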
9174 */ 9175 if (pino == 0) { 9176 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9177 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9178 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9179 bp); 9180 } else if (fs->fs_magic == FS_UFS1_MAGIC) 9181 ((struct ufs1_dinode *)bp->b_data + 9182 ino_to_fsbo(fs, pino))->di_freelink = nino; 9183 else 9184 ((struct ufs2_dinode *)bp->b_data + 9185 ino_to_fsbo(fs, pino))->di_freelink = nino; 9186 /* 9187 * If the bwrite fails we have no recourse to recover. The 9188 * filesystem is corrupted already. 9189 */ 9190 bwrite(bp); 9191 ACQUIRE_LOCK(&lk); 9192 /* 9193 * If the superblock pointer still needs to be cleared force 9194 * a write here. 9195 */ 9196 if (fs->fs_sujfree == ino) { 9197 FREE_LOCK(&lk); 9198 bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), 9199 (int)fs->fs_sbsize, 0, 0, 0); 9200 bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); 9201 ffs_oldfscompat_write((struct fs *)bp->b_data, ump); 9202 softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, 9203 bp); 9204 bwrite(bp); 9205 ACQUIRE_LOCK(&lk); 9206 } 9207 if (fs->fs_sujfree != ino) 9208 return; 9209 panic("clear_unlinked_inodedep: Failed to clear free head"); 9210 } 9211 if (inodedep->id_ino == fs->fs_sujfree) 9212 panic("clear_unlinked_inodedep: Freeing head of free list"); 9213 inodedep->id_state &= ~(UNLINKED | UNLINKLINKS); 9214 TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); 9215 return; 9216 } 9217 9218 /* 9219 * This workitem decrements the inode's link count. 9220 * If the link count reaches zero, the file is removed. 9221 */ 9222 static int 9223 handle_workitem_remove(dirrem, flags) 9224 struct dirrem *dirrem; 9225 int flags; 9226 { 9227 struct inodedep *inodedep; 9228 struct workhead dotdotwk; 9229 struct worklist *wk; 9230 struct ufsmount *ump; 9231 struct mount *mp; 9232 struct vnode *vp; 9233 struct inode *ip; 9234 ino_t oldinum; 9235 9236 if (dirrem->dm_state & ONWORKLIST) 9237 panic("handle_workitem_remove: dirrem %p still on worklist", 9238 dirrem); 9239 oldinum = dirrem->dm_oldinum; 9240 mp = dirrem->dm_list.wk_mp; 9241 ump = VFSTOUFS(mp); 9242 flags |= LK_EXCLUSIVE; 9243 if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) 9244 return (EBUSY); 9245 ip = VTOI(vp); 9246 ACQUIRE_LOCK(&lk); 9247 if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) 9248 panic("handle_workitem_remove: lost inodedep"); 9249 if (dirrem->dm_state & ONDEPLIST) 9250 LIST_REMOVE(dirrem, dm_inonext); 9251 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 9252 ("handle_workitem_remove: Journal entries not written.")); 9253 9254 /* 9255 * Move all dependencies waiting on the remove to complete 9256 * from the dirrem to the inode inowait list to be completed 9257 * after the inode has been updated and written to disk. Any 9258 * marked MKDIR_PARENT are saved to be completed when the .. ref 9259 * is removed. 9260 */ 9261 LIST_INIT(&dotdotwk); 9262 while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { 9263 WORKLIST_REMOVE(wk); 9264 if (wk->wk_state & MKDIR_PARENT) { 9265 wk->wk_state &= ~MKDIR_PARENT; 9266 WORKLIST_INSERT(&dotdotwk, wk); 9267 continue; 9268 } 9269 WORKLIST_INSERT(&inodedep->id_inowait, wk); 9270 } 9271 LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); 9272 /* 9273 * Normal file deletion. 
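 *
 * As a quick summary of the link-count arithmetic in this branch and the
 * directory branch below (an illustrative note, not part of the original
 * comment):
 *
 *	plain file:  ip->i_nlink -= 1;
 *	directory:   ip->i_nlink -= 2;	(the entry itself plus ".")
 *	             and the parent later loses one link for ".." via a
 *	             follow-up dirrem, unless DIRCHG marks this as a
 *	             rename into a new parent.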
9274 */ 9275 if ((dirrem->dm_state & RMDIR) == 0) { 9276 ip->i_nlink--; 9277 DIP_SET(ip, i_nlink, ip->i_nlink); 9278 ip->i_flag |= IN_CHANGE; 9279 if (ip->i_nlink < ip->i_effnlink) 9280 panic("handle_workitem_remove: bad file delta"); 9281 if (ip->i_nlink == 0) 9282 unlinked_inodedep(mp, inodedep); 9283 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9284 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9285 ("handle_workitem_remove: worklist not empty. %s", 9286 TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); 9287 WORKITEM_FREE(dirrem, D_DIRREM); 9288 FREE_LOCK(&lk); 9289 goto out; 9290 } 9291 /* 9292 * Directory deletion. Decrement reference count for both the 9293 * just deleted parent directory entry and the reference for ".". 9294 * Arrange to have the reference count on the parent decremented 9295 * to account for the loss of "..". 9296 */ 9297 ip->i_nlink -= 2; 9298 DIP_SET(ip, i_nlink, ip->i_nlink); 9299 ip->i_flag |= IN_CHANGE; 9300 if (ip->i_nlink < ip->i_effnlink) 9301 panic("handle_workitem_remove: bad dir delta"); 9302 if (ip->i_nlink == 0) 9303 unlinked_inodedep(mp, inodedep); 9304 inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; 9305 /* 9306 * Rename a directory to a new parent. Since, we are both deleting 9307 * and creating a new directory entry, the link count on the new 9308 * directory should not change. Thus we skip the followup dirrem. 9309 */ 9310 if (dirrem->dm_state & DIRCHG) { 9311 KASSERT(LIST_EMPTY(&dirrem->dm_jwork), 9312 ("handle_workitem_remove: DIRCHG and worklist not empty.")); 9313 WORKITEM_FREE(dirrem, D_DIRREM); 9314 FREE_LOCK(&lk); 9315 goto out; 9316 } 9317 dirrem->dm_state = ONDEPLIST; 9318 dirrem->dm_oldinum = dirrem->dm_dirinum; 9319 /* 9320 * Place the dirrem on the parent's diremhd list. 9321 */ 9322 if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) 9323 panic("handle_workitem_remove: lost dir inodedep"); 9324 LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); 9325 /* 9326 * If the allocated inode has never been written to disk, then 9327 * the on-disk inode is zero'ed and we can remove the file 9328 * immediately. When journaling if the inode has been marked 9329 * unlinked and not DEPCOMPLETE we know it can never be written. 9330 */ 9331 inodedep_lookup(mp, oldinum, 0, &inodedep); 9332 if (inodedep == NULL || 9333 (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || 9334 check_inode_unwritten(inodedep)) { 9335 FREE_LOCK(&lk); 9336 vput(vp); 9337 return handle_workitem_remove(dirrem, flags); 9338 } 9339 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); 9340 FREE_LOCK(&lk); 9341 ip->i_flag |= IN_CHANGE; 9342 out: 9343 ffs_update(vp, 0); 9344 vput(vp); 9345 return (0); 9346 } 9347 9348 /* 9349 * Inode de-allocation dependencies. 9350 * 9351 * When an inode's link count is reduced to zero, it can be de-allocated. We 9352 * found it convenient to postpone de-allocation until after the inode is 9353 * written to disk with its new link count (zero). At this point, all of the 9354 * on-disk inode's block pointers are nullified and, with careful dependency 9355 * list ordering, all dependencies related to the inode will be satisfied and 9356 * the corresponding dependency structures de-allocated. So, if/when the 9357 * inode is reused, there will be no mixing of old dependencies with new 9358 * ones. This artificial dependency is set up by the block de-allocation 9359 * procedure above (softdep_setup_freeblocks) and completed by the 9360 * following procedure. 
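 *
 * For orientation (an illustrative sketch of the call flow, not part of
 * the original comment):
 *
 *	softdep_setup_freeblocks()	arranges the artificial dependency
 *	  ... inode reaches disk with a zero link count ...
 *	handle_workitem_freefile()	runs off the worklist and calls
 *	  ffs_freefile()		which clears the cg inode-map bit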
9361 */ 9362 static void 9363 handle_workitem_freefile(freefile) 9364 struct freefile *freefile; 9365 { 9366 struct workhead wkhd; 9367 struct fs *fs; 9368 struct inodedep *idp; 9369 struct ufsmount *ump; 9370 int error; 9371 9372 ump = VFSTOUFS(freefile->fx_list.wk_mp); 9373 fs = ump->um_fs; 9374 #ifdef DEBUG 9375 ACQUIRE_LOCK(&lk); 9376 error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); 9377 FREE_LOCK(&lk); 9378 if (error) 9379 panic("handle_workitem_freefile: inodedep %p survived", idp); 9380 #endif 9381 UFS_LOCK(ump); 9382 fs->fs_pendinginodes -= 1; 9383 UFS_UNLOCK(ump); 9384 LIST_INIT(&wkhd); 9385 LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); 9386 if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, 9387 freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) 9388 softdep_error("handle_workitem_freefile", error); 9389 ACQUIRE_LOCK(&lk); 9390 WORKITEM_FREE(freefile, D_FREEFILE); 9391 FREE_LOCK(&lk); 9392 } 9393 9394 9395 /* 9396 * Helper function which unlinks marker element from work list and returns 9397 * the next element on the list. 9398 */ 9399 static __inline struct worklist * 9400 markernext(struct worklist *marker) 9401 { 9402 struct worklist *next; 9403 9404 next = LIST_NEXT(marker, wk_list); 9405 LIST_REMOVE(marker, wk_list); 9406 return next; 9407 } 9408 9409 /* 9410 * Disk writes. 9411 * 9412 * The dependency structures constructed above are most actively used when file 9413 * system blocks are written to disk. No constraints are placed on when a 9414 * block can be written, but unsatisfied update dependencies are made safe by 9415 * modifying (or replacing) the source memory for the duration of the disk 9416 * write. When the disk write completes, the memory block is again brought 9417 * up-to-date. 9418 * 9419 * In-core inode structure reclamation. 9420 * 9421 * Because there are a finite number of "in-core" inode structures, they are 9422 * reused regularly. By transferring all inode-related dependencies to the 9423 * in-memory inode block and indexing them separately (via "inodedep"s), we 9424 * can allow "in-core" inode structures to be reused at any time and avoid 9425 * any increase in contention. 9426 * 9427 * Called just before entering the device driver to initiate a new disk I/O. 9428 * The buffer must be locked, thus, no I/O completion operations can occur 9429 * while we are manipulating its associated dependencies. 9430 */ 9431 static void 9432 softdep_disk_io_initiation(bp) 9433 struct buf *bp; /* structure describing disk write to occur */ 9434 { 9435 struct worklist *wk; 9436 struct worklist marker; 9437 struct inodedep *inodedep; 9438 struct freeblks *freeblks; 9439 struct jblkdep *jblkdep; 9440 struct newblk *newblk; 9441 9442 /* 9443 * We only care about write operations. There should never 9444 * be dependencies for reads. 9445 */ 9446 if (bp->b_iocmd != BIO_WRITE) 9447 panic("softdep_disk_io_initiation: not write"); 9448 9449 if (bp->b_vflags & BV_BKGRDINPROG) 9450 panic("softdep_disk_io_initiation: Writing buffer with " 9451 "background write in progress: %p", bp); 9452 9453 marker.wk_type = D_LAST + 1; /* Not a normal workitem */ 9454 PHOLD(curproc); /* Don't swap out kernel stack */ 9455 9456 ACQUIRE_LOCK(&lk); 9457 /* 9458 * Do any necessary pre-I/O processing. 
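 *
 * The walk below uses an on-stack marker so that lk can be dropped by
 * jwait() without losing our place in b_dep. In sketch form (for
 * illustration only):
 *
 *	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 *	     wk = markernext(&marker)) {
 *		LIST_INSERT_AFTER(wk, &marker, wk_list);
 *		... dispatch on wk->wk_type ...
 *	}
 *
 * Cases that may sleep move the marker in front of wk first so the same
 * item is revisited after the wait.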
9459 */ 9460 for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; 9461 wk = markernext(&marker)) { 9462 LIST_INSERT_AFTER(wk, &marker, wk_list); 9463 switch (wk->wk_type) { 9464 9465 case D_PAGEDEP: 9466 initiate_write_filepage(WK_PAGEDEP(wk), bp); 9467 continue; 9468 9469 case D_INODEDEP: 9470 inodedep = WK_INODEDEP(wk); 9471 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) 9472 initiate_write_inodeblock_ufs1(inodedep, bp); 9473 else 9474 initiate_write_inodeblock_ufs2(inodedep, bp); 9475 continue; 9476 9477 case D_INDIRDEP: 9478 initiate_write_indirdep(WK_INDIRDEP(wk), bp); 9479 continue; 9480 9481 case D_BMSAFEMAP: 9482 initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); 9483 continue; 9484 9485 case D_JSEG: 9486 WK_JSEG(wk)->js_buf = NULL; 9487 continue; 9488 9489 case D_FREEBLKS: 9490 freeblks = WK_FREEBLKS(wk); 9491 jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); 9492 /* 9493 * We have to wait for the freeblks to be journaled 9494 * before we can write an inodeblock with updated 9495 * pointers. Be careful to arrange the marker so 9496 * we revisit the freeblks if it's not removed by 9497 * the first jwait(). 9498 */ 9499 if (jblkdep != NULL) { 9500 LIST_REMOVE(&marker, wk_list); 9501 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9502 jwait(&jblkdep->jb_list, MNT_WAIT); 9503 } 9504 continue; 9505 case D_ALLOCDIRECT: 9506 case D_ALLOCINDIR: 9507 /* 9508 * We have to wait for the jnewblk to be journaled 9509 * before we can write to a block if the contents 9510 * may be confused with an earlier file's indirect 9511 * at recovery time. Handle the marker as described 9512 * above. 9513 */ 9514 newblk = WK_NEWBLK(wk); 9515 if (newblk->nb_jnewblk != NULL && 9516 indirblk_lookup(newblk->nb_list.wk_mp, 9517 newblk->nb_newblkno)) { 9518 LIST_REMOVE(&marker, wk_list); 9519 LIST_INSERT_BEFORE(wk, &marker, wk_list); 9520 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 9521 } 9522 continue; 9523 9524 case D_SBDEP: 9525 initiate_write_sbdep(WK_SBDEP(wk)); 9526 continue; 9527 9528 case D_MKDIR: 9529 case D_FREEWORK: 9530 case D_FREEDEP: 9531 case D_JSEGDEP: 9532 continue; 9533 9534 default: 9535 panic("handle_disk_io_initiation: Unexpected type %s", 9536 TYPENAME(wk->wk_type)); 9537 /* NOTREACHED */ 9538 } 9539 } 9540 FREE_LOCK(&lk); 9541 PRELE(curproc); /* Allow swapout of kernel stack */ 9542 } 9543 9544 /* 9545 * Called from within the procedure above to deal with unsatisfied 9546 * allocation dependencies in a directory. The buffer must be locked, 9547 * thus, no I/O completion operations can occur while we are 9548 * manipulating its associated dependencies. 9549 */ 9550 static void 9551 initiate_write_filepage(pagedep, bp) 9552 struct pagedep *pagedep; 9553 struct buf *bp; 9554 { 9555 struct jremref *jremref; 9556 struct jmvref *jmvref; 9557 struct dirrem *dirrem; 9558 struct diradd *dap; 9559 struct direct *ep; 9560 int i; 9561 9562 if (pagedep->pd_state & IOSTARTED) { 9563 /* 9564 * This can only happen if there is a driver that does not 9565 * understand chaining. Here biodone will reissue the call 9566 * to strategy for the incomplete buffers. 9567 */ 9568 printf("initiate_write_filepage: already started\n"); 9569 return; 9570 } 9571 pagedep->pd_state |= IOSTARTED; 9572 /* 9573 * Wait for all journal remove dependencies to hit the disk. 9574 * We can not allow any potentially conflicting directory adds 9575 * to be visible before removes and rollback is too difficult. 9576 * lk may be dropped and re-acquired, however we hold the buf 9577 * locked so the dependency can not go away. 
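 *
 * The rollback applied to each not-yet-safe directory add in the loop
 * below is, in sketch form (illustration only):
 *
 *	ep = (struct direct *)((char *)bp->b_data + dap->da_offset);
 *	ep->d_ino = (dap->da_state & DIRCHG) ?
 *	    dap->da_previous->dm_oldinum : 0;
 *	dap->da_state: ATTACHED -> UNDONE
 *
 * so the on-disk page never names an inode that might not be written yet.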
9578 */ 9579 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) 9580 while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) 9581 jwait(&jremref->jr_list, MNT_WAIT); 9582 while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) 9583 jwait(&jmvref->jm_list, MNT_WAIT); 9584 for (i = 0; i < DAHASHSZ; i++) { 9585 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 9586 ep = (struct direct *) 9587 ((char *)bp->b_data + dap->da_offset); 9588 if (ep->d_ino != dap->da_newinum) 9589 panic("%s: dir inum %d != new %d", 9590 "initiate_write_filepage", 9591 ep->d_ino, dap->da_newinum); 9592 if (dap->da_state & DIRCHG) 9593 ep->d_ino = dap->da_previous->dm_oldinum; 9594 else 9595 ep->d_ino = 0; 9596 dap->da_state &= ~ATTACHED; 9597 dap->da_state |= UNDONE; 9598 } 9599 } 9600 } 9601 9602 /* 9603 * Version of initiate_write_inodeblock that handles UFS1 dinodes. 9604 * Note that any bug fixes made to this routine must be done in the 9605 * version found below. 9606 * 9607 * Called from within the procedure above to deal with unsatisfied 9608 * allocation dependencies in an inodeblock. The buffer must be 9609 * locked, thus, no I/O completion operations can occur while we 9610 * are manipulating its associated dependencies. 9611 */ 9612 static void 9613 initiate_write_inodeblock_ufs1(inodedep, bp) 9614 struct inodedep *inodedep; 9615 struct buf *bp; /* The inode block */ 9616 { 9617 struct allocdirect *adp, *lastadp; 9618 struct ufs1_dinode *dp; 9619 struct ufs1_dinode *sip; 9620 struct inoref *inoref; 9621 struct fs *fs; 9622 ufs_lbn_t i; 9623 #ifdef INVARIANTS 9624 ufs_lbn_t prevlbn = 0; 9625 #endif 9626 int deplist; 9627 9628 if (inodedep->id_state & IOSTARTED) 9629 panic("initiate_write_inodeblock_ufs1: already started"); 9630 inodedep->id_state |= IOSTARTED; 9631 fs = inodedep->id_fs; 9632 dp = (struct ufs1_dinode *)bp->b_data + 9633 ino_to_fsbo(fs, inodedep->id_ino); 9634 9635 /* 9636 * If we're on the unlinked list but have not yet written our 9637 * next pointer initialize it here. 9638 */ 9639 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9640 struct inodedep *inon; 9641 9642 inon = TAILQ_NEXT(inodedep, id_unlinked); 9643 dp->di_freelink = inon ? inon->id_ino : 0; 9644 } 9645 /* 9646 * If the bitmap is not yet written, then the allocated 9647 * inode cannot be written to disk. 9648 */ 9649 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9650 if (inodedep->id_savedino1 != NULL) 9651 panic("initiate_write_inodeblock_ufs1: I/O underway"); 9652 FREE_LOCK(&lk); 9653 sip = malloc(sizeof(struct ufs1_dinode), 9654 M_SAVEDINO, M_SOFTDEP_FLAGS); 9655 ACQUIRE_LOCK(&lk); 9656 inodedep->id_savedino1 = sip; 9657 *inodedep->id_savedino1 = *dp; 9658 bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); 9659 dp->di_gen = inodedep->id_savedino1->di_gen; 9660 dp->di_freelink = inodedep->id_savedino1->di_freelink; 9661 return; 9662 } 9663 /* 9664 * If no dependencies, then there is nothing to roll back. 9665 */ 9666 inodedep->id_savedsize = dp->di_size; 9667 inodedep->id_savedextsize = 0; 9668 inodedep->id_savednlink = dp->di_nlink; 9669 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9670 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9671 return; 9672 /* 9673 * Revert the link count to that of the first unwritten journal entry. 9674 */ 9675 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9676 if (inoref) 9677 dp->di_nlink = inoref->if_nlink; 9678 /* 9679 * Set the dependencies to busy. 
9680 */ 9681 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9682 adp = TAILQ_NEXT(adp, ad_next)) { 9683 #ifdef INVARIANTS 9684 if (deplist != 0 && prevlbn >= adp->ad_offset) 9685 panic("softdep_write_inodeblock: lbn order"); 9686 prevlbn = adp->ad_offset; 9687 if (adp->ad_offset < NDADDR && 9688 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9689 panic("%s: direct pointer #%jd mismatch %d != %jd", 9690 "softdep_write_inodeblock", 9691 (intmax_t)adp->ad_offset, 9692 dp->di_db[adp->ad_offset], 9693 (intmax_t)adp->ad_newblkno); 9694 if (adp->ad_offset >= NDADDR && 9695 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9696 panic("%s: indirect pointer #%jd mismatch %d != %jd", 9697 "softdep_write_inodeblock", 9698 (intmax_t)adp->ad_offset - NDADDR, 9699 dp->di_ib[adp->ad_offset - NDADDR], 9700 (intmax_t)adp->ad_newblkno); 9701 deplist |= 1 << adp->ad_offset; 9702 if ((adp->ad_state & ATTACHED) == 0) 9703 panic("softdep_write_inodeblock: Unknown state 0x%x", 9704 adp->ad_state); 9705 #endif /* INVARIANTS */ 9706 adp->ad_state &= ~ATTACHED; 9707 adp->ad_state |= UNDONE; 9708 } 9709 /* 9710 * The on-disk inode cannot claim to be any larger than the last 9711 * fragment that has been written. Otherwise, the on-disk inode 9712 * might have fragments that were not the last block in the file 9713 * which would corrupt the filesystem. 9714 */ 9715 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9716 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9717 if (adp->ad_offset >= NDADDR) 9718 break; 9719 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9720 /* keep going until hitting a rollback to a frag */ 9721 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9722 continue; 9723 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9724 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9725 #ifdef INVARIANTS 9726 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9727 panic("softdep_write_inodeblock: lost dep1"); 9728 #endif /* INVARIANTS */ 9729 dp->di_db[i] = 0; 9730 } 9731 for (i = 0; i < NIADDR; i++) { 9732 #ifdef INVARIANTS 9733 if (dp->di_ib[i] != 0 && 9734 (deplist & ((1 << NDADDR) << i)) == 0) 9735 panic("softdep_write_inodeblock: lost dep2"); 9736 #endif /* INVARIANTS */ 9737 dp->di_ib[i] = 0; 9738 } 9739 return; 9740 } 9741 /* 9742 * If we have zero'ed out the last allocated block of the file, 9743 * roll back the size to the last currently allocated block. 9744 * We know that this last allocated block is a full-sized as 9745 * we already checked for fragments in the loop above. 9746 */ 9747 if (lastadp != NULL && 9748 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9749 for (i = lastadp->ad_offset; i >= 0; i--) 9750 if (dp->di_db[i] != 0) 9751 break; 9752 dp->di_size = (i + 1) * fs->fs_bsize; 9753 } 9754 /* 9755 * The only dependencies are for indirect blocks. 9756 * 9757 * The file size for indirect block additions is not guaranteed. 9758 * Such a guarantee would be non-trivial to achieve. The conventional 9759 * synchronous write implementation also does not make this guarantee. 9760 * Fsck should catch and fix discrepancies. Arguably, the file size 9761 * can be over-estimated without destroying integrity when the file 9762 * moves into the indirect blocks (i.e., is large). If we want to 9763 * postpone fsck, we are stuck with this argument. 
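 *
 * A worked example of the size rollback above (hypothetical numbers):
 * with fs_bsize = 16384, an allocdirect at ad_offset 3 whose ad_oldsize
 * is a 4096-byte fragment rolls the on-disk size back to
 *
 *	di_size = 16384 * 3 + 4096 = 53248
 *
 * and all later direct and indirect pointers are zeroed. Once the file
 * has grown into the indirect range, only the pointers are cleared below;
 * the size may then over-state the rolled-back contents, which is the
 * benign discrepancy fsck is left to repair.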
9764 */ 9765 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 9766 dp->di_ib[adp->ad_offset - NDADDR] = 0; 9767 } 9768 9769 /* 9770 * Version of initiate_write_inodeblock that handles UFS2 dinodes. 9771 * Note that any bug fixes made to this routine must be done in the 9772 * version found above. 9773 * 9774 * Called from within the procedure above to deal with unsatisfied 9775 * allocation dependencies in an inodeblock. The buffer must be 9776 * locked, thus, no I/O completion operations can occur while we 9777 * are manipulating its associated dependencies. 9778 */ 9779 static void 9780 initiate_write_inodeblock_ufs2(inodedep, bp) 9781 struct inodedep *inodedep; 9782 struct buf *bp; /* The inode block */ 9783 { 9784 struct allocdirect *adp, *lastadp; 9785 struct ufs2_dinode *dp; 9786 struct ufs2_dinode *sip; 9787 struct inoref *inoref; 9788 struct fs *fs; 9789 ufs_lbn_t i; 9790 #ifdef INVARIANTS 9791 ufs_lbn_t prevlbn = 0; 9792 #endif 9793 int deplist; 9794 9795 if (inodedep->id_state & IOSTARTED) 9796 panic("initiate_write_inodeblock_ufs2: already started"); 9797 inodedep->id_state |= IOSTARTED; 9798 fs = inodedep->id_fs; 9799 dp = (struct ufs2_dinode *)bp->b_data + 9800 ino_to_fsbo(fs, inodedep->id_ino); 9801 9802 /* 9803 * If we're on the unlinked list but have not yet written our 9804 * next pointer initialize it here. 9805 */ 9806 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 9807 struct inodedep *inon; 9808 9809 inon = TAILQ_NEXT(inodedep, id_unlinked); 9810 dp->di_freelink = inon ? inon->id_ino : 0; 9811 } 9812 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == 9813 (UNLINKED | UNLINKNEXT)) { 9814 struct inodedep *inon; 9815 ino_t freelink; 9816 9817 inon = TAILQ_NEXT(inodedep, id_unlinked); 9818 freelink = inon ? inon->id_ino : 0; 9819 if (freelink != dp->di_freelink) 9820 panic("ino %p(0x%X) %d, %d != %d", 9821 inodedep, inodedep->id_state, inodedep->id_ino, 9822 freelink, dp->di_freelink); 9823 } 9824 /* 9825 * If the bitmap is not yet written, then the allocated 9826 * inode cannot be written to disk. 9827 */ 9828 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 9829 if (inodedep->id_savedino2 != NULL) 9830 panic("initiate_write_inodeblock_ufs2: I/O underway"); 9831 FREE_LOCK(&lk); 9832 sip = malloc(sizeof(struct ufs2_dinode), 9833 M_SAVEDINO, M_SOFTDEP_FLAGS); 9834 ACQUIRE_LOCK(&lk); 9835 inodedep->id_savedino2 = sip; 9836 *inodedep->id_savedino2 = *dp; 9837 bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); 9838 dp->di_gen = inodedep->id_savedino2->di_gen; 9839 dp->di_freelink = inodedep->id_savedino2->di_freelink; 9840 return; 9841 } 9842 /* 9843 * If no dependencies, then there is nothing to roll back. 9844 */ 9845 inodedep->id_savedsize = dp->di_size; 9846 inodedep->id_savedextsize = dp->di_extsize; 9847 inodedep->id_savednlink = dp->di_nlink; 9848 if (TAILQ_EMPTY(&inodedep->id_inoupdt) && 9849 TAILQ_EMPTY(&inodedep->id_extupdt) && 9850 TAILQ_EMPTY(&inodedep->id_inoreflst)) 9851 return; 9852 /* 9853 * Revert the link count to that of the first unwritten journal entry. 9854 */ 9855 inoref = TAILQ_FIRST(&inodedep->id_inoreflst); 9856 if (inoref) 9857 dp->di_nlink = inoref->if_nlink; 9858 9859 /* 9860 * Set the ext data dependencies to busy. 
9861 */ 9862 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9863 adp = TAILQ_NEXT(adp, ad_next)) { 9864 #ifdef INVARIANTS 9865 if (deplist != 0 && prevlbn >= adp->ad_offset) 9866 panic("softdep_write_inodeblock: lbn order"); 9867 prevlbn = adp->ad_offset; 9868 if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) 9869 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9870 "softdep_write_inodeblock", 9871 (intmax_t)adp->ad_offset, 9872 (intmax_t)dp->di_extb[adp->ad_offset], 9873 (intmax_t)adp->ad_newblkno); 9874 deplist |= 1 << adp->ad_offset; 9875 if ((adp->ad_state & ATTACHED) == 0) 9876 panic("softdep_write_inodeblock: Unknown state 0x%x", 9877 adp->ad_state); 9878 #endif /* INVARIANTS */ 9879 adp->ad_state &= ~ATTACHED; 9880 adp->ad_state |= UNDONE; 9881 } 9882 /* 9883 * The on-disk inode cannot claim to be any larger than the last 9884 * fragment that has been written. Otherwise, the on-disk inode 9885 * might have fragments that were not the last block in the ext 9886 * data which would corrupt the filesystem. 9887 */ 9888 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; 9889 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9890 dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; 9891 /* keep going until hitting a rollback to a frag */ 9892 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9893 continue; 9894 dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9895 for (i = adp->ad_offset + 1; i < NXADDR; i++) { 9896 #ifdef INVARIANTS 9897 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) 9898 panic("softdep_write_inodeblock: lost dep1"); 9899 #endif /* INVARIANTS */ 9900 dp->di_extb[i] = 0; 9901 } 9902 lastadp = NULL; 9903 break; 9904 } 9905 /* 9906 * If we have zero'ed out the last allocated block of the ext 9907 * data, roll back the size to the last currently allocated block. 9908 * We know that this last allocated block is a full-sized as 9909 * we already checked for fragments in the loop above. 9910 */ 9911 if (lastadp != NULL && 9912 dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9913 for (i = lastadp->ad_offset; i >= 0; i--) 9914 if (dp->di_extb[i] != 0) 9915 break; 9916 dp->di_extsize = (i + 1) * fs->fs_bsize; 9917 } 9918 /* 9919 * Set the file data dependencies to busy. 
9920 */ 9921 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9922 adp = TAILQ_NEXT(adp, ad_next)) { 9923 #ifdef INVARIANTS 9924 if (deplist != 0 && prevlbn >= adp->ad_offset) 9925 panic("softdep_write_inodeblock: lbn order"); 9926 if ((adp->ad_state & ATTACHED) == 0) 9927 panic("inodedep %p and adp %p not attached", inodedep, adp); 9928 prevlbn = adp->ad_offset; 9929 if (adp->ad_offset < NDADDR && 9930 dp->di_db[adp->ad_offset] != adp->ad_newblkno) 9931 panic("%s: direct pointer #%jd mismatch %jd != %jd", 9932 "softdep_write_inodeblock", 9933 (intmax_t)adp->ad_offset, 9934 (intmax_t)dp->di_db[adp->ad_offset], 9935 (intmax_t)adp->ad_newblkno); 9936 if (adp->ad_offset >= NDADDR && 9937 dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) 9938 panic("%s indirect pointer #%jd mismatch %jd != %jd", 9939 "softdep_write_inodeblock:", 9940 (intmax_t)adp->ad_offset - NDADDR, 9941 (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], 9942 (intmax_t)adp->ad_newblkno); 9943 deplist |= 1 << adp->ad_offset; 9944 if ((adp->ad_state & ATTACHED) == 0) 9945 panic("softdep_write_inodeblock: Unknown state 0x%x", 9946 adp->ad_state); 9947 #endif /* INVARIANTS */ 9948 adp->ad_state &= ~ATTACHED; 9949 adp->ad_state |= UNDONE; 9950 } 9951 /* 9952 * The on-disk inode cannot claim to be any larger than the last 9953 * fragment that has been written. Otherwise, the on-disk inode 9954 * might have fragments that were not the last block in the file 9955 * which would corrupt the filesystem. 9956 */ 9957 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; 9958 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { 9959 if (adp->ad_offset >= NDADDR) 9960 break; 9961 dp->di_db[adp->ad_offset] = adp->ad_oldblkno; 9962 /* keep going until hitting a rollback to a frag */ 9963 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) 9964 continue; 9965 dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; 9966 for (i = adp->ad_offset + 1; i < NDADDR; i++) { 9967 #ifdef INVARIANTS 9968 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) 9969 panic("softdep_write_inodeblock: lost dep2"); 9970 #endif /* INVARIANTS */ 9971 dp->di_db[i] = 0; 9972 } 9973 for (i = 0; i < NIADDR; i++) { 9974 #ifdef INVARIANTS 9975 if (dp->di_ib[i] != 0 && 9976 (deplist & ((1 << NDADDR) << i)) == 0) 9977 panic("softdep_write_inodeblock: lost dep3"); 9978 #endif /* INVARIANTS */ 9979 dp->di_ib[i] = 0; 9980 } 9981 return; 9982 } 9983 /* 9984 * If we have zero'ed out the last allocated block of the file, 9985 * roll back the size to the last currently allocated block. 9986 * We know that this last allocated block is a full-sized as 9987 * we already checked for fragments in the loop above. 9988 */ 9989 if (lastadp != NULL && 9990 dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { 9991 for (i = lastadp->ad_offset; i >= 0; i--) 9992 if (dp->di_db[i] != 0) 9993 break; 9994 dp->di_size = (i + 1) * fs->fs_bsize; 9995 } 9996 /* 9997 * The only dependencies are for indirect blocks. 9998 * 9999 * The file size for indirect block additions is not guaranteed. 10000 * Such a guarantee would be non-trivial to achieve. The conventional 10001 * synchronous write implementation also does not make this guarantee. 10002 * Fsck should catch and fix discrepancies. Arguably, the file size 10003 * can be over-estimated without destroying integrity when the file 10004 * moves into the indirect blocks (i.e., is large). If we want to 10005 * postpone fsck, we are stuck with this argument. 
10006 */ 10007 for (; adp; adp = TAILQ_NEXT(adp, ad_next)) 10008 dp->di_ib[adp->ad_offset - NDADDR] = 0; 10009 } 10010 10011 /* 10012 * Cancel an indirdep as a result of truncation. Release all of the 10013 * children allocindirs and place their journal work on the appropriate 10014 * list. 10015 */ 10016 static void 10017 cancel_indirdep(indirdep, bp, freeblks) 10018 struct indirdep *indirdep; 10019 struct buf *bp; 10020 struct freeblks *freeblks; 10021 { 10022 struct allocindir *aip; 10023 10024 /* 10025 * None of the indirect pointers will ever be visible, 10026 * so they can simply be tossed. GOINGAWAY ensures 10027 * that allocated pointers will be saved in the buffer 10028 * cache until they are freed. Note that they will 10029 * only be able to be found by their physical address 10030 * since the inode mapping the logical address will 10031 * be gone. The save buffer used for the safe copy 10032 * was allocated in setup_allocindir_phase2 using 10033 * the physical address so it could be used for this 10034 * purpose. Hence we swap the safe copy with the real 10035 * copy, allowing the safe copy to be freed and holding 10036 * on to the real copy for later use in indir_trunc. 10037 */ 10038 if (indirdep->ir_state & GOINGAWAY) 10039 panic("cancel_indirdep: already gone"); 10040 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 10041 indirdep->ir_state |= DEPCOMPLETE; 10042 LIST_REMOVE(indirdep, ir_next); 10043 } 10044 indirdep->ir_state |= GOINGAWAY; 10045 VFSTOUFS(indirdep->ir_list.wk_mp)->um_numindirdeps += 1; 10046 /* 10047 * Pass in bp for blocks still have journal writes 10048 * pending so we can cancel them on their own. 10049 */ 10050 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) 10051 cancel_allocindir(aip, bp, freeblks, 0); 10052 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) 10053 cancel_allocindir(aip, NULL, freeblks, 0); 10054 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) 10055 cancel_allocindir(aip, NULL, freeblks, 0); 10056 while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) 10057 cancel_allocindir(aip, NULL, freeblks, 0); 10058 /* 10059 * If there are pending partial truncations we need to keep the 10060 * old block copy around until they complete. This is because 10061 * the current b_data is not a perfect superset of the available 10062 * blocks. 10063 */ 10064 if (TAILQ_EMPTY(&indirdep->ir_trunc)) 10065 bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); 10066 else 10067 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10068 WORKLIST_REMOVE(&indirdep->ir_list); 10069 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); 10070 indirdep->ir_bp = NULL; 10071 indirdep->ir_freeblks = freeblks; 10072 } 10073 10074 /* 10075 * Free an indirdep once it no longer has new pointers to track. 
10076 */ 10077 static void 10078 free_indirdep(indirdep) 10079 struct indirdep *indirdep; 10080 { 10081 10082 KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), 10083 ("free_indirdep: Indir trunc list not empty.")); 10084 KASSERT(LIST_EMPTY(&indirdep->ir_completehd), 10085 ("free_indirdep: Complete head not empty.")); 10086 KASSERT(LIST_EMPTY(&indirdep->ir_writehd), 10087 ("free_indirdep: write head not empty.")); 10088 KASSERT(LIST_EMPTY(&indirdep->ir_donehd), 10089 ("free_indirdep: done head not empty.")); 10090 KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), 10091 ("free_indirdep: deplist head not empty.")); 10092 KASSERT((indirdep->ir_state & DEPCOMPLETE), 10093 ("free_indirdep: %p still on newblk list.", indirdep)); 10094 KASSERT(indirdep->ir_saveddata == NULL, 10095 ("free_indirdep: %p still has saved data.", indirdep)); 10096 if (indirdep->ir_state & ONWORKLIST) 10097 WORKLIST_REMOVE(&indirdep->ir_list); 10098 WORKITEM_FREE(indirdep, D_INDIRDEP); 10099 } 10100 10101 /* 10102 * Called before a write to an indirdep. This routine is responsible for 10103 * rolling back pointers to a safe state which includes only those 10104 * allocindirs which have been completed. 10105 */ 10106 static void 10107 initiate_write_indirdep(indirdep, bp) 10108 struct indirdep *indirdep; 10109 struct buf *bp; 10110 { 10111 10112 indirdep->ir_state |= IOSTARTED; 10113 if (indirdep->ir_state & GOINGAWAY) 10114 panic("disk_io_initiation: indirdep gone"); 10115 /* 10116 * If there are no remaining dependencies, this will be writing 10117 * the real pointers. 10118 */ 10119 if (LIST_EMPTY(&indirdep->ir_deplisthd) && 10120 TAILQ_EMPTY(&indirdep->ir_trunc)) 10121 return; 10122 /* 10123 * Replace up-to-date version with safe version. 10124 */ 10125 if (indirdep->ir_saveddata == NULL) { 10126 FREE_LOCK(&lk); 10127 indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, 10128 M_SOFTDEP_FLAGS); 10129 ACQUIRE_LOCK(&lk); 10130 } 10131 indirdep->ir_state &= ~ATTACHED; 10132 indirdep->ir_state |= UNDONE; 10133 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); 10134 bcopy(indirdep->ir_savebp->b_data, bp->b_data, 10135 bp->b_bcount); 10136 } 10137 10138 /* 10139 * Called when an inode has been cleared in a cg bitmap. This finally 10140 * eliminates any canceled jaddrefs 10141 */ 10142 void 10143 softdep_setup_inofree(mp, bp, ino, wkhd) 10144 struct mount *mp; 10145 struct buf *bp; 10146 ino_t ino; 10147 struct workhead *wkhd; 10148 { 10149 struct worklist *wk, *wkn; 10150 struct inodedep *inodedep; 10151 uint8_t *inosused; 10152 struct cg *cgp; 10153 struct fs *fs; 10154 10155 ACQUIRE_LOCK(&lk); 10156 fs = VFSTOUFS(mp)->um_fs; 10157 cgp = (struct cg *)bp->b_data; 10158 inosused = cg_inosused(cgp); 10159 if (isset(inosused, ino % fs->fs_ipg)) 10160 panic("softdep_setup_inofree: inode %d not freed.", ino); 10161 if (inodedep_lookup(mp, ino, 0, &inodedep)) 10162 panic("softdep_setup_inofree: ino %d has existing inodedep %p", 10163 ino, inodedep); 10164 if (wkhd) { 10165 LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { 10166 if (wk->wk_type != D_JADDREF) 10167 continue; 10168 WORKLIST_REMOVE(wk); 10169 /* 10170 * We can free immediately even if the jaddref 10171 * isn't attached in a background write as now 10172 * the bitmaps are reconciled. 10173 */ 10174 wk->wk_state |= COMPLETE | ATTACHED; 10175 free_jaddref(WK_JADDREF(wk)); 10176 } 10177 jwork_move(&bp->b_dep, wkhd); 10178 } 10179 FREE_LOCK(&lk); 10180 } 10181 10182 10183 /* 10184 * Called via ffs_blkfree() after a set of frags has been cleared from a cg 10185 * map. 
Any dependencies waiting for the write to clear are added to the 10186 * buf's list and any jnewblks that are being canceled are discarded 10187 * immediately. 10188 */ 10189 void 10190 softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) 10191 struct mount *mp; 10192 struct buf *bp; 10193 ufs2_daddr_t blkno; 10194 int frags; 10195 struct workhead *wkhd; 10196 { 10197 struct bmsafemap *bmsafemap; 10198 struct jnewblk *jnewblk; 10199 struct worklist *wk; 10200 struct fs *fs; 10201 #ifdef SUJ_DEBUG 10202 uint8_t *blksfree; 10203 struct cg *cgp; 10204 ufs2_daddr_t jstart; 10205 ufs2_daddr_t jend; 10206 ufs2_daddr_t end; 10207 long bno; 10208 int i; 10209 #endif 10210 10211 ACQUIRE_LOCK(&lk); 10212 /* Lookup the bmsafemap so we track when it is dirty. */ 10213 fs = VFSTOUFS(mp)->um_fs; 10214 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10215 /* 10216 * Detach any jnewblks which have been canceled. They must linger 10217 * until the bitmap is cleared again by ffs_blkfree() to prevent 10218 * an unjournaled allocation from hitting the disk. 10219 */ 10220 if (wkhd) { 10221 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10222 WORKLIST_REMOVE(wk); 10223 if (wk->wk_type != D_JNEWBLK) { 10224 WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); 10225 continue; 10226 } 10227 jnewblk = WK_JNEWBLK(wk); 10228 KASSERT(jnewblk->jn_state & GOINGAWAY, 10229 ("softdep_setup_blkfree: jnewblk not canceled.")); 10230 #ifdef SUJ_DEBUG 10231 /* 10232 * Assert that this block is free in the bitmap 10233 * before we discard the jnewblk. 10234 */ 10235 cgp = (struct cg *)bp->b_data; 10236 blksfree = cg_blksfree(cgp); 10237 bno = dtogd(fs, jnewblk->jn_blkno); 10238 for (i = jnewblk->jn_oldfrags; 10239 i < jnewblk->jn_frags; i++) { 10240 if (isset(blksfree, bno + i)) 10241 continue; 10242 panic("softdep_setup_blkfree: not free"); 10243 } 10244 #endif 10245 /* 10246 * Even if it's not attached we can free immediately 10247 * as the new bitmap is correct. 10248 */ 10249 wk->wk_state |= COMPLETE | ATTACHED; 10250 free_jnewblk(jnewblk); 10251 } 10252 } 10253 10254 #ifdef SUJ_DEBUG 10255 /* 10256 * Assert that we are not freeing a block which has an outstanding 10257 * allocation dependency. 10258 */ 10259 fs = VFSTOUFS(mp)->um_fs; 10260 bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno)); 10261 end = blkno + frags; 10262 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10263 /* 10264 * Don't match against blocks that will be freed when the 10265 * background write is done. 10266 */ 10267 if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == 10268 (COMPLETE | DEPCOMPLETE)) 10269 continue; 10270 jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; 10271 jend = jnewblk->jn_blkno + jnewblk->jn_frags; 10272 if ((blkno >= jstart && blkno < jend) || 10273 (end > jstart && end <= jend)) { 10274 printf("state 0x%X %jd - %d %d dep %p\n", 10275 jnewblk->jn_state, jnewblk->jn_blkno, 10276 jnewblk->jn_oldfrags, jnewblk->jn_frags, 10277 jnewblk->jn_dep); 10278 panic("softdep_setup_blkfree: " 10279 "%jd-%jd(%d) overlaps with %jd-%jd", 10280 blkno, end, frags, jstart, jend); 10281 } 10282 } 10283 #endif 10284 FREE_LOCK(&lk); 10285 } 10286 10287 /* 10288 * Revert a block allocation when the journal record that describes it 10289 * is not yet written. 
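 *
 * Illustrative note on the interface (not part of the original comment):
 * the return value is the number of fragments actually rolled back, so
 * the caller can write
 *
 *	if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
 *		continue;	the copy was fixed up and marked UNDONE
 *
 * while a zero return means this copy of the bitmap already shows the
 * block free (for example a stale background-write copy) and nothing
 * needed to be undone.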
10290 */ 10291 int 10292 jnewblk_rollback(jnewblk, fs, cgp, blksfree) 10293 struct jnewblk *jnewblk; 10294 struct fs *fs; 10295 struct cg *cgp; 10296 uint8_t *blksfree; 10297 { 10298 ufs1_daddr_t fragno; 10299 long cgbno, bbase; 10300 int frags, blk; 10301 int i; 10302 10303 frags = 0; 10304 cgbno = dtogd(fs, jnewblk->jn_blkno); 10305 /* 10306 * We have to test which frags need to be rolled back. We may 10307 * be operating on a stale copy when doing background writes. 10308 */ 10309 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) 10310 if (isclr(blksfree, cgbno + i)) 10311 frags++; 10312 if (frags == 0) 10313 return (0); 10314 /* 10315 * This is mostly ffs_blkfree() sans some validation and 10316 * superblock updates. 10317 */ 10318 if (frags == fs->fs_frag) { 10319 fragno = fragstoblks(fs, cgbno); 10320 ffs_setblock(fs, blksfree, fragno); 10321 ffs_clusteracct(fs, cgp, fragno, 1); 10322 cgp->cg_cs.cs_nbfree++; 10323 } else { 10324 cgbno += jnewblk->jn_oldfrags; 10325 bbase = cgbno - fragnum(fs, cgbno); 10326 /* Decrement the old frags. */ 10327 blk = blkmap(fs, blksfree, bbase); 10328 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 10329 /* Deallocate the fragment */ 10330 for (i = 0; i < frags; i++) 10331 setbit(blksfree, cgbno + i); 10332 cgp->cg_cs.cs_nffree += frags; 10333 /* Add back in counts associated with the new frags */ 10334 blk = blkmap(fs, blksfree, bbase); 10335 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 10336 /* If a complete block has been reassembled, account for it. */ 10337 fragno = fragstoblks(fs, bbase); 10338 if (ffs_isblock(fs, blksfree, fragno)) { 10339 cgp->cg_cs.cs_nffree -= fs->fs_frag; 10340 ffs_clusteracct(fs, cgp, fragno, 1); 10341 cgp->cg_cs.cs_nbfree++; 10342 } 10343 } 10344 stat_jnewblk++; 10345 jnewblk->jn_state &= ~ATTACHED; 10346 jnewblk->jn_state |= UNDONE; 10347 10348 return (frags); 10349 } 10350 10351 static void 10352 initiate_write_bmsafemap(bmsafemap, bp) 10353 struct bmsafemap *bmsafemap; 10354 struct buf *bp; /* The cg block. */ 10355 { 10356 struct jaddref *jaddref; 10357 struct jnewblk *jnewblk; 10358 uint8_t *inosused; 10359 uint8_t *blksfree; 10360 struct cg *cgp; 10361 struct fs *fs; 10362 ino_t ino; 10363 10364 if (bmsafemap->sm_state & IOSTARTED) 10365 panic("initiate_write_bmsafemap: Already started\n"); 10366 bmsafemap->sm_state |= IOSTARTED; 10367 /* 10368 * Clear any inode allocations which are pending journal writes. 10369 */ 10370 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { 10371 cgp = (struct cg *)bp->b_data; 10372 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10373 inosused = cg_inosused(cgp); 10374 LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { 10375 ino = jaddref->ja_ino % fs->fs_ipg; 10376 /* 10377 * If this is a background copy the inode may not 10378 * be marked used yet. 10379 */ 10380 if (isset(inosused, ino)) { 10381 if ((jaddref->ja_mode & IFMT) == IFDIR) 10382 cgp->cg_cs.cs_ndir--; 10383 cgp->cg_cs.cs_nifree++; 10384 clrbit(inosused, ino); 10385 jaddref->ja_state &= ~ATTACHED; 10386 jaddref->ja_state |= UNDONE; 10387 stat_jaddref++; 10388 } else if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10389 panic("initiate_write_bmsafemap: inode %d " 10390 "marked free", jaddref->ja_ino); 10391 } 10392 } 10393 /* 10394 * Clear any block allocations which are pending journal writes. 
10395 */ 10396 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 10397 cgp = (struct cg *)bp->b_data; 10398 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 10399 blksfree = cg_blksfree(cgp); 10400 LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { 10401 if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) 10402 continue; 10403 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) 10404 panic("initiate_write_bmsafemap: block %jd " 10405 "marked free", jnewblk->jn_blkno); 10406 } 10407 } 10408 /* 10409 * Move allocation lists to the written lists so they can be 10410 * cleared once the block write is complete. 10411 */ 10412 LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, 10413 inodedep, id_deps); 10414 LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, 10415 newblk, nb_deps); 10416 LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, 10417 wk_list); 10418 } 10419 10420 /* 10421 * This routine is called during the completion interrupt 10422 * service routine for a disk write (from the procedure called 10423 * by the device driver to inform the filesystem caches of 10424 * a request completion). It should be called early in this 10425 * procedure, before the block is made available to other 10426 * processes or other routines are called. 10427 * 10428 */ 10429 static void 10430 softdep_disk_write_complete(bp) 10431 struct buf *bp; /* describes the completed disk write */ 10432 { 10433 struct worklist *wk; 10434 struct worklist *owk; 10435 struct workhead reattach; 10436 struct freeblks *freeblks; 10437 struct buf *sbp; 10438 10439 /* 10440 * If an error occurred while doing the write, then the data 10441 * has not hit the disk and the dependencies cannot be unrolled. 10442 */ 10443 if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) 10444 return; 10445 LIST_INIT(&reattach); 10446 /* 10447 * This lock must not be released anywhere in this code segment. 
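 *
 * In outline (an illustrative sketch only), the completion pass is:
 *
 *	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 *		WORKLIST_REMOVE(wk);
 *		switch (wk->wk_type) { ... }
 *	}
 *
 * Handlers that had to leave a rollback in place return non-zero; those
 * items are collected on the local "reattach" list and hooked back onto
 * b_dep so the buffer will be written again with the remaining
 * dependencies.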
10448 */ 10449 sbp = NULL; 10450 owk = NULL; 10451 ACQUIRE_LOCK(&lk); 10452 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { 10453 WORKLIST_REMOVE(wk); 10454 dep_write[wk->wk_type]++; 10455 if (wk == owk) 10456 panic("duplicate worklist: %p\n", wk); 10457 owk = wk; 10458 switch (wk->wk_type) { 10459 10460 case D_PAGEDEP: 10461 if (handle_written_filepage(WK_PAGEDEP(wk), bp)) 10462 WORKLIST_INSERT(&reattach, wk); 10463 continue; 10464 10465 case D_INODEDEP: 10466 if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) 10467 WORKLIST_INSERT(&reattach, wk); 10468 continue; 10469 10470 case D_BMSAFEMAP: 10471 if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) 10472 WORKLIST_INSERT(&reattach, wk); 10473 continue; 10474 10475 case D_MKDIR: 10476 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); 10477 continue; 10478 10479 case D_ALLOCDIRECT: 10480 wk->wk_state |= COMPLETE; 10481 handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); 10482 continue; 10483 10484 case D_ALLOCINDIR: 10485 wk->wk_state |= COMPLETE; 10486 handle_allocindir_partdone(WK_ALLOCINDIR(wk)); 10487 continue; 10488 10489 case D_INDIRDEP: 10490 if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) 10491 WORKLIST_INSERT(&reattach, wk); 10492 continue; 10493 10494 case D_FREEBLKS: 10495 wk->wk_state |= COMPLETE; 10496 freeblks = WK_FREEBLKS(wk); 10497 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && 10498 LIST_EMPTY(&freeblks->fb_jblkdephd)) 10499 add_to_worklist(wk, WK_NODELAY); 10500 continue; 10501 10502 case D_FREEWORK: 10503 handle_written_freework(WK_FREEWORK(wk)); 10504 break; 10505 10506 case D_JSEGDEP: 10507 free_jsegdep(WK_JSEGDEP(wk)); 10508 continue; 10509 10510 case D_JSEG: 10511 handle_written_jseg(WK_JSEG(wk), bp); 10512 continue; 10513 10514 case D_SBDEP: 10515 if (handle_written_sbdep(WK_SBDEP(wk), bp)) 10516 WORKLIST_INSERT(&reattach, wk); 10517 continue; 10518 10519 case D_FREEDEP: 10520 free_freedep(WK_FREEDEP(wk)); 10521 continue; 10522 10523 default: 10524 panic("handle_disk_write_complete: Unknown type %s", 10525 TYPENAME(wk->wk_type)); 10526 /* NOTREACHED */ 10527 } 10528 } 10529 /* 10530 * Reattach any requests that must be redone. 10531 */ 10532 while ((wk = LIST_FIRST(&reattach)) != NULL) { 10533 WORKLIST_REMOVE(wk); 10534 WORKLIST_INSERT(&bp->b_dep, wk); 10535 } 10536 FREE_LOCK(&lk); 10537 if (sbp) 10538 brelse(sbp); 10539 } 10540 10541 /* 10542 * Called from within softdep_disk_write_complete above. Note that 10543 * this routine is always called from interrupt level with further 10544 * splbio interrupts blocked. 10545 */ 10546 static void 10547 handle_allocdirect_partdone(adp, wkhd) 10548 struct allocdirect *adp; /* the completed allocdirect */ 10549 struct workhead *wkhd; /* Work to do when inode is writtne. */ 10550 { 10551 struct allocdirectlst *listhead; 10552 struct allocdirect *listadp; 10553 struct inodedep *inodedep; 10554 long bsize; 10555 10556 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10557 return; 10558 /* 10559 * The on-disk inode cannot claim to be any larger than the last 10560 * fragment that has been written. Otherwise, the on-disk inode 10561 * might have fragments that were not the last block in the file 10562 * which would corrupt the filesystem. Thus, we cannot free any 10563 * allocdirects after one whose ad_oldblkno claims a fragment as 10564 * these blocks must be rolled back to zero before writing the inode. 10565 * We check the currently active set of allocdirects in id_inoupdt 10566 * or id_extupdt as appropriate. 
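 *
 * Schematically (illustration only), with id_inoupdt kept in ascending
 * lbn order:
 *
 *	[ adp0 ] -> [ adp1, old frag ] -> [ adp2 == adp ] -> ...
 *
 * the scan below returns at adp1 without freeing adp2: the fragment in
 * front of it must be resolved first, since the on-disk size may not yet
 * cover adp2's block.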
10567 */ 10568 inodedep = adp->ad_inodedep; 10569 bsize = inodedep->id_fs->fs_bsize; 10570 if (adp->ad_state & EXTDATA) 10571 listhead = &inodedep->id_extupdt; 10572 else 10573 listhead = &inodedep->id_inoupdt; 10574 TAILQ_FOREACH(listadp, listhead, ad_next) { 10575 /* found our block */ 10576 if (listadp == adp) 10577 break; 10578 /* continue if ad_oldlbn is not a fragment */ 10579 if (listadp->ad_oldsize == 0 || 10580 listadp->ad_oldsize == bsize) 10581 continue; 10582 /* hit a fragment */ 10583 return; 10584 } 10585 /* 10586 * If we have reached the end of the current list without 10587 * finding the just finished dependency, then it must be 10588 * on the future dependency list. Future dependencies cannot 10589 * be freed until they are moved to the current list. 10590 */ 10591 if (listadp == NULL) { 10592 #ifdef DEBUG 10593 if (adp->ad_state & EXTDATA) 10594 listhead = &inodedep->id_newextupdt; 10595 else 10596 listhead = &inodedep->id_newinoupdt; 10597 TAILQ_FOREACH(listadp, listhead, ad_next) 10598 /* found our block */ 10599 if (listadp == adp) 10600 break; 10601 if (listadp == NULL) 10602 panic("handle_allocdirect_partdone: lost dep"); 10603 #endif /* DEBUG */ 10604 return; 10605 } 10606 /* 10607 * If we have found the just finished dependency, then queue 10608 * it along with anything that follows it that is complete. 10609 * Since the pointer has not yet been written in the inode 10610 * as the dependency prevents it, place the allocdirect on the 10611 * bufwait list where it will be freed once the pointer is 10612 * valid. 10613 */ 10614 if (wkhd == NULL) 10615 wkhd = &inodedep->id_bufwait; 10616 for (; adp; adp = listadp) { 10617 listadp = TAILQ_NEXT(adp, ad_next); 10618 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) 10619 return; 10620 TAILQ_REMOVE(listhead, adp, ad_next); 10621 WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); 10622 } 10623 } 10624 10625 /* 10626 * Called from within softdep_disk_write_complete above. This routine 10627 * completes successfully written allocindirs. 10628 */ 10629 static void 10630 handle_allocindir_partdone(aip) 10631 struct allocindir *aip; /* the completed allocindir */ 10632 { 10633 struct indirdep *indirdep; 10634 10635 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) 10636 return; 10637 indirdep = aip->ai_indirdep; 10638 LIST_REMOVE(aip, ai_next); 10639 /* 10640 * Don't set a pointer while the buffer is undergoing IO or while 10641 * we have active truncations. 10642 */ 10643 if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { 10644 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); 10645 return; 10646 } 10647 if (indirdep->ir_state & UFS1FMT) 10648 ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10649 aip->ai_newblkno; 10650 else 10651 ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = 10652 aip->ai_newblkno; 10653 /* 10654 * Await the pointer write before freeing the allocindir. 10655 */ 10656 LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); 10657 } 10658 10659 /* 10660 * Release segments held on a jwork list. 
10661 */ 10662 static void 10663 handle_jwork(wkhd) 10664 struct workhead *wkhd; 10665 { 10666 struct worklist *wk; 10667 10668 while ((wk = LIST_FIRST(wkhd)) != NULL) { 10669 WORKLIST_REMOVE(wk); 10670 switch (wk->wk_type) { 10671 case D_JSEGDEP: 10672 free_jsegdep(WK_JSEGDEP(wk)); 10673 continue; 10674 case D_FREEDEP: 10675 free_freedep(WK_FREEDEP(wk)); 10676 continue; 10677 case D_FREEFRAG: 10678 rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); 10679 WORKITEM_FREE(wk, D_FREEFRAG); 10680 case D_FREEWORK: 10681 handle_written_freework(WK_FREEWORK(wk)); 10682 continue; 10683 default: 10684 panic("handle_jwork: Unknown type %s\n", 10685 TYPENAME(wk->wk_type)); 10686 } 10687 } 10688 } 10689 10690 /* 10691 * Handle the bufwait list on an inode when it is safe to release items 10692 * held there. This normally happens after an inode block is written but 10693 * may be delayed and handled later if there are pending journal items that 10694 * are not yet safe to be released. 10695 */ 10696 static struct freefile * 10697 handle_bufwait(inodedep, refhd) 10698 struct inodedep *inodedep; 10699 struct workhead *refhd; 10700 { 10701 struct jaddref *jaddref; 10702 struct freefile *freefile; 10703 struct worklist *wk; 10704 10705 freefile = NULL; 10706 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { 10707 WORKLIST_REMOVE(wk); 10708 switch (wk->wk_type) { 10709 case D_FREEFILE: 10710 /* 10711 * We defer adding freefile to the worklist 10712 * until all other additions have been made to 10713 * ensure that it will be done after all the 10714 * old blocks have been freed. 10715 */ 10716 if (freefile != NULL) 10717 panic("handle_bufwait: freefile"); 10718 freefile = WK_FREEFILE(wk); 10719 continue; 10720 10721 case D_MKDIR: 10722 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); 10723 continue; 10724 10725 case D_DIRADD: 10726 diradd_inode_written(WK_DIRADD(wk), inodedep); 10727 continue; 10728 10729 case D_FREEFRAG: 10730 wk->wk_state |= COMPLETE; 10731 if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) 10732 add_to_worklist(wk, 0); 10733 continue; 10734 10735 case D_DIRREM: 10736 wk->wk_state |= COMPLETE; 10737 add_to_worklist(wk, 0); 10738 continue; 10739 10740 case D_ALLOCDIRECT: 10741 case D_ALLOCINDIR: 10742 free_newblk(WK_NEWBLK(wk)); 10743 continue; 10744 10745 case D_JNEWBLK: 10746 wk->wk_state |= COMPLETE; 10747 free_jnewblk(WK_JNEWBLK(wk)); 10748 continue; 10749 10750 /* 10751 * Save freed journal segments and add references on 10752 * the supplied list which will delay their release 10753 * until the cg bitmap is cleared on disk. 10754 */ 10755 case D_JSEGDEP: 10756 if (refhd == NULL) 10757 free_jsegdep(WK_JSEGDEP(wk)); 10758 else 10759 WORKLIST_INSERT(refhd, wk); 10760 continue; 10761 10762 case D_JADDREF: 10763 jaddref = WK_JADDREF(wk); 10764 TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, 10765 if_deps); 10766 /* 10767 * Transfer any jaddrefs to the list to be freed with 10768 * the bitmap if we're handling a removed file. 10769 */ 10770 if (refhd == NULL) { 10771 wk->wk_state |= COMPLETE; 10772 free_jaddref(jaddref); 10773 } else 10774 WORKLIST_INSERT(refhd, wk); 10775 continue; 10776 10777 default: 10778 panic("handle_bufwait: Unknown type %p(%s)", 10779 wk, TYPENAME(wk->wk_type)); 10780 /* NOTREACHED */ 10781 } 10782 } 10783 return (freefile); 10784 } 10785 /* 10786 * Called from within softdep_disk_write_complete above to restore 10787 * in-memory inode block contents to their most up-to-date state. 
Note 10788 * that this routine is always called from interrupt level with further 10789 * splbio interrupts blocked. 10790 */ 10791 static int 10792 handle_written_inodeblock(inodedep, bp) 10793 struct inodedep *inodedep; 10794 struct buf *bp; /* buffer containing the inode block */ 10795 { 10796 struct freefile *freefile; 10797 struct allocdirect *adp, *nextadp; 10798 struct ufs1_dinode *dp1 = NULL; 10799 struct ufs2_dinode *dp2 = NULL; 10800 struct workhead wkhd; 10801 int hadchanges, fstype; 10802 ino_t freelink; 10803 10804 LIST_INIT(&wkhd); 10805 hadchanges = 0; 10806 freefile = NULL; 10807 if ((inodedep->id_state & IOSTARTED) == 0) 10808 panic("handle_written_inodeblock: not started"); 10809 inodedep->id_state &= ~IOSTARTED; 10810 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { 10811 fstype = UFS1; 10812 dp1 = (struct ufs1_dinode *)bp->b_data + 10813 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10814 freelink = dp1->di_freelink; 10815 } else { 10816 fstype = UFS2; 10817 dp2 = (struct ufs2_dinode *)bp->b_data + 10818 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); 10819 freelink = dp2->di_freelink; 10820 } 10821 /* 10822 * If we wrote a valid freelink pointer during the last write 10823 * record it here. 10824 */ 10825 if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { 10826 struct inodedep *inon; 10827 10828 inon = TAILQ_NEXT(inodedep, id_unlinked); 10829 if ((inon == NULL && freelink == 0) || 10830 (inon && inon->id_ino == freelink)) { 10831 if (inon) 10832 inon->id_state |= UNLINKPREV; 10833 inodedep->id_state |= UNLINKNEXT; 10834 } else 10835 hadchanges = 1; 10836 } 10837 /* Leave this inodeblock dirty until it's in the list. */ 10838 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) 10839 hadchanges = 1; 10840 /* 10841 * If we had to rollback the inode allocation because of 10842 * bitmaps being incomplete, then simply restore it. 10843 * Keep the block dirty so that it will not be reclaimed until 10844 * all associated dependencies have been cleared and the 10845 * corresponding updates written to disk. 10846 */ 10847 if (inodedep->id_savedino1 != NULL) { 10848 hadchanges = 1; 10849 if (fstype == UFS1) 10850 *dp1 = *inodedep->id_savedino1; 10851 else 10852 *dp2 = *inodedep->id_savedino2; 10853 free(inodedep->id_savedino1, M_SAVEDINO); 10854 inodedep->id_savedino1 = NULL; 10855 if ((bp->b_flags & B_DELWRI) == 0) 10856 stat_inode_bitmap++; 10857 bdirty(bp); 10858 /* 10859 * If the inode is clear here and GOINGAWAY it will never 10860 * be written. Process the bufwait and clear any pending 10861 * work which may include the freefile. 10862 */ 10863 if (inodedep->id_state & GOINGAWAY) 10864 goto bufwait; 10865 return (1); 10866 } 10867 inodedep->id_state |= COMPLETE; 10868 /* 10869 * Roll forward anything that had to be rolled back before 10870 * the inode could be updated. 
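 * Each such allocdirect has its new block number copied into the direct or
 * indirect slot of the on-disk inode image and is marked ATTACHED.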
10871 */ 10872 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { 10873 nextadp = TAILQ_NEXT(adp, ad_next); 10874 if (adp->ad_state & ATTACHED) 10875 panic("handle_written_inodeblock: new entry"); 10876 if (fstype == UFS1) { 10877 if (adp->ad_offset < NDADDR) { 10878 if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10879 panic("%s %s #%jd mismatch %d != %jd", 10880 "handle_written_inodeblock:", 10881 "direct pointer", 10882 (intmax_t)adp->ad_offset, 10883 dp1->di_db[adp->ad_offset], 10884 (intmax_t)adp->ad_oldblkno); 10885 dp1->di_db[adp->ad_offset] = adp->ad_newblkno; 10886 } else { 10887 if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) 10888 panic("%s: %s #%jd allocated as %d", 10889 "handle_written_inodeblock", 10890 "indirect pointer", 10891 (intmax_t)adp->ad_offset - NDADDR, 10892 dp1->di_ib[adp->ad_offset - NDADDR]); 10893 dp1->di_ib[adp->ad_offset - NDADDR] = 10894 adp->ad_newblkno; 10895 } 10896 } else { 10897 if (adp->ad_offset < NDADDR) { 10898 if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) 10899 panic("%s: %s #%jd %s %jd != %jd", 10900 "handle_written_inodeblock", 10901 "direct pointer", 10902 (intmax_t)adp->ad_offset, "mismatch", 10903 (intmax_t)dp2->di_db[adp->ad_offset], 10904 (intmax_t)adp->ad_oldblkno); 10905 dp2->di_db[adp->ad_offset] = adp->ad_newblkno; 10906 } else { 10907 if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) 10908 panic("%s: %s #%jd allocated as %jd", 10909 "handle_written_inodeblock", 10910 "indirect pointer", 10911 (intmax_t)adp->ad_offset - NDADDR, 10912 (intmax_t) 10913 dp2->di_ib[adp->ad_offset - NDADDR]); 10914 dp2->di_ib[adp->ad_offset - NDADDR] = 10915 adp->ad_newblkno; 10916 } 10917 } 10918 adp->ad_state &= ~UNDONE; 10919 adp->ad_state |= ATTACHED; 10920 hadchanges = 1; 10921 } 10922 for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { 10923 nextadp = TAILQ_NEXT(adp, ad_next); 10924 if (adp->ad_state & ATTACHED) 10925 panic("handle_written_inodeblock: new entry"); 10926 if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) 10927 panic("%s: direct pointers #%jd %s %jd != %jd", 10928 "handle_written_inodeblock", 10929 (intmax_t)adp->ad_offset, "mismatch", 10930 (intmax_t)dp2->di_extb[adp->ad_offset], 10931 (intmax_t)adp->ad_oldblkno); 10932 dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; 10933 adp->ad_state &= ~UNDONE; 10934 adp->ad_state |= ATTACHED; 10935 hadchanges = 1; 10936 } 10937 if (hadchanges && (bp->b_flags & B_DELWRI) == 0) 10938 stat_direct_blk_ptrs++; 10939 /* 10940 * Reset the file size to its most up-to-date value. 
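 * The saved link count and, for UFS2, the extended attribute size are
 * restored along with di_size; any difference redirties the buffer below.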
10941 */ 10942 if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) 10943 panic("handle_written_inodeblock: bad size"); 10944 if (inodedep->id_savednlink > LINK_MAX) 10945 panic("handle_written_inodeblock: Invalid link count " 10946 "%d for inodedep %p", inodedep->id_savednlink, inodedep); 10947 if (fstype == UFS1) { 10948 if (dp1->di_nlink != inodedep->id_savednlink) { 10949 dp1->di_nlink = inodedep->id_savednlink; 10950 hadchanges = 1; 10951 } 10952 if (dp1->di_size != inodedep->id_savedsize) { 10953 dp1->di_size = inodedep->id_savedsize; 10954 hadchanges = 1; 10955 } 10956 } else { 10957 if (dp2->di_nlink != inodedep->id_savednlink) { 10958 dp2->di_nlink = inodedep->id_savednlink; 10959 hadchanges = 1; 10960 } 10961 if (dp2->di_size != inodedep->id_savedsize) { 10962 dp2->di_size = inodedep->id_savedsize; 10963 hadchanges = 1; 10964 } 10965 if (dp2->di_extsize != inodedep->id_savedextsize) { 10966 dp2->di_extsize = inodedep->id_savedextsize; 10967 hadchanges = 1; 10968 } 10969 } 10970 inodedep->id_savedsize = -1; 10971 inodedep->id_savedextsize = -1; 10972 inodedep->id_savednlink = -1; 10973 /* 10974 * If there were any rollbacks in the inode block, then it must be 10975 * marked dirty so that its will eventually get written back in 10976 * its correct form. 10977 */ 10978 if (hadchanges) 10979 bdirty(bp); 10980 bufwait: 10981 /* 10982 * Process any allocdirects that completed during the update. 10983 */ 10984 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) 10985 handle_allocdirect_partdone(adp, &wkhd); 10986 if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) 10987 handle_allocdirect_partdone(adp, &wkhd); 10988 /* 10989 * Process deallocations that were held pending until the 10990 * inode had been written to disk. Freeing of the inode 10991 * is delayed until after all blocks have been freed to 10992 * avoid creation of new <vfsid, inum, lbn> triples 10993 * before the old ones have been deleted. Completely 10994 * unlinked inodes are not processed until the unlinked 10995 * inode list is written or the last reference is removed. 10996 */ 10997 if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { 10998 freefile = handle_bufwait(inodedep, NULL); 10999 if (freefile && !LIST_EMPTY(&wkhd)) { 11000 WORKLIST_INSERT(&wkhd, &freefile->fx_list); 11001 freefile = NULL; 11002 } 11003 } 11004 /* 11005 * Move rolled forward dependency completions to the bufwait list 11006 * now that those that were already written have been processed. 11007 */ 11008 if (!LIST_EMPTY(&wkhd) && hadchanges == 0) 11009 panic("handle_written_inodeblock: bufwait but no changes"); 11010 jwork_move(&inodedep->id_bufwait, &wkhd); 11011 11012 if (freefile != NULL) { 11013 /* 11014 * If the inode is goingaway it was never written. Fake up 11015 * the state here so free_inodedep() can succeed. 11016 */ 11017 if (inodedep->id_state & GOINGAWAY) 11018 inodedep->id_state |= COMPLETE | DEPCOMPLETE; 11019 if (free_inodedep(inodedep) == 0) 11020 panic("handle_written_inodeblock: live inodedep %p", 11021 inodedep); 11022 add_to_worklist(&freefile->fx_list, 0); 11023 return (0); 11024 } 11025 11026 /* 11027 * If no outstanding dependencies, free it. 
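 * A non-zero return asks the caller to reattach the inodedep to the buffer
 * so that the remaining rollbacks are redone on the next write.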
11028 */ 11029 if (free_inodedep(inodedep) || 11030 (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && 11031 TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && 11032 TAILQ_FIRST(&inodedep->id_extupdt) == 0 && 11033 LIST_FIRST(&inodedep->id_bufwait) == 0)) 11034 return (0); 11035 return (hadchanges); 11036 } 11037 11038 static int 11039 handle_written_indirdep(indirdep, bp, bpp) 11040 struct indirdep *indirdep; 11041 struct buf *bp; 11042 struct buf **bpp; 11043 { 11044 struct allocindir *aip; 11045 struct buf *sbp; 11046 int chgs; 11047 11048 if (indirdep->ir_state & GOINGAWAY) 11049 panic("handle_written_indirdep: indirdep gone"); 11050 if ((indirdep->ir_state & IOSTARTED) == 0) 11051 panic("handle_written_indirdep: IO not started"); 11052 chgs = 0; 11053 /* 11054 * If there were rollbacks revert them here. 11055 */ 11056 if (indirdep->ir_saveddata) { 11057 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); 11058 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11059 free(indirdep->ir_saveddata, M_INDIRDEP); 11060 indirdep->ir_saveddata = NULL; 11061 } 11062 chgs = 1; 11063 } 11064 indirdep->ir_state &= ~(UNDONE | IOSTARTED); 11065 indirdep->ir_state |= ATTACHED; 11066 /* 11067 * Move allocindirs with written pointers to the completehd if 11068 * the indirdep's pointer is not yet written. Otherwise 11069 * free them here. 11070 */ 11071 while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { 11072 LIST_REMOVE(aip, ai_next); 11073 if ((indirdep->ir_state & DEPCOMPLETE) == 0) { 11074 LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, 11075 ai_next); 11076 newblk_freefrag(&aip->ai_block); 11077 continue; 11078 } 11079 free_newblk(&aip->ai_block); 11080 } 11081 /* 11082 * Move allocindirs that have finished dependency processing from 11083 * the done list to the write list after updating the pointers. 11084 */ 11085 if (TAILQ_EMPTY(&indirdep->ir_trunc)) { 11086 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { 11087 handle_allocindir_partdone(aip); 11088 if (aip == LIST_FIRST(&indirdep->ir_donehd)) 11089 panic("disk_write_complete: not gone"); 11090 chgs = 1; 11091 } 11092 } 11093 /* 11094 * Preserve the indirdep if there were any changes or if it is not 11095 * yet valid on disk. 11096 */ 11097 if (chgs) { 11098 stat_indir_blk_ptrs++; 11099 bdirty(bp); 11100 return (1); 11101 } 11102 /* 11103 * If there were no changes we can discard the savedbp and detach 11104 * ourselves from the buf. We are only carrying completed pointers 11105 * in this case. 11106 */ 11107 sbp = indirdep->ir_savebp; 11108 sbp->b_flags |= B_INVAL | B_NOCACHE; 11109 indirdep->ir_savebp = NULL; 11110 indirdep->ir_bp = NULL; 11111 if (*bpp != NULL) 11112 panic("handle_written_indirdep: bp already exists."); 11113 *bpp = sbp; 11114 /* 11115 * The indirdep may not be freed until its parent points at it. 11116 */ 11117 if (indirdep->ir_state & DEPCOMPLETE) 11118 free_indirdep(indirdep); 11119 11120 return (0); 11121 } 11122 11123 /* 11124 * Process a diradd entry after its dependent inode has been written. 11125 * This routine must be called with splbio interrupts blocked. 11126 */ 11127 static void 11128 diradd_inode_written(dap, inodedep) 11129 struct diradd *dap; 11130 struct inodedep *inodedep; 11131 { 11132 11133 dap->da_state |= COMPLETE; 11134 complete_diradd(dap); 11135 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); 11136 } 11137 11138 /* 11139 * Returns true if the bmsafemap will have rollbacks when written. Must 11140 * only be called with lk and the buf lock on the cg held. 
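 * Rollbacks are required whenever unwritten jaddref or jnewblk records are
 * still attached to the cylinder group.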
11141 */ 11142 static int 11143 bmsafemap_rollbacks(bmsafemap) 11144 struct bmsafemap *bmsafemap; 11145 { 11146 11147 return (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | 11148 !LIST_EMPTY(&bmsafemap->sm_jnewblkhd)); 11149 } 11150 11151 /* 11152 * Re-apply an allocation when a cg write is complete. 11153 */ 11154 static int 11155 jnewblk_rollforward(jnewblk, fs, cgp, blksfree) 11156 struct jnewblk *jnewblk; 11157 struct fs *fs; 11158 struct cg *cgp; 11159 uint8_t *blksfree; 11160 { 11161 ufs1_daddr_t fragno; 11162 ufs2_daddr_t blkno; 11163 long cgbno, bbase; 11164 int frags, blk; 11165 int i; 11166 11167 frags = 0; 11168 cgbno = dtogd(fs, jnewblk->jn_blkno); 11169 for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { 11170 if (isclr(blksfree, cgbno + i)) 11171 panic("jnewblk_rollforward: re-allocated fragment"); 11172 frags++; 11173 } 11174 if (frags == fs->fs_frag) { 11175 blkno = fragstoblks(fs, cgbno); 11176 ffs_clrblock(fs, blksfree, (long)blkno); 11177 ffs_clusteracct(fs, cgp, blkno, -1); 11178 cgp->cg_cs.cs_nbfree--; 11179 } else { 11180 bbase = cgbno - fragnum(fs, cgbno); 11181 cgbno += jnewblk->jn_oldfrags; 11182 /* If a complete block had been reassembled, account for it. */ 11183 fragno = fragstoblks(fs, bbase); 11184 if (ffs_isblock(fs, blksfree, fragno)) { 11185 cgp->cg_cs.cs_nffree += fs->fs_frag; 11186 ffs_clusteracct(fs, cgp, fragno, -1); 11187 cgp->cg_cs.cs_nbfree--; 11188 } 11189 /* Decrement the old frags. */ 11190 blk = blkmap(fs, blksfree, bbase); 11191 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 11192 /* Allocate the fragment */ 11193 for (i = 0; i < frags; i++) 11194 clrbit(blksfree, cgbno + i); 11195 cgp->cg_cs.cs_nffree -= frags; 11196 /* Add back in counts associated with the new frags */ 11197 blk = blkmap(fs, blksfree, bbase); 11198 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 11199 } 11200 return (frags); 11201 } 11202 11203 /* 11204 * Complete a write to a bmsafemap structure. Roll forward any bitmap 11205 * changes if it's not a background write. Set all written dependencies 11206 * to DEPCOMPLETE and free the structure if possible. 11207 */ 11208 static int 11209 handle_written_bmsafemap(bmsafemap, bp) 11210 struct bmsafemap *bmsafemap; 11211 struct buf *bp; 11212 { 11213 struct newblk *newblk; 11214 struct inodedep *inodedep; 11215 struct jaddref *jaddref, *jatmp; 11216 struct jnewblk *jnewblk, *jntmp; 11217 struct ufsmount *ump; 11218 uint8_t *inosused; 11219 uint8_t *blksfree; 11220 struct cg *cgp; 11221 struct fs *fs; 11222 ino_t ino; 11223 int chgs; 11224 11225 if ((bmsafemap->sm_state & IOSTARTED) == 0) 11226 panic("initiate_write_bmsafemap: Not started\n"); 11227 ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); 11228 chgs = 0; 11229 bmsafemap->sm_state &= ~IOSTARTED; 11230 /* 11231 * Release journal work that was waiting on the write. 11232 */ 11233 handle_jwork(&bmsafemap->sm_freewr); 11234 11235 /* 11236 * Restore unwritten inode allocation pending jaddref writes. 
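 * The allocation is re-applied to the inosused map in the buffer only for
 * foreground writes; background (BX_BKGRDMARKER) copies keep the rolled-back
 * bitmap.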
11237 */ 11238 if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { 11239 cgp = (struct cg *)bp->b_data; 11240 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11241 inosused = cg_inosused(cgp); 11242 LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, 11243 ja_bmdeps, jatmp) { 11244 if ((jaddref->ja_state & UNDONE) == 0) 11245 continue; 11246 ino = jaddref->ja_ino % fs->fs_ipg; 11247 if (isset(inosused, ino)) 11248 panic("handle_written_bmsafemap: " 11249 "re-allocated inode"); 11250 if ((bp->b_xflags & BX_BKGRDMARKER) == 0) { 11251 if ((jaddref->ja_mode & IFMT) == IFDIR) 11252 cgp->cg_cs.cs_ndir++; 11253 cgp->cg_cs.cs_nifree--; 11254 setbit(inosused, ino); 11255 chgs = 1; 11256 } 11257 jaddref->ja_state &= ~UNDONE; 11258 jaddref->ja_state |= ATTACHED; 11259 free_jaddref(jaddref); 11260 } 11261 } 11262 /* 11263 * Restore any block allocations which are pending journal writes. 11264 */ 11265 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { 11266 cgp = (struct cg *)bp->b_data; 11267 fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; 11268 blksfree = cg_blksfree(cgp); 11269 LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, 11270 jntmp) { 11271 if ((jnewblk->jn_state & UNDONE) == 0) 11272 continue; 11273 if ((bp->b_xflags & BX_BKGRDMARKER) == 0 && 11274 jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) 11275 chgs = 1; 11276 jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); 11277 jnewblk->jn_state |= ATTACHED; 11278 free_jnewblk(jnewblk); 11279 } 11280 } 11281 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { 11282 newblk->nb_state |= DEPCOMPLETE; 11283 newblk->nb_state &= ~ONDEPLIST; 11284 newblk->nb_bmsafemap = NULL; 11285 LIST_REMOVE(newblk, nb_deps); 11286 if (newblk->nb_list.wk_type == D_ALLOCDIRECT) 11287 handle_allocdirect_partdone( 11288 WK_ALLOCDIRECT(&newblk->nb_list), NULL); 11289 else if (newblk->nb_list.wk_type == D_ALLOCINDIR) 11290 handle_allocindir_partdone( 11291 WK_ALLOCINDIR(&newblk->nb_list)); 11292 else if (newblk->nb_list.wk_type != D_NEWBLK) 11293 panic("handle_written_bmsafemap: Unexpected type: %s", 11294 TYPENAME(newblk->nb_list.wk_type)); 11295 } 11296 while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { 11297 inodedep->id_state |= DEPCOMPLETE; 11298 inodedep->id_state &= ~ONDEPLIST; 11299 LIST_REMOVE(inodedep, id_deps); 11300 inodedep->id_bmsafemap = NULL; 11301 } 11302 LIST_REMOVE(bmsafemap, sm_next); 11303 if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && 11304 LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && 11305 LIST_EMPTY(&bmsafemap->sm_newblkhd) && 11306 LIST_EMPTY(&bmsafemap->sm_inodedephd) && 11307 LIST_EMPTY(&bmsafemap->sm_freehd)) { 11308 LIST_REMOVE(bmsafemap, sm_hash); 11309 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); 11310 return (0); 11311 } 11312 LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); 11313 bdirty(bp); 11314 return (1); 11315 } 11316 11317 /* 11318 * Try to free a mkdir dependency. 11319 */ 11320 static void 11321 complete_mkdir(mkdir) 11322 struct mkdir *mkdir; 11323 { 11324 struct diradd *dap; 11325 11326 if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) 11327 return; 11328 LIST_REMOVE(mkdir, md_mkdirs); 11329 dap = mkdir->md_diradd; 11330 dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); 11331 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { 11332 dap->da_state |= DEPCOMPLETE; 11333 complete_diradd(dap); 11334 } 11335 WORKITEM_FREE(mkdir, D_MKDIR); 11336 } 11337 11338 /* 11339 * Handle the completion of a mkdir dependency. 
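 * Marking it COMPLETE lets complete_mkdir() clear the corresponding MKDIR_*
 * bit in the diradd and free this mkdir item once all of its state bits
 * are set.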
11340 */ 11341 static void 11342 handle_written_mkdir(mkdir, type) 11343 struct mkdir *mkdir; 11344 int type; 11345 { 11346 11347 if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) 11348 panic("handle_written_mkdir: bad type"); 11349 mkdir->md_state |= COMPLETE; 11350 complete_mkdir(mkdir); 11351 } 11352 11353 static int 11354 free_pagedep(pagedep) 11355 struct pagedep *pagedep; 11356 { 11357 int i; 11358 11359 if (pagedep->pd_state & NEWBLOCK) 11360 return (0); 11361 if (!LIST_EMPTY(&pagedep->pd_dirremhd)) 11362 return (0); 11363 for (i = 0; i < DAHASHSZ; i++) 11364 if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) 11365 return (0); 11366 if (!LIST_EMPTY(&pagedep->pd_pendinghd)) 11367 return (0); 11368 if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) 11369 return (0); 11370 if (pagedep->pd_state & ONWORKLIST) 11371 WORKLIST_REMOVE(&pagedep->pd_list); 11372 LIST_REMOVE(pagedep, pd_hash); 11373 WORKITEM_FREE(pagedep, D_PAGEDEP); 11374 11375 return (1); 11376 } 11377 11378 /* 11379 * Called from within softdep_disk_write_complete above. 11380 * A write operation was just completed. Removed inodes can 11381 * now be freed and associated block pointers may be committed. 11382 * Note that this routine is always called from interrupt level 11383 * with further splbio interrupts blocked. 11384 */ 11385 static int 11386 handle_written_filepage(pagedep, bp) 11387 struct pagedep *pagedep; 11388 struct buf *bp; /* buffer containing the written page */ 11389 { 11390 struct dirrem *dirrem; 11391 struct diradd *dap, *nextdap; 11392 struct direct *ep; 11393 int i, chgs; 11394 11395 if ((pagedep->pd_state & IOSTARTED) == 0) 11396 panic("handle_written_filepage: not started"); 11397 pagedep->pd_state &= ~IOSTARTED; 11398 /* 11399 * Process any directory removals that have been committed. 11400 */ 11401 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { 11402 LIST_REMOVE(dirrem, dm_next); 11403 dirrem->dm_state |= COMPLETE; 11404 dirrem->dm_dirinum = pagedep->pd_ino; 11405 KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), 11406 ("handle_written_filepage: Journal entries not written.")); 11407 add_to_worklist(&dirrem->dm_list, 0); 11408 } 11409 /* 11410 * Free any directory additions that have been committed. 11411 * If it is a newly allocated block, we have to wait until 11412 * the on-disk directory inode claims the new block. 11413 */ 11414 if ((pagedep->pd_state & NEWBLOCK) == 0) 11415 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) 11416 free_diradd(dap, NULL); 11417 /* 11418 * Uncommitted directory entries must be restored. 11419 */ 11420 for (chgs = 0, i = 0; i < DAHASHSZ; i++) { 11421 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; 11422 dap = nextdap) { 11423 nextdap = LIST_NEXT(dap, da_pdlist); 11424 if (dap->da_state & ATTACHED) 11425 panic("handle_written_filepage: attached"); 11426 ep = (struct direct *) 11427 ((char *)bp->b_data + dap->da_offset); 11428 ep->d_ino = dap->da_newinum; 11429 dap->da_state &= ~UNDONE; 11430 dap->da_state |= ATTACHED; 11431 chgs = 1; 11432 /* 11433 * If the inode referenced by the directory has 11434 * been written out, then the dependency can be 11435 * moved to the pending list. 11436 */ 11437 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { 11438 LIST_REMOVE(dap, da_pdlist); 11439 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, 11440 da_pdlist); 11441 } 11442 } 11443 } 11444 /* 11445 * If there were any rollbacks in the directory, then it must be 11446 * marked dirty so that its will eventually get written back in 11447 * its correct form. 
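 * The non-zero return below keeps the pagedep attached to the buffer so the
 * restored entries are written out again.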
11448 */ 11449 if (chgs) { 11450 if ((bp->b_flags & B_DELWRI) == 0) 11451 stat_dir_entry++; 11452 bdirty(bp); 11453 return (1); 11454 } 11455 /* 11456 * If we are not waiting for a new directory block to be 11457 * claimed by its inode, then the pagedep will be freed. 11458 * Otherwise it will remain to track any new entries on 11459 * the page in case they are fsync'ed. 11460 */ 11461 free_pagedep(pagedep); 11462 return (0); 11463 } 11464 11465 /* 11466 * Writing back in-core inode structures. 11467 * 11468 * The filesystem only accesses an inode's contents when it occupies an 11469 * "in-core" inode structure. These "in-core" structures are separate from 11470 * the page frames used to cache inode blocks. Only the latter are 11471 * transferred to/from the disk. So, when the updated contents of the 11472 * "in-core" inode structure are copied to the corresponding in-memory inode 11473 * block, the dependencies are also transferred. The following procedure is 11474 * called when copying a dirty "in-core" inode to a cached inode block. 11475 */ 11476 11477 /* 11478 * Called when an inode is loaded from disk. If the effective link count 11479 * differed from the actual link count when it was last flushed, then we 11480 * need to ensure that the correct effective link count is put back. 11481 */ 11482 void 11483 softdep_load_inodeblock(ip) 11484 struct inode *ip; /* the "in_core" copy of the inode */ 11485 { 11486 struct inodedep *inodedep; 11487 11488 /* 11489 * Check for alternate nlink count. 11490 */ 11491 ip->i_effnlink = ip->i_nlink; 11492 ACQUIRE_LOCK(&lk); 11493 if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, 11494 &inodedep) == 0) { 11495 FREE_LOCK(&lk); 11496 return; 11497 } 11498 ip->i_effnlink -= inodedep->id_nlinkdelta; 11499 FREE_LOCK(&lk); 11500 } 11501 11502 /* 11503 * This routine is called just before the "in-core" inode 11504 * information is to be copied to the in-memory inode block. 11505 * Recall that an inode block contains several inodes. If 11506 * the force flag is set, then the dependencies will be 11507 * cleared so that the update can always be made. Note that 11508 * the buffer is locked when this routine is called, so we 11509 * will never be in the middle of writing the inode block 11510 * to disk. 11511 */ 11512 void 11513 softdep_update_inodeblock(ip, bp, waitfor) 11514 struct inode *ip; /* the "in_core" copy of the inode */ 11515 struct buf *bp; /* the buffer containing the inode block */ 11516 int waitfor; /* nonzero => update must be allowed */ 11517 { 11518 struct inodedep *inodedep; 11519 struct inoref *inoref; 11520 struct worklist *wk; 11521 struct mount *mp; 11522 struct buf *ibp; 11523 struct fs *fs; 11524 int error; 11525 11526 mp = UFSTOVFS(ip->i_ump); 11527 fs = ip->i_fs; 11528 /* 11529 * Preserve the freelink that is on disk. clear_unlinked_inodedep() 11530 * does not have access to the in-core ip so must write directly into 11531 * the inode block buffer when setting freelink. 11532 */ 11533 if (fs->fs_magic == FS_UFS1_MAGIC) 11534 DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + 11535 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11536 else 11537 DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + 11538 ino_to_fsbo(fs, ip->i_number))->di_freelink); 11539 /* 11540 * If the effective link count is not equal to the actual link 11541 * count, then we must track the difference in an inodedep while 11542 * the inode is (potentially) tossed out of the cache. 
Otherwise, 11543 * if there is no existing inodedep, then there are no dependencies
11544 * to track.
11545 */
11546 ACQUIRE_LOCK(&lk);
11547 again:
11548 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
11549 FREE_LOCK(&lk);
11550 if (ip->i_effnlink != ip->i_nlink)
11551 panic("softdep_update_inodeblock: bad link count");
11552 return;
11553 }
11554 if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
11555 panic("softdep_update_inodeblock: bad delta");
11556 /*
11557 * If we're flushing all dependencies we must also move any waiting
11558 * for journal writes onto the bufwait list prior to I/O.
11559 */
11560 if (waitfor) {
11561 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
11562 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
11563 == DEPCOMPLETE) {
11564 jwait(&inoref->if_list, MNT_WAIT);
11565 goto again;
11566 }
11567 }
11568 }
11569 /*
11570 * Changes have been initiated. Anything depending on these
11571 * changes cannot occur until this inode has been written.
11572 */
11573 inodedep->id_state &= ~COMPLETE;
11574 if ((inodedep->id_state & ONWORKLIST) == 0)
11575 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
11576 /*
11577 * Any new dependencies associated with the incore inode must
11578 * now be moved to the list associated with the buffer holding
11579 * the in-memory copy of the inode. Once merged process any
11580 * allocdirects that are completed by the merger.
11581 */
11582 merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
11583 if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
11584 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
11585 NULL);
11586 merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
11587 if (!TAILQ_EMPTY(&inodedep->id_extupdt))
11588 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
11589 NULL);
11590 /*
11591 * Now that the inode has been pushed into the buffer, the
11592 * operations dependent on the inode being written to disk
11593 * can be moved to the id_bufwait so that they will be
11594 * processed when the buffer I/O completes.
11595 */
11596 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
11597 WORKLIST_REMOVE(wk);
11598 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
11599 }
11600 /*
11601 * Newly allocated inodes cannot be written until the bitmap
11602 * that allocates them has been written (indicated by
11603 * DEPCOMPLETE being set in id_state). If we are doing a
11604 * forced sync (e.g., an fsync on a file), we force the bitmap
11605 * to be written so that the update can be done.
11606 */
11607 if (waitfor == 0) {
11608 FREE_LOCK(&lk);
11609 return;
11610 }
11611 retry:
11612 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
11613 FREE_LOCK(&lk);
11614 return;
11615 }
11616 ibp = inodedep->id_bmsafemap->sm_buf;
11617 ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
11618 if (ibp == NULL) {
11619 /*
11620 * If ibp came back as NULL, the dependency could have been
11621 * freed while we slept. Look it up again, and check to see
11622 * that it has completed.
11623 */
11624 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
11625 goto retry;
11626 FREE_LOCK(&lk);
11627 return;
11628 }
11629 FREE_LOCK(&lk);
11630 if ((error = bwrite(ibp)) != 0)
11631 softdep_error("softdep_update_inodeblock: bwrite", error);
11632 }
11633 
11634 /*
11635 * Merge a new inode dependency list (such as id_newinoupdt) into an
11636 * old inode dependency list (such as id_inoupdt). This routine must be
11637 * called with splbio interrupts blocked.
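 * Both lists are kept sorted by ad_offset; when they each contain an entry
 * for the same offset the pair is combined with allocdirect_merge().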
11638 */ 11639 static void 11640 merge_inode_lists(newlisthead, oldlisthead) 11641 struct allocdirectlst *newlisthead; 11642 struct allocdirectlst *oldlisthead; 11643 { 11644 struct allocdirect *listadp, *newadp; 11645 11646 newadp = TAILQ_FIRST(newlisthead); 11647 for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { 11648 if (listadp->ad_offset < newadp->ad_offset) { 11649 listadp = TAILQ_NEXT(listadp, ad_next); 11650 continue; 11651 } 11652 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11653 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); 11654 if (listadp->ad_offset == newadp->ad_offset) { 11655 allocdirect_merge(oldlisthead, newadp, 11656 listadp); 11657 listadp = newadp; 11658 } 11659 newadp = TAILQ_FIRST(newlisthead); 11660 } 11661 while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { 11662 TAILQ_REMOVE(newlisthead, newadp, ad_next); 11663 TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); 11664 } 11665 } 11666 11667 /* 11668 * If we are doing an fsync, then we must ensure that any directory 11669 * entries for the inode have been written after the inode gets to disk. 11670 */ 11671 int 11672 softdep_fsync(vp) 11673 struct vnode *vp; /* the "in_core" copy of the inode */ 11674 { 11675 struct inodedep *inodedep; 11676 struct pagedep *pagedep; 11677 struct inoref *inoref; 11678 struct worklist *wk; 11679 struct diradd *dap; 11680 struct mount *mp; 11681 struct vnode *pvp; 11682 struct inode *ip; 11683 struct buf *bp; 11684 struct fs *fs; 11685 struct thread *td = curthread; 11686 int error, flushparent, pagedep_new_block; 11687 ino_t parentino; 11688 ufs_lbn_t lbn; 11689 11690 ip = VTOI(vp); 11691 fs = ip->i_fs; 11692 mp = vp->v_mount; 11693 ACQUIRE_LOCK(&lk); 11694 restart: 11695 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { 11696 FREE_LOCK(&lk); 11697 return (0); 11698 } 11699 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 11700 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 11701 == DEPCOMPLETE) { 11702 jwait(&inoref->if_list, MNT_WAIT); 11703 goto restart; 11704 } 11705 } 11706 if (!LIST_EMPTY(&inodedep->id_inowait) || 11707 !TAILQ_EMPTY(&inodedep->id_extupdt) || 11708 !TAILQ_EMPTY(&inodedep->id_newextupdt) || 11709 !TAILQ_EMPTY(&inodedep->id_inoupdt) || 11710 !TAILQ_EMPTY(&inodedep->id_newinoupdt)) 11711 panic("softdep_fsync: pending ops %p", inodedep); 11712 for (error = 0, flushparent = 0; ; ) { 11713 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) 11714 break; 11715 if (wk->wk_type != D_DIRADD) 11716 panic("softdep_fsync: Unexpected type %s", 11717 TYPENAME(wk->wk_type)); 11718 dap = WK_DIRADD(wk); 11719 /* 11720 * Flush our parent if this directory entry has a MKDIR_PARENT 11721 * dependency or is contained in a newly allocated block. 11722 */ 11723 if (dap->da_state & DIRCHG) 11724 pagedep = dap->da_previous->dm_pagedep; 11725 else 11726 pagedep = dap->da_pagedep; 11727 parentino = pagedep->pd_ino; 11728 lbn = pagedep->pd_lbn; 11729 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) 11730 panic("softdep_fsync: dirty"); 11731 if ((dap->da_state & MKDIR_PARENT) || 11732 (pagedep->pd_state & NEWBLOCK)) 11733 flushparent = 1; 11734 else 11735 flushparent = 0; 11736 /* 11737 * If we are being fsync'ed as part of vgone'ing this vnode, 11738 * then we will not be able to release and recover the 11739 * vnode below, so we just have to give up on writing its 11740 * directory entry out. It will eventually be written, just 11741 * not now, but then the user was not asking to have it 11742 * written, so we are not breaking any promises. 
11743 */ 11744 if (vp->v_iflag & VI_DOOMED) 11745 break; 11746 /* 11747 * We prevent deadlock by always fetching inodes from the 11748 * root, moving down the directory tree. Thus, when fetching 11749 * our parent directory, we first try to get the lock. If 11750 * that fails, we must unlock ourselves before requesting 11751 * the lock on our parent. See the comment in ufs_lookup 11752 * for details on possible races. 11753 */ 11754 FREE_LOCK(&lk); 11755 if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, 11756 FFSV_FORCEINSMQ)) { 11757 error = vfs_busy(mp, MBF_NOWAIT); 11758 if (error != 0) { 11759 vfs_ref(mp); 11760 VOP_UNLOCK(vp, 0); 11761 error = vfs_busy(mp, 0); 11762 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11763 vfs_rel(mp); 11764 if (error != 0) 11765 return (ENOENT); 11766 if (vp->v_iflag & VI_DOOMED) { 11767 vfs_unbusy(mp); 11768 return (ENOENT); 11769 } 11770 } 11771 VOP_UNLOCK(vp, 0); 11772 error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, 11773 &pvp, FFSV_FORCEINSMQ); 11774 vfs_unbusy(mp); 11775 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 11776 if (vp->v_iflag & VI_DOOMED) { 11777 if (error == 0) 11778 vput(pvp); 11779 error = ENOENT; 11780 } 11781 if (error != 0) 11782 return (error); 11783 } 11784 /* 11785 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps 11786 * that are contained in direct blocks will be resolved by 11787 * doing a ffs_update. Pagedeps contained in indirect blocks 11788 * may require a complete sync'ing of the directory. So, we 11789 * try the cheap and fast ffs_update first, and if that fails, 11790 * then we do the slower ffs_syncvnode of the directory. 11791 */ 11792 if (flushparent) { 11793 int locked; 11794 11795 if ((error = ffs_update(pvp, 1)) != 0) { 11796 vput(pvp); 11797 return (error); 11798 } 11799 ACQUIRE_LOCK(&lk); 11800 locked = 1; 11801 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { 11802 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { 11803 if (wk->wk_type != D_DIRADD) 11804 panic("softdep_fsync: Unexpected type %s", 11805 TYPENAME(wk->wk_type)); 11806 dap = WK_DIRADD(wk); 11807 if (dap->da_state & DIRCHG) 11808 pagedep = dap->da_previous->dm_pagedep; 11809 else 11810 pagedep = dap->da_pagedep; 11811 pagedep_new_block = pagedep->pd_state & NEWBLOCK; 11812 FREE_LOCK(&lk); 11813 locked = 0; 11814 if (pagedep_new_block && 11815 (error = ffs_syncvnode(pvp, MNT_WAIT))) { 11816 vput(pvp); 11817 return (error); 11818 } 11819 } 11820 } 11821 if (locked) 11822 FREE_LOCK(&lk); 11823 } 11824 /* 11825 * Flush directory page containing the inode's name. 11826 */ 11827 error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, 11828 &bp); 11829 if (error == 0) 11830 error = bwrite(bp); 11831 else 11832 brelse(bp); 11833 vput(pvp); 11834 if (error != 0) 11835 return (error); 11836 ACQUIRE_LOCK(&lk); 11837 if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) 11838 break; 11839 } 11840 FREE_LOCK(&lk); 11841 return (0); 11842 } 11843 11844 /* 11845 * Flush all the dirty bitmaps associated with the block device 11846 * before flushing the rest of the dirty blocks so as to reduce 11847 * the number of dependencies that will have to be rolled back. 11848 * 11849 * XXX Unused? 
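 * Only buffers that carry a bmsafemap dependency are pushed here; other
 * dirty buffers are skipped.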
11850 */ 11851 void 11852 softdep_fsync_mountdev(vp) 11853 struct vnode *vp; 11854 { 11855 struct buf *bp, *nbp; 11856 struct worklist *wk; 11857 struct bufobj *bo; 11858 11859 if (!vn_isdisk(vp, NULL)) 11860 panic("softdep_fsync_mountdev: vnode not a disk"); 11861 bo = &vp->v_bufobj; 11862 restart: 11863 BO_LOCK(bo); 11864 ACQUIRE_LOCK(&lk); 11865 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 11866 /* 11867 * If it is already scheduled, skip to the next buffer. 11868 */ 11869 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) 11870 continue; 11871 11872 if ((bp->b_flags & B_DELWRI) == 0) 11873 panic("softdep_fsync_mountdev: not dirty"); 11874 /* 11875 * We are only interested in bitmaps with outstanding 11876 * dependencies. 11877 */ 11878 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || 11879 wk->wk_type != D_BMSAFEMAP || 11880 (bp->b_vflags & BV_BKGRDINPROG)) { 11881 BUF_UNLOCK(bp); 11882 continue; 11883 } 11884 FREE_LOCK(&lk); 11885 BO_UNLOCK(bo); 11886 bremfree(bp); 11887 (void) bawrite(bp); 11888 goto restart; 11889 } 11890 FREE_LOCK(&lk); 11891 drain_output(vp); 11892 BO_UNLOCK(bo); 11893 } 11894 11895 /* 11896 * Sync all cylinder groups that were dirty at the time this function is 11897 * called. Newly dirtied cgs will be inserted before the sintenel. This 11898 * is used to flush freedep activity that may be holding up writes to a 11899 * indirect block. 11900 */ 11901 static int 11902 sync_cgs(mp, waitfor) 11903 struct mount *mp; 11904 int waitfor; 11905 { 11906 struct bmsafemap *bmsafemap; 11907 struct bmsafemap *sintenel; 11908 struct ufsmount *ump; 11909 struct buf *bp; 11910 int error; 11911 11912 sintenel = malloc(sizeof(*sintenel), M_BMSAFEMAP, M_ZERO | M_WAITOK); 11913 sintenel->sm_cg = -1; 11914 ump = VFSTOUFS(mp); 11915 error = 0; 11916 ACQUIRE_LOCK(&lk); 11917 LIST_INSERT_HEAD(&ump->softdep_dirtycg, sintenel, sm_next); 11918 for (bmsafemap = LIST_NEXT(sintenel, sm_next); bmsafemap != NULL; 11919 bmsafemap = LIST_NEXT(sintenel, sm_next)) { 11920 /* Skip sintenels and cgs with no work to release. */ 11921 if (bmsafemap->sm_cg == -1 || 11922 (LIST_EMPTY(&bmsafemap->sm_freehd) && 11923 LIST_EMPTY(&bmsafemap->sm_freewr))) { 11924 LIST_REMOVE(sintenel, sm_next); 11925 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11926 continue; 11927 } 11928 /* 11929 * If we don't get the lock and we're waiting try again, if 11930 * not move on to the next buf and try to sync it. 11931 */ 11932 bp = getdirtybuf(bmsafemap->sm_buf, &lk, waitfor); 11933 if (bp == NULL && waitfor == MNT_WAIT) 11934 continue; 11935 LIST_REMOVE(sintenel, sm_next); 11936 LIST_INSERT_AFTER(bmsafemap, sintenel, sm_next); 11937 if (bp == NULL) 11938 continue; 11939 FREE_LOCK(&lk); 11940 if (waitfor == MNT_NOWAIT) 11941 bawrite(bp); 11942 else 11943 error = bwrite(bp); 11944 ACQUIRE_LOCK(&lk); 11945 if (error) 11946 break; 11947 } 11948 LIST_REMOVE(sintenel, sm_next); 11949 FREE_LOCK(&lk); 11950 free(sintenel, M_BMSAFEMAP); 11951 return (error); 11952 } 11953 11954 /* 11955 * This routine is called when we are trying to synchronously flush a 11956 * file. This routine must eliminate any filesystem metadata dependencies 11957 * so that the syncing routine can succeed. 11958 */ 11959 int 11960 softdep_sync_metadata(struct vnode *vp) 11961 { 11962 int error; 11963 11964 /* 11965 * Ensure that any direct block dependencies have been cleared, 11966 * truncations are started, and inode references are journaled. 
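 * flush_inodedep_deps() below handles the journaled inode references and the
 * id_inoupdt/id_extupdt lists, while process_truncates() starts any pending
 * truncations.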
11967 */ 11968 ACQUIRE_LOCK(&lk); 11969 /* 11970 * Write all journal records to prevent rollbacks on devvp. 11971 */ 11972 if (vp->v_type == VCHR) 11973 softdep_flushjournal(vp->v_mount); 11974 error = flush_inodedep_deps(vp, vp->v_mount, VTOI(vp)->i_number); 11975 /* 11976 * Ensure that all truncates are written so we won't find deps on 11977 * indirect blocks. 11978 */ 11979 process_truncates(vp); 11980 FREE_LOCK(&lk); 11981 11982 return (error); 11983 } 11984 11985 /* 11986 * This routine is called when we are attempting to sync a buf with 11987 * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any 11988 * other IO it can but returns EBUSY if the buffer is not yet able to 11989 * be written. Dependencies which will not cause rollbacks will always 11990 * return 0. 11991 */ 11992 int 11993 softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) 11994 { 11995 struct indirdep *indirdep; 11996 struct pagedep *pagedep; 11997 struct allocindir *aip; 11998 struct newblk *newblk; 11999 struct buf *nbp; 12000 struct worklist *wk; 12001 int i, error; 12002 12003 /* 12004 * For VCHR we just don't want to force flush any dependencies that 12005 * will cause rollbacks. 12006 */ 12007 if (vp->v_type == VCHR) { 12008 if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) 12009 return (EBUSY); 12010 return (0); 12011 } 12012 ACQUIRE_LOCK(&lk); 12013 /* 12014 * As we hold the buffer locked, none of its dependencies 12015 * will disappear. 12016 */ 12017 error = 0; 12018 top: 12019 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 12020 switch (wk->wk_type) { 12021 12022 case D_ALLOCDIRECT: 12023 case D_ALLOCINDIR: 12024 newblk = WK_NEWBLK(wk); 12025 if (newblk->nb_jnewblk != NULL) { 12026 if (waitfor == MNT_NOWAIT) { 12027 error = EBUSY; 12028 goto out_unlock; 12029 } 12030 jwait(&newblk->nb_jnewblk->jn_list, waitfor); 12031 goto top; 12032 } 12033 if (newblk->nb_state & DEPCOMPLETE || 12034 waitfor == MNT_NOWAIT) 12035 continue; 12036 nbp = newblk->nb_bmsafemap->sm_buf; 12037 nbp = getdirtybuf(nbp, &lk, waitfor); 12038 if (nbp == NULL) 12039 goto top; 12040 FREE_LOCK(&lk); 12041 if ((error = bwrite(nbp)) != 0) 12042 goto out; 12043 ACQUIRE_LOCK(&lk); 12044 continue; 12045 12046 case D_INDIRDEP: 12047 indirdep = WK_INDIRDEP(wk); 12048 if (waitfor == MNT_NOWAIT) { 12049 if (!TAILQ_EMPTY(&indirdep->ir_trunc) || 12050 !LIST_EMPTY(&indirdep->ir_deplisthd)) { 12051 error = EBUSY; 12052 goto out_unlock; 12053 } 12054 } 12055 if (!TAILQ_EMPTY(&indirdep->ir_trunc)) 12056 panic("softdep_sync_buf: truncation pending."); 12057 restart: 12058 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 12059 newblk = (struct newblk *)aip; 12060 if (newblk->nb_jnewblk != NULL) { 12061 jwait(&newblk->nb_jnewblk->jn_list, 12062 waitfor); 12063 goto restart; 12064 } 12065 if (newblk->nb_state & DEPCOMPLETE) 12066 continue; 12067 nbp = newblk->nb_bmsafemap->sm_buf; 12068 nbp = getdirtybuf(nbp, &lk, waitfor); 12069 if (nbp == NULL) 12070 goto restart; 12071 FREE_LOCK(&lk); 12072 if ((error = bwrite(nbp)) != 0) 12073 goto out; 12074 ACQUIRE_LOCK(&lk); 12075 goto restart; 12076 } 12077 continue; 12078 12079 case D_PAGEDEP: 12080 /* 12081 * Only flush directory entries in synchronous passes. 12082 */ 12083 if (waitfor != MNT_WAIT) { 12084 error = EBUSY; 12085 goto out_unlock; 12086 } 12087 /* 12088 * While syncing snapshots, we must allow recursive 12089 * lookups. 
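 * BUF_AREC() and BUF_NOREC() bracket the flush of the diradd hash chains
 * below.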
12090 */ 12091 BUF_AREC(bp); 12092 /* 12093 * We are trying to sync a directory that may 12094 * have dependencies on both its own metadata 12095 * and/or dependencies on the inodes of any 12096 * recently allocated files. We walk its diradd 12097 * lists pushing out the associated inode. 12098 */ 12099 pagedep = WK_PAGEDEP(wk); 12100 for (i = 0; i < DAHASHSZ; i++) { 12101 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) 12102 continue; 12103 if ((error = flush_pagedep_deps(vp, wk->wk_mp, 12104 &pagedep->pd_diraddhd[i]))) { 12105 BUF_NOREC(bp); 12106 goto out_unlock; 12107 } 12108 } 12109 BUF_NOREC(bp); 12110 continue; 12111 12112 case D_FREEWORK: 12113 case D_FREEDEP: 12114 case D_JSEGDEP: 12115 continue; 12116 12117 default: 12118 panic("softdep_sync_buf: Unknown type %s", 12119 TYPENAME(wk->wk_type)); 12120 /* NOTREACHED */ 12121 } 12122 } 12123 out_unlock: 12124 FREE_LOCK(&lk); 12125 out: 12126 return (error); 12127 } 12128 12129 /* 12130 * Flush the dependencies associated with an inodedep. 12131 * Called with splbio blocked. 12132 */ 12133 static int 12134 flush_inodedep_deps(vp, mp, ino) 12135 struct vnode *vp; 12136 struct mount *mp; 12137 ino_t ino; 12138 { 12139 struct inodedep *inodedep; 12140 struct inoref *inoref; 12141 int error, waitfor; 12142 12143 /* 12144 * This work is done in two passes. The first pass grabs most 12145 * of the buffers and begins asynchronously writing them. The 12146 * only way to wait for these asynchronous writes is to sleep 12147 * on the filesystem vnode which may stay busy for a long time 12148 * if the filesystem is active. So, instead, we make a second 12149 * pass over the dependencies blocking on each write. In the 12150 * usual case we will be blocking against a write that we 12151 * initiated, so when it is done the dependency will have been 12152 * resolved. Thus the second pass is expected to end quickly. 12153 * We give a brief window at the top of the loop to allow 12154 * any pending I/O to complete. 12155 */ 12156 for (error = 0, waitfor = MNT_NOWAIT; ; ) { 12157 if (error) 12158 return (error); 12159 FREE_LOCK(&lk); 12160 ACQUIRE_LOCK(&lk); 12161 restart: 12162 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12163 return (0); 12164 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12165 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12166 == DEPCOMPLETE) { 12167 jwait(&inoref->if_list, MNT_WAIT); 12168 goto restart; 12169 } 12170 } 12171 if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || 12172 flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || 12173 flush_deplist(&inodedep->id_extupdt, waitfor, &error) || 12174 flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) 12175 continue; 12176 /* 12177 * If pass2, we are done, otherwise do pass 2. 12178 */ 12179 if (waitfor == MNT_WAIT) 12180 break; 12181 waitfor = MNT_WAIT; 12182 } 12183 /* 12184 * Try freeing inodedep in case all dependencies have been removed. 12185 */ 12186 if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) 12187 (void) free_inodedep(inodedep); 12188 return (0); 12189 } 12190 12191 /* 12192 * Flush an inode dependency list. 12193 * Called with splbio blocked. 
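 * Returns non-zero whenever the worklist lock may have been dropped (journal
 * wait or buffer write) so that the caller knows to rescan the list.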
12194 */ 12195 static int 12196 flush_deplist(listhead, waitfor, errorp) 12197 struct allocdirectlst *listhead; 12198 int waitfor; 12199 int *errorp; 12200 { 12201 struct allocdirect *adp; 12202 struct newblk *newblk; 12203 struct buf *bp; 12204 12205 mtx_assert(&lk, MA_OWNED); 12206 TAILQ_FOREACH(adp, listhead, ad_next) { 12207 newblk = (struct newblk *)adp; 12208 if (newblk->nb_jnewblk != NULL) { 12209 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12210 return (1); 12211 } 12212 if (newblk->nb_state & DEPCOMPLETE) 12213 continue; 12214 bp = newblk->nb_bmsafemap->sm_buf; 12215 bp = getdirtybuf(bp, &lk, waitfor); 12216 if (bp == NULL) { 12217 if (waitfor == MNT_NOWAIT) 12218 continue; 12219 return (1); 12220 } 12221 FREE_LOCK(&lk); 12222 if (waitfor == MNT_NOWAIT) 12223 bawrite(bp); 12224 else 12225 *errorp = bwrite(bp); 12226 ACQUIRE_LOCK(&lk); 12227 return (1); 12228 } 12229 return (0); 12230 } 12231 12232 /* 12233 * Flush dependencies associated with an allocdirect block. 12234 */ 12235 static int 12236 flush_newblk_dep(vp, mp, lbn) 12237 struct vnode *vp; 12238 struct mount *mp; 12239 ufs_lbn_t lbn; 12240 { 12241 struct newblk *newblk; 12242 struct bufobj *bo; 12243 struct inode *ip; 12244 struct buf *bp; 12245 ufs2_daddr_t blkno; 12246 int error; 12247 12248 error = 0; 12249 bo = &vp->v_bufobj; 12250 ip = VTOI(vp); 12251 blkno = DIP(ip, i_db[lbn]); 12252 if (blkno == 0) 12253 panic("flush_newblk_dep: Missing block"); 12254 ACQUIRE_LOCK(&lk); 12255 /* 12256 * Loop until all dependencies related to this block are satisfied. 12257 * We must be careful to restart after each sleep in case a write 12258 * completes some part of this process for us. 12259 */ 12260 for (;;) { 12261 if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { 12262 FREE_LOCK(&lk); 12263 break; 12264 } 12265 if (newblk->nb_list.wk_type != D_ALLOCDIRECT) 12266 panic("flush_newblk_deps: Bad newblk %p", newblk); 12267 /* 12268 * Flush the journal. 12269 */ 12270 if (newblk->nb_jnewblk != NULL) { 12271 jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); 12272 continue; 12273 } 12274 /* 12275 * Write the bitmap dependency. 12276 */ 12277 if ((newblk->nb_state & DEPCOMPLETE) == 0) { 12278 bp = newblk->nb_bmsafemap->sm_buf; 12279 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12280 if (bp == NULL) 12281 continue; 12282 FREE_LOCK(&lk); 12283 error = bwrite(bp); 12284 if (error) 12285 break; 12286 ACQUIRE_LOCK(&lk); 12287 continue; 12288 } 12289 /* 12290 * Write the buffer. 12291 */ 12292 FREE_LOCK(&lk); 12293 BO_LOCK(bo); 12294 bp = gbincore(bo, lbn); 12295 if (bp != NULL) { 12296 error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 12297 LK_INTERLOCK, BO_MTX(bo)); 12298 if (error == ENOLCK) { 12299 ACQUIRE_LOCK(&lk); 12300 continue; /* Slept, retry */ 12301 } 12302 if (error != 0) 12303 break; /* Failed */ 12304 if (bp->b_flags & B_DELWRI) { 12305 bremfree(bp); 12306 error = bwrite(bp); 12307 if (error) 12308 break; 12309 } else 12310 BUF_UNLOCK(bp); 12311 } else 12312 BO_UNLOCK(bo); 12313 /* 12314 * We have to wait for the direct pointers to 12315 * point at the newdirblk before the dependency 12316 * will go away. 12317 */ 12318 error = ffs_update(vp, MNT_WAIT); 12319 if (error) 12320 break; 12321 ACQUIRE_LOCK(&lk); 12322 } 12323 return (error); 12324 } 12325 12326 /* 12327 * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 12328 * Called with splbio blocked. 
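 * Each diradd is resolved by flushing the parent directory (MKDIR_PARENT),
 * the new directory's first block (MKDIR_BODY) and finally the inode buffer
 * of the inode the entry references.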
12329 */ 12330 static int 12331 flush_pagedep_deps(pvp, mp, diraddhdp) 12332 struct vnode *pvp; 12333 struct mount *mp; 12334 struct diraddhd *diraddhdp; 12335 { 12336 struct inodedep *inodedep; 12337 struct inoref *inoref; 12338 struct ufsmount *ump; 12339 struct diradd *dap; 12340 struct vnode *vp; 12341 int error = 0; 12342 struct buf *bp; 12343 ino_t inum; 12344 12345 ump = VFSTOUFS(mp); 12346 restart: 12347 while ((dap = LIST_FIRST(diraddhdp)) != NULL) { 12348 /* 12349 * Flush ourselves if this directory entry 12350 * has a MKDIR_PARENT dependency. 12351 */ 12352 if (dap->da_state & MKDIR_PARENT) { 12353 FREE_LOCK(&lk); 12354 if ((error = ffs_update(pvp, MNT_WAIT)) != 0) 12355 break; 12356 ACQUIRE_LOCK(&lk); 12357 /* 12358 * If that cleared dependencies, go on to next. 12359 */ 12360 if (dap != LIST_FIRST(diraddhdp)) 12361 continue; 12362 if (dap->da_state & MKDIR_PARENT) 12363 panic("flush_pagedep_deps: MKDIR_PARENT"); 12364 } 12365 /* 12366 * A newly allocated directory must have its "." and 12367 * ".." entries written out before its name can be 12368 * committed in its parent. 12369 */ 12370 inum = dap->da_newinum; 12371 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12372 panic("flush_pagedep_deps: lost inode1"); 12373 /* 12374 * Wait for any pending journal adds to complete so we don't 12375 * cause rollbacks while syncing. 12376 */ 12377 TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { 12378 if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) 12379 == DEPCOMPLETE) { 12380 jwait(&inoref->if_list, MNT_WAIT); 12381 goto restart; 12382 } 12383 } 12384 if (dap->da_state & MKDIR_BODY) { 12385 FREE_LOCK(&lk); 12386 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12387 FFSV_FORCEINSMQ))) 12388 break; 12389 error = flush_newblk_dep(vp, mp, 0); 12390 /* 12391 * If we still have the dependency we might need to 12392 * update the vnode to sync the new link count to 12393 * disk. 12394 */ 12395 if (error == 0 && dap == LIST_FIRST(diraddhdp)) 12396 error = ffs_update(vp, MNT_WAIT); 12397 vput(vp); 12398 if (error != 0) 12399 break; 12400 ACQUIRE_LOCK(&lk); 12401 /* 12402 * If that cleared dependencies, go on to next. 12403 */ 12404 if (dap != LIST_FIRST(diraddhdp)) 12405 continue; 12406 if (dap->da_state & MKDIR_BODY) { 12407 inodedep_lookup(UFSTOVFS(ump), inum, 0, 12408 &inodedep); 12409 panic("flush_pagedep_deps: MKDIR_BODY " 12410 "inodedep %p dap %p vp %p", 12411 inodedep, dap, vp); 12412 } 12413 } 12414 /* 12415 * Flush the inode on which the directory entry depends. 12416 * Having accounted for MKDIR_PARENT and MKDIR_BODY above, 12417 * the only remaining dependency is that the updated inode 12418 * count must get pushed to disk. The inode has already 12419 * been pushed into its inode buffer (via VOP_UPDATE) at 12420 * the time of the reference count change. So we need only 12421 * locate that buffer, ensure that there will be no rollback 12422 * caused by a bitmap dependency, then write the inode buffer. 12423 */ 12424 retry: 12425 if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) 12426 panic("flush_pagedep_deps: lost inode"); 12427 /* 12428 * If the inode still has bitmap dependencies, 12429 * push them to disk. 
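 * If getdirtybuf() sleeps and the buffer is gone, the inodedep is looked up
 * again before retrying.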
12430 */ 12431 if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { 12432 bp = inodedep->id_bmsafemap->sm_buf; 12433 bp = getdirtybuf(bp, &lk, MNT_WAIT); 12434 if (bp == NULL) 12435 goto retry; 12436 FREE_LOCK(&lk); 12437 if ((error = bwrite(bp)) != 0) 12438 break; 12439 ACQUIRE_LOCK(&lk); 12440 if (dap != LIST_FIRST(diraddhdp)) 12441 continue; 12442 } 12443 /* 12444 * If the inode is still sitting in a buffer waiting 12445 * to be written or waiting for the link count to be 12446 * adjusted update it here to flush it to disk. 12447 */ 12448 if (dap == LIST_FIRST(diraddhdp)) { 12449 FREE_LOCK(&lk); 12450 if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, 12451 FFSV_FORCEINSMQ))) 12452 break; 12453 error = ffs_update(vp, MNT_WAIT); 12454 vput(vp); 12455 if (error) 12456 break; 12457 ACQUIRE_LOCK(&lk); 12458 } 12459 /* 12460 * If we have failed to get rid of all the dependencies 12461 * then something is seriously wrong. 12462 */ 12463 if (dap == LIST_FIRST(diraddhdp)) { 12464 inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); 12465 panic("flush_pagedep_deps: failed to flush " 12466 "inodedep %p ino %d dap %p", inodedep, inum, dap); 12467 } 12468 } 12469 if (error) 12470 ACQUIRE_LOCK(&lk); 12471 return (error); 12472 } 12473 12474 /* 12475 * A large burst of file addition or deletion activity can drive the 12476 * memory load excessively high. First attempt to slow things down 12477 * using the techniques below. If that fails, this routine requests 12478 * the offending operations to fall back to running synchronously 12479 * until the memory load returns to a reasonable level. 12480 */ 12481 int 12482 softdep_slowdown(vp) 12483 struct vnode *vp; 12484 { 12485 struct ufsmount *ump; 12486 int jlow; 12487 int max_softdeps_hard; 12488 12489 ACQUIRE_LOCK(&lk); 12490 jlow = 0; 12491 /* 12492 * Check for journal space if needed. 12493 */ 12494 if (DOINGSUJ(vp)) { 12495 ump = VFSTOUFS(vp->v_mount); 12496 if (journal_space(ump, 0) == 0) 12497 jlow = 1; 12498 } 12499 max_softdeps_hard = max_softdeps * 11 / 10; 12500 if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && 12501 dep_current[D_INODEDEP] < max_softdeps_hard && 12502 VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps && 12503 dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0) { 12504 FREE_LOCK(&lk); 12505 return (0); 12506 } 12507 if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps || jlow) 12508 softdep_speedup(); 12509 stat_sync_limit_hit += 1; 12510 FREE_LOCK(&lk); 12511 if (DOINGSUJ(vp)) 12512 return (0); 12513 return (1); 12514 } 12515 12516 /* 12517 * Called by the allocation routines when they are about to fail 12518 * in the hope that we can free up the requested resource (inodes 12519 * or disk space). 12520 * 12521 * First check to see if the work list has anything on it. If it has, 12522 * clean up entries until we successfully free the requested resource. 12523 * Because this process holds inodes locked, we cannot handle any remove 12524 * requests that might block on a locked inode as that could lead to 12525 * deadlock. If the worklist yields none of the requested resource, 12526 * start syncing out vnodes to free up the needed space. 
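 * Returns non-zero when a cleanup attempt was made and zero when the request
 * could not be processed at all (for example during copy-on-write).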
 */
int
softdep_request_cleanup(fs, vp, cred, resource)
	struct fs *fs;
	struct vnode *vp;
	struct ucred *cred;
	int resource;
{
	struct ufsmount *ump;
	struct mount *mp;
	struct vnode *lvp, *mvp;
	long starttime;
	ufs2_daddr_t needed;
	int error;

	mp = vp->v_mount;
	ump = VFSTOUFS(mp);
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	if (resource == FLUSH_BLOCKS_WAIT)
		stat_cleanup_blkrequests += 1;
	else
		stat_cleanup_inorequests += 1;

	/*
	 * If we are being called because of a process doing a
	 * copy-on-write, then it is not safe to process any
	 * worklist items as we will recurse into the copyonwrite
	 * routine.  This will result in an incoherent snapshot.
	 */
	if (curthread->td_pflags & TDP_COWINPROGRESS)
		return (0);
	UFS_UNLOCK(ump);
	error = ffs_update(vp, 1);
	if (error != 0) {
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * If we are in need of resources, consider pausing for
	 * tickdelay to give ourselves some breathing room.
	 */
	ACQUIRE_LOCK(&lk);
	process_removes(vp);
	process_truncates(vp);
	request_cleanup(UFSTOVFS(ump), resource);
	FREE_LOCK(&lk);
	/*
	 * Now clean up at least as many resources as we will need.
	 *
	 * When requested to clean up inodes, the number that are needed
	 * is set by the number of simultaneous writers (mnt_writeopcount)
	 * plus a bit of slop (2) in case some more writers show up while
	 * we are cleaning.
	 *
	 * When requested to free up space, the amount of space that
	 * we need is enough blocks to allocate a full-sized segment
	 * (fs_contigsumsize). The number of such segments that will
	 * be needed is set by the number of simultaneous writers
	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
	 * writers show up while we are cleaning.
	 *
	 * Additionally, if we are unprivileged and allocating space,
	 * we need to ensure that we clean up enough blocks to get the
	 * needed number of blocks over the threshold of the minimum
	 * number of blocks required to be kept free by the filesystem
	 * (fs_minfree).
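	 *
	 * As a rough worked example (the numbers are illustrative only):
	 * with 3 simultaneous writers and an fs_contigsumsize of 16, an
	 * unprivileged FLUSH_BLOCKS_WAIT request computes
	 * needed = (3 + 2) * 16 = 80 blocks, plus however many more
	 * blocks it takes to bring the free count back above the
	 * fs_minfree reserve.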
	 */
	if (resource == FLUSH_INODES_WAIT) {
		needed = vp->v_mount->mnt_writeopcount + 2;
	} else if (resource == FLUSH_BLOCKS_WAIT) {
		needed = (vp->v_mount->mnt_writeopcount + 2) *
		    fs->fs_contigsumsize;
		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
			needed += fragstoblks(fs,
			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
	} else {
		UFS_LOCK(ump);
		printf("softdep_request_cleanup: Unknown resource type %d\n",
		    resource);
		return (0);
	}
	starttime = time_second;
retry:
	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		ACQUIRE_LOCK(&lk);
		if (ump->softdep_on_worklist > 0 &&
		    process_worklist_item(UFSTOVFS(ump),
		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
			stat_worklist_push += 1;
		FREE_LOCK(&lk);
	}
	/*
	 * If we still need resources and there are no more worklist
	 * entries to process to obtain them, we have to start flushing
	 * the dirty vnodes to force the release of additional requests
	 * to the worklist that we can then process to reap additional
	 * resources.  We walk the vnodes associated with the mount point
	 * until we get the needed worklist requests that we can reap.
	 */
	if ((resource == FLUSH_BLOCKS_WAIT &&
	    fs->fs_cstotal.cs_nbfree <= needed) ||
	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
	    fs->fs_cstotal.cs_nifree <= needed)) {
		MNT_ILOCK(mp);
		MNT_VNODE_FOREACH(lvp, mp, mvp) {
			VI_LOCK(lvp);
			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
				VI_UNLOCK(lvp);
				continue;
			}
			MNT_IUNLOCK(mp);
			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
			    curthread)) {
				MNT_ILOCK(mp);
				continue;
			}
			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
				vput(lvp);
				MNT_ILOCK(mp);
				continue;
			}
			(void) ffs_syncvnode(lvp, MNT_NOWAIT);
			vput(lvp);
			MNT_ILOCK(mp);
		}
		MNT_IUNLOCK(mp);
		lvp = ump->um_devvp;
		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
			VOP_UNLOCK(lvp, 0);
		}
		if (ump->softdep_on_worklist > 0) {
			stat_cleanup_retries += 1;
			goto retry;
		}
		stat_cleanup_failures += 1;
	}
	if (time_second - starttime > stat_cleanup_high_delay)
		stat_cleanup_high_delay = time_second - starttime;
	UFS_LOCK(ump);
	return (1);
}

/*
 * If memory utilization has gotten too high, deliberately slow things
 * down and speed up the I/O processing.
 */
extern struct thread *syncertd;
static int
request_cleanup(mp, resource)
	struct mount *mp;
	int resource;
{
	struct thread *td = curthread;
	struct ufsmount *ump;

	mtx_assert(&lk, MA_OWNED);
	/*
	 * We never hold up the filesystem syncer or buf daemon.
	 */
	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
		return (0);
	ump = VFSTOUFS(mp);
	/*
	 * First check to see if the work list has gotten backlogged.
	 * If it has, co-opt this process to help clean up two entries.
12697 * Because this process may hold inodes locked, we cannot 12698 * handle any remove requests that might block on a locked 12699 * inode as that could lead to deadlock. We set TDP_SOFTDEP 12700 * to avoid recursively processing the worklist. 12701 */ 12702 if (ump->softdep_on_worklist > max_softdeps / 10) { 12703 td->td_pflags |= TDP_SOFTDEP; 12704 process_worklist_item(mp, 2, LK_NOWAIT); 12705 td->td_pflags &= ~TDP_SOFTDEP; 12706 stat_worklist_push += 2; 12707 return(1); 12708 } 12709 /* 12710 * Next, we attempt to speed up the syncer process. If that 12711 * is successful, then we allow the process to continue. 12712 */ 12713 if (softdep_speedup() && 12714 resource != FLUSH_BLOCKS_WAIT && 12715 resource != FLUSH_INODES_WAIT) 12716 return(0); 12717 /* 12718 * If we are resource constrained on inode dependencies, try 12719 * flushing some dirty inodes. Otherwise, we are constrained 12720 * by file deletions, so try accelerating flushes of directories 12721 * with removal dependencies. We would like to do the cleanup 12722 * here, but we probably hold an inode locked at this point and 12723 * that might deadlock against one that we try to clean. So, 12724 * the best that we can do is request the syncer daemon to do 12725 * the cleanup for us. 12726 */ 12727 switch (resource) { 12728 12729 case FLUSH_INODES: 12730 case FLUSH_INODES_WAIT: 12731 stat_ino_limit_push += 1; 12732 req_clear_inodedeps += 1; 12733 stat_countp = &stat_ino_limit_hit; 12734 break; 12735 12736 case FLUSH_BLOCKS: 12737 case FLUSH_BLOCKS_WAIT: 12738 stat_blk_limit_push += 1; 12739 req_clear_remove += 1; 12740 stat_countp = &stat_blk_limit_hit; 12741 break; 12742 12743 default: 12744 panic("request_cleanup: unknown type"); 12745 } 12746 /* 12747 * Hopefully the syncer daemon will catch up and awaken us. 12748 * We wait at most tickdelay before proceeding in any case. 12749 */ 12750 proc_waiting += 1; 12751 if (callout_pending(&softdep_callout) == FALSE) 12752 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 12753 pause_timer, 0); 12754 12755 msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); 12756 proc_waiting -= 1; 12757 return (1); 12758 } 12759 12760 /* 12761 * Awaken processes pausing in request_cleanup and clear proc_waiting 12762 * to indicate that there is no longer a timer running. 12763 */ 12764 static void 12765 pause_timer(arg) 12766 void *arg; 12767 { 12768 12769 /* 12770 * The callout_ API has acquired mtx and will hold it around this 12771 * function call. 12772 */ 12773 *stat_countp += 1; 12774 wakeup_one(&proc_waiting); 12775 if (proc_waiting > 0) 12776 callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, 12777 pause_timer, 0); 12778 } 12779 12780 /* 12781 * Flush out a directory with at least one removal dependency in an effort to 12782 * reduce the number of dirrem, freefile, and freeblks dependency structures. 
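 *
 * The scan below resumes from a static cursor ("next") so that
 * successive calls walk different pagedep hash chains rather than
 * repeatedly flushing the same directory; the first directory found
 * with a pending dirrem is synced and the routine returns.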
12783 */ 12784 static void 12785 clear_remove(td) 12786 struct thread *td; 12787 { 12788 struct pagedep_hashhead *pagedephd; 12789 struct pagedep *pagedep; 12790 static int next = 0; 12791 struct mount *mp; 12792 struct vnode *vp; 12793 struct bufobj *bo; 12794 int error, cnt; 12795 ino_t ino; 12796 12797 mtx_assert(&lk, MA_OWNED); 12798 12799 for (cnt = 0; cnt < pagedep_hash; cnt++) { 12800 pagedephd = &pagedep_hashtbl[next++]; 12801 if (next >= pagedep_hash) 12802 next = 0; 12803 LIST_FOREACH(pagedep, pagedephd, pd_hash) { 12804 if (LIST_EMPTY(&pagedep->pd_dirremhd)) 12805 continue; 12806 mp = pagedep->pd_list.wk_mp; 12807 ino = pagedep->pd_ino; 12808 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 12809 continue; 12810 FREE_LOCK(&lk); 12811 12812 /* 12813 * Let unmount clear deps 12814 */ 12815 error = vfs_busy(mp, MBF_NOWAIT); 12816 if (error != 0) 12817 goto finish_write; 12818 error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 12819 FFSV_FORCEINSMQ); 12820 vfs_unbusy(mp); 12821 if (error != 0) { 12822 softdep_error("clear_remove: vget", error); 12823 goto finish_write; 12824 } 12825 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 12826 softdep_error("clear_remove: fsync", error); 12827 bo = &vp->v_bufobj; 12828 BO_LOCK(bo); 12829 drain_output(vp); 12830 BO_UNLOCK(bo); 12831 vput(vp); 12832 finish_write: 12833 vn_finished_write(mp); 12834 ACQUIRE_LOCK(&lk); 12835 return; 12836 } 12837 } 12838 } 12839 12840 /* 12841 * Clear out a block of dirty inodes in an effort to reduce 12842 * the number of inodedep dependency structures. 12843 */ 12844 static void 12845 clear_inodedeps(td) 12846 struct thread *td; 12847 { 12848 struct inodedep_hashhead *inodedephd; 12849 struct inodedep *inodedep; 12850 static int next = 0; 12851 struct mount *mp; 12852 struct vnode *vp; 12853 struct fs *fs; 12854 int error, cnt; 12855 ino_t firstino, lastino, ino; 12856 12857 mtx_assert(&lk, MA_OWNED); 12858 /* 12859 * Pick a random inode dependency to be cleared. 12860 * We will then gather up all the inodes in its block 12861 * that have dependencies and flush them out. 12862 */ 12863 for (cnt = 0; cnt < inodedep_hash; cnt++) { 12864 inodedephd = &inodedep_hashtbl[next++]; 12865 if (next >= inodedep_hash) 12866 next = 0; 12867 if ((inodedep = LIST_FIRST(inodedephd)) != NULL) 12868 break; 12869 } 12870 if (inodedep == NULL) 12871 return; 12872 fs = inodedep->id_fs; 12873 mp = inodedep->id_list.wk_mp; 12874 /* 12875 * Find the last inode in the block with dependencies. 12876 */ 12877 firstino = inodedep->id_ino & ~(INOPB(fs) - 1); 12878 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) 12879 if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) 12880 break; 12881 /* 12882 * Asynchronously push all but the last inode with dependencies. 12883 * Synchronously push the last inode with dependencies to ensure 12884 * that the inode block gets written to free up the inodedeps. 
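	 *
	 * For example, assuming 64 inodes per block (INOPB), an inodedep
	 * for inode 200 selects the block covering inodes 192..255
	 * (firstino = 200 & ~63); every inode in that range that still
	 * has an inodedep is pushed with MNT_NOWAIT except lastino,
	 * which is pushed with MNT_WAIT so the inode block itself is
	 * written.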
12885 */ 12886 for (ino = firstino; ino <= lastino; ino++) { 12887 if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) 12888 continue; 12889 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) 12890 continue; 12891 FREE_LOCK(&lk); 12892 error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ 12893 if (error != 0) { 12894 vn_finished_write(mp); 12895 ACQUIRE_LOCK(&lk); 12896 return; 12897 } 12898 if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, 12899 FFSV_FORCEINSMQ)) != 0) { 12900 softdep_error("clear_inodedeps: vget", error); 12901 vfs_unbusy(mp); 12902 vn_finished_write(mp); 12903 ACQUIRE_LOCK(&lk); 12904 return; 12905 } 12906 vfs_unbusy(mp); 12907 if (ino == lastino) { 12908 if ((error = ffs_syncvnode(vp, MNT_WAIT))) 12909 softdep_error("clear_inodedeps: fsync1", error); 12910 } else { 12911 if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) 12912 softdep_error("clear_inodedeps: fsync2", error); 12913 BO_LOCK(&vp->v_bufobj); 12914 drain_output(vp); 12915 BO_UNLOCK(&vp->v_bufobj); 12916 } 12917 vput(vp); 12918 vn_finished_write(mp); 12919 ACQUIRE_LOCK(&lk); 12920 } 12921 } 12922 12923 void 12924 softdep_buf_append(bp, wkhd) 12925 struct buf *bp; 12926 struct workhead *wkhd; 12927 { 12928 struct worklist *wk; 12929 12930 ACQUIRE_LOCK(&lk); 12931 while ((wk = LIST_FIRST(wkhd)) != NULL) { 12932 WORKLIST_REMOVE(wk); 12933 WORKLIST_INSERT(&bp->b_dep, wk); 12934 } 12935 FREE_LOCK(&lk); 12936 12937 } 12938 12939 void 12940 softdep_inode_append(ip, cred, wkhd) 12941 struct inode *ip; 12942 struct ucred *cred; 12943 struct workhead *wkhd; 12944 { 12945 struct buf *bp; 12946 struct fs *fs; 12947 int error; 12948 12949 fs = ip->i_fs; 12950 error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 12951 (int)fs->fs_bsize, cred, &bp); 12952 if (error) { 12953 softdep_freework(wkhd); 12954 return; 12955 } 12956 softdep_buf_append(bp, wkhd); 12957 bqrelse(bp); 12958 } 12959 12960 void 12961 softdep_freework(wkhd) 12962 struct workhead *wkhd; 12963 { 12964 12965 ACQUIRE_LOCK(&lk); 12966 handle_jwork(wkhd); 12967 FREE_LOCK(&lk); 12968 } 12969 12970 /* 12971 * Function to determine if the buffer has outstanding dependencies 12972 * that will cause a roll-back if the buffer is written. If wantcount 12973 * is set, return number of dependencies, otherwise just yes or no. 
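 *
 * In other words, a caller that only needs a yes/no answer can pass
 * wantcount == 0 and the scan stops at the first dependency found,
 * e.g.:
 *
 *	if (softdep_count_dependencies(bp, 0) != 0)
 *		(writing bp now would trigger a rollback)
 *
 * while wantcount != 0 walks every list and returns the total count.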
12974 */ 12975 static int 12976 softdep_count_dependencies(bp, wantcount) 12977 struct buf *bp; 12978 int wantcount; 12979 { 12980 struct worklist *wk; 12981 struct bmsafemap *bmsafemap; 12982 struct freework *freework; 12983 struct inodedep *inodedep; 12984 struct indirdep *indirdep; 12985 struct freeblks *freeblks; 12986 struct allocindir *aip; 12987 struct pagedep *pagedep; 12988 struct dirrem *dirrem; 12989 struct newblk *newblk; 12990 struct mkdir *mkdir; 12991 struct diradd *dap; 12992 int i, retval; 12993 12994 retval = 0; 12995 ACQUIRE_LOCK(&lk); 12996 LIST_FOREACH(wk, &bp->b_dep, wk_list) { 12997 switch (wk->wk_type) { 12998 12999 case D_INODEDEP: 13000 inodedep = WK_INODEDEP(wk); 13001 if ((inodedep->id_state & DEPCOMPLETE) == 0) { 13002 /* bitmap allocation dependency */ 13003 retval += 1; 13004 if (!wantcount) 13005 goto out; 13006 } 13007 if (TAILQ_FIRST(&inodedep->id_inoupdt)) { 13008 /* direct block pointer dependency */ 13009 retval += 1; 13010 if (!wantcount) 13011 goto out; 13012 } 13013 if (TAILQ_FIRST(&inodedep->id_extupdt)) { 13014 /* direct block pointer dependency */ 13015 retval += 1; 13016 if (!wantcount) 13017 goto out; 13018 } 13019 if (TAILQ_FIRST(&inodedep->id_inoreflst)) { 13020 /* Add reference dependency. */ 13021 retval += 1; 13022 if (!wantcount) 13023 goto out; 13024 } 13025 continue; 13026 13027 case D_INDIRDEP: 13028 indirdep = WK_INDIRDEP(wk); 13029 13030 TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { 13031 /* indirect truncation dependency */ 13032 retval += 1; 13033 if (!wantcount) 13034 goto out; 13035 } 13036 13037 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { 13038 /* indirect block pointer dependency */ 13039 retval += 1; 13040 if (!wantcount) 13041 goto out; 13042 } 13043 continue; 13044 13045 case D_PAGEDEP: 13046 pagedep = WK_PAGEDEP(wk); 13047 LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { 13048 if (LIST_FIRST(&dirrem->dm_jremrefhd)) { 13049 /* Journal remove ref dependency. */ 13050 retval += 1; 13051 if (!wantcount) 13052 goto out; 13053 } 13054 } 13055 for (i = 0; i < DAHASHSZ; i++) { 13056 13057 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { 13058 /* directory entry dependency */ 13059 retval += 1; 13060 if (!wantcount) 13061 goto out; 13062 } 13063 } 13064 continue; 13065 13066 case D_BMSAFEMAP: 13067 bmsafemap = WK_BMSAFEMAP(wk); 13068 if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { 13069 /* Add reference dependency. */ 13070 retval += 1; 13071 if (!wantcount) 13072 goto out; 13073 } 13074 if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { 13075 /* Allocate block dependency. */ 13076 retval += 1; 13077 if (!wantcount) 13078 goto out; 13079 } 13080 continue; 13081 13082 case D_FREEBLKS: 13083 freeblks = WK_FREEBLKS(wk); 13084 if (LIST_FIRST(&freeblks->fb_jblkdephd)) { 13085 /* Freeblk journal dependency. */ 13086 retval += 1; 13087 if (!wantcount) 13088 goto out; 13089 } 13090 continue; 13091 13092 case D_ALLOCDIRECT: 13093 case D_ALLOCINDIR: 13094 newblk = WK_NEWBLK(wk); 13095 if (newblk->nb_jnewblk) { 13096 /* Journal allocate dependency. */ 13097 retval += 1; 13098 if (!wantcount) 13099 goto out; 13100 } 13101 continue; 13102 13103 case D_MKDIR: 13104 mkdir = WK_MKDIR(wk); 13105 if (mkdir->md_jaddref) { 13106 /* Journal reference dependency. 
				 */
				retval += 1;
				if (!wantcount)
					goto out;
			}
			continue;

		case D_FREEWORK:
		case D_FREEDEP:
		case D_JSEGDEP:
		case D_JSEG:
		case D_SBDEP:
			/* never a dependency on these blocks */
			continue;

		default:
			panic("softdep_count_dependencies: Unexpected type %s",
			    TYPENAME(wk->wk_type));
			/* NOTREACHED */
		}
	}
out:
	FREE_LOCK(&lk);
	return retval;
}

/*
 * Acquire exclusive access to a buffer.
 * Must be called with a locked mtx parameter.
 * Return acquired buffer or NULL on failure.
 */
static struct buf *
getdirtybuf(bp, mtx, waitfor)
	struct buf *bp;
	struct mtx *mtx;
	int waitfor;
{
	int error;

	mtx_assert(mtx, MA_OWNED);
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
		if (waitfor != MNT_WAIT)
			return (NULL);
		error = BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
		/*
		 * Even if we successfully acquire bp here, we have dropped
		 * mtx, which may violate our guarantee.
		 */
		if (error == 0)
			BUF_UNLOCK(bp);
		else if (error != ENOLCK)
			panic("getdirtybuf: inconsistent lock: %d", error);
		mtx_lock(mtx);
		return (NULL);
	}
	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
		if (mtx == &lk && waitfor == MNT_WAIT) {
			mtx_unlock(mtx);
			BO_LOCK(bp->b_bufobj);
			BUF_UNLOCK(bp);
			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
				bp->b_vflags |= BV_BKGRDWAIT;
				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
				    PRIBIO | PDROP, "getbuf", 0);
			} else
				BO_UNLOCK(bp->b_bufobj);
			mtx_lock(mtx);
			return (NULL);
		}
		BUF_UNLOCK(bp);
		if (waitfor != MNT_WAIT)
			return (NULL);
		/*
		 * The mtx argument must be bp->b_vp's mutex in
		 * this case.
		 */
#ifdef DEBUG_VFS_LOCKS
		if (bp->b_vp->v_type != VCHR)
			ASSERT_BO_LOCKED(bp->b_bufobj);
#endif
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
		return (NULL);
	}
	if ((bp->b_flags & B_DELWRI) == 0) {
		BUF_UNLOCK(bp);
		return (NULL);
	}
	bremfree(bp);
	return (bp);
}

/*
 * Check if it is safe to suspend the file system now. On entry,
 * the vnode interlock for devvp should be held. Return 0 with
 * the mount interlock held if the file system can be suspended now,
 * otherwise return EAGAIN with the mount interlock held.
 */
int
softdep_check_suspend(struct mount *mp,
		      struct vnode *devvp,
		      int softdep_deps,
		      int softdep_accdeps,
		      int secondary_writes,
		      int secondary_accwrites)
{
	struct bufobj *bo;
	struct ufsmount *ump;
	int error;

	ump = VFSTOUFS(mp);
	bo = &devvp->v_bufobj;
	ASSERT_BO_LOCKED(bo);

	for (;;) {
		if (!TRY_ACQUIRE_LOCK(&lk)) {
			BO_UNLOCK(bo);
			ACQUIRE_LOCK(&lk);
			FREE_LOCK(&lk);
			BO_LOCK(bo);
			continue;
		}
		MNT_ILOCK(mp);
		if (mp->mnt_secondary_writes != 0) {
			FREE_LOCK(&lk);
			BO_UNLOCK(bo);
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       (PUSER - 1) | PDROP, "secwr", 0);
			BO_LOCK(bo);
			continue;
		}
		break;
	}

	/*
	 * Reasons for needing more work before suspend:
	 * - Dirty buffers on devvp.
13246 * - Softdep activity occurred after start of vnode sync loop 13247 * - Secondary writes occurred after start of vnode sync loop 13248 */ 13249 error = 0; 13250 if (bo->bo_numoutput > 0 || 13251 bo->bo_dirty.bv_cnt > 0 || 13252 softdep_deps != 0 || 13253 ump->softdep_deps != 0 || 13254 softdep_accdeps != ump->softdep_accdeps || 13255 secondary_writes != 0 || 13256 mp->mnt_secondary_writes != 0 || 13257 secondary_accwrites != mp->mnt_secondary_accwrites) 13258 error = EAGAIN; 13259 FREE_LOCK(&lk); 13260 BO_UNLOCK(bo); 13261 return (error); 13262 } 13263 13264 13265 /* 13266 * Get the number of dependency structures for the file system, both 13267 * the current number and the total number allocated. These will 13268 * later be used to detect that softdep processing has occurred. 13269 */ 13270 void 13271 softdep_get_depcounts(struct mount *mp, 13272 int *softdep_depsp, 13273 int *softdep_accdepsp) 13274 { 13275 struct ufsmount *ump; 13276 13277 ump = VFSTOUFS(mp); 13278 ACQUIRE_LOCK(&lk); 13279 *softdep_depsp = ump->softdep_deps; 13280 *softdep_accdepsp = ump->softdep_accdeps; 13281 FREE_LOCK(&lk); 13282 } 13283 13284 /* 13285 * Wait for pending output on a vnode to complete. 13286 * Must be called with vnode lock and interlock locked. 13287 * 13288 * XXX: Should just be a call to bufobj_wwait(). 13289 */ 13290 static void 13291 drain_output(vp) 13292 struct vnode *vp; 13293 { 13294 struct bufobj *bo; 13295 13296 bo = &vp->v_bufobj; 13297 ASSERT_VOP_LOCKED(vp, "drain_output"); 13298 ASSERT_BO_LOCKED(bo); 13299 13300 while (bo->bo_numoutput) { 13301 bo->bo_flag |= BO_WWAIT; 13302 msleep((caddr_t)&bo->bo_numoutput, 13303 BO_MTX(bo), PRIBIO + 1, "drainvp", 0); 13304 } 13305 } 13306 13307 /* 13308 * Called whenever a buffer that is being invalidated or reallocated 13309 * contains dependencies. This should only happen if an I/O error has 13310 * occurred. The routine is called with the buffer locked. 13311 */ 13312 static void 13313 softdep_deallocate_dependencies(bp) 13314 struct buf *bp; 13315 { 13316 13317 if ((bp->b_ioflags & BIO_ERROR) == 0) 13318 panic("softdep_deallocate_dependencies: dangling deps"); 13319 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); 13320 panic("softdep_deallocate_dependencies: unrecovered I/O error"); 13321 } 13322 13323 /* 13324 * Function to handle asynchronous write errors in the filesystem. 13325 */ 13326 static void 13327 softdep_error(func, error) 13328 char *func; 13329 int error; 13330 { 13331 13332 /* XXX should do something better! 
 */
	printf("%s: got error %d while accessing filesystem\n", func, error);
}

#ifdef DDB

static void
inodedep_print(struct inodedep *inodedep, int verbose)
{
	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
	    " saveino %p\n",
	    inodedep, inodedep->id_fs, inodedep->id_state,
	    (intmax_t)inodedep->id_ino,
	    (intmax_t)fsbtodb(inodedep->id_fs,
	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
	    inodedep->id_savedino1);

	if (verbose == 0)
		return;

	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
	    "mkdiradd %p\n",
	    LIST_FIRST(&inodedep->id_pendinghd),
	    LIST_FIRST(&inodedep->id_bufwait),
	    LIST_FIRST(&inodedep->id_inowait),
	    TAILQ_FIRST(&inodedep->id_inoreflst),
	    inodedep->id_mkdiradd);
	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
	    TAILQ_FIRST(&inodedep->id_inoupdt),
	    TAILQ_FIRST(&inodedep->id_newinoupdt),
	    TAILQ_FIRST(&inodedep->id_extupdt),
	    TAILQ_FIRST(&inodedep->id_newextupdt));
}

DB_SHOW_COMMAND(inodedep, db_show_inodedep)
{

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	inodedep_print((struct inodedep*)addr, 1);
}

DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
{
	struct inodedep_hashhead *inodedephd;
	struct inodedep *inodedep;
	struct fs *fs;
	int cnt;

	fs = have_addr ? (struct fs *)addr : NULL;
	for (cnt = 0; cnt < inodedep_hash; cnt++) {
		inodedephd = &inodedep_hashtbl[cnt];
		LIST_FOREACH(inodedep, inodedephd, id_hash) {
			if (fs != NULL && fs != inodedep->id_fs)
				continue;
			inodedep_print(inodedep, 0);
		}
	}
}

DB_SHOW_COMMAND(worklist, db_show_worklist)
{
	struct worklist *wk;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wk = (struct worklist *)addr;
	db_printf("worklist: %p type %s state 0x%X\n",
	    wk, TYPENAME(wk->wk_type), wk->wk_state);
}

DB_SHOW_COMMAND(workhead, db_show_workhead)
{
	struct workhead *wkhd;
	struct worklist *wk;
	int i;

	if (have_addr == 0) {
		db_printf("Address required\n");
		return;
	}
	wkhd = (struct workhead *)addr;
	wk = LIST_FIRST(wkhd);
	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
		db_printf("worklist: %p type %s state 0x%X\n",
		    wk, TYPENAME(wk->wk_type), wk->wk_state);
	if (i == 100)
		db_printf("workhead overflow");
	db_printf("\n");
}

DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
{
	struct jaddref *jaddref;
	struct diradd *diradd;
	struct mkdir *mkdir;

	LIST_FOREACH(mkdir, &mkdirlisthd, md_mkdirs) {
		diradd = mkdir->md_diradd;
		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
		    mkdir, mkdir->md_state, diradd, diradd->da_state);
		if ((jaddref = mkdir->md_jaddref) != NULL)
			db_printf(" jaddref %p jaddref state 0x%X",
			    jaddref, jaddref->ja_state);
		db_printf("\n");
	}
}

#endif /* DDB */

#endif /* SOFTUPDATES */